diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,215072 @@ +{ + "best_global_step": 20142, + "best_metric": 0.6375064253807068, + "best_model_checkpoint": "saves_multiple/lora/llama-3-8b-instruct/train_math_qa_42_1760637607/checkpoint-20142", + "epoch": 20.0, + "eval_steps": 6714, + "global_step": 134280, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007447125409591898, + "grad_norm": 7.678571701049805, + "learning_rate": 1.4894250819183796e-08, + "loss": 1.0438, + "num_input_tokens_seen": 2688, + "step": 5 + }, + { + "epoch": 0.0014894250819183796, + "grad_norm": 9.54932689666748, + "learning_rate": 3.351206434316354e-08, + "loss": 1.4509, + "num_input_tokens_seen": 5856, + "step": 10 + }, + { + "epoch": 0.002234137622877569, + "grad_norm": 14.317206382751465, + "learning_rate": 5.2129877867143284e-08, + "loss": 1.1626, + "num_input_tokens_seen": 8512, + "step": 15 + }, + { + "epoch": 0.002978850163836759, + "grad_norm": 9.742072105407715, + "learning_rate": 7.074769139112303e-08, + "loss": 1.1287, + "num_input_tokens_seen": 11200, + "step": 20 + }, + { + "epoch": 0.0037235627047959487, + "grad_norm": 6.446687698364258, + "learning_rate": 8.936550491510277e-08, + "loss": 1.2403, + "num_input_tokens_seen": 14336, + "step": 25 + }, + { + "epoch": 0.004468275245755138, + "grad_norm": 8.17249584197998, + "learning_rate": 1.0798331843908253e-07, + "loss": 1.1193, + "num_input_tokens_seen": 17536, + "step": 30 + }, + { + "epoch": 0.005212987786714328, + "grad_norm": 7.446258544921875, + "learning_rate": 1.2660113196306226e-07, + "loss": 1.2912, + "num_input_tokens_seen": 20704, + "step": 35 + }, + { + "epoch": 0.005957700327673518, + "grad_norm": 8.377558708190918, + "learning_rate": 1.45218945487042e-07, + "loss": 1.0157, + "num_input_tokens_seen": 23552, + "step": 40 + }, + { + "epoch": 0.006702412868632708, + "grad_norm": 5.199667930603027, + "learning_rate": 1.6383675901102174e-07, + "loss": 1.0662, + "num_input_tokens_seen": 26528, + "step": 45 + }, + { + "epoch": 0.0074471254095918975, + "grad_norm": 16.554712295532227, + "learning_rate": 1.824545725350015e-07, + "loss": 1.461, + "num_input_tokens_seen": 29152, + "step": 50 + }, + { + "epoch": 0.008191837950551088, + "grad_norm": 12.252357482910156, + "learning_rate": 2.0107238605898125e-07, + "loss": 1.3834, + "num_input_tokens_seen": 31872, + "step": 55 + }, + { + "epoch": 0.008936550491510277, + "grad_norm": 7.521791458129883, + "learning_rate": 2.1969019958296101e-07, + "loss": 1.0165, + "num_input_tokens_seen": 34784, + "step": 60 + }, + { + "epoch": 0.009681263032469467, + "grad_norm": 5.918280601501465, + "learning_rate": 2.3830801310694073e-07, + "loss": 1.2152, + "num_input_tokens_seen": 37664, + "step": 65 + }, + { + "epoch": 0.010425975573428656, + "grad_norm": 15.757433891296387, + "learning_rate": 2.569258266309205e-07, + "loss": 1.7297, + "num_input_tokens_seen": 40512, + "step": 70 + }, + { + "epoch": 0.011170688114387846, + "grad_norm": 6.456410884857178, + "learning_rate": 2.755436401549002e-07, + "loss": 1.3359, + "num_input_tokens_seen": 43424, + "step": 75 + }, + { + "epoch": 0.011915400655347037, + "grad_norm": 6.361699104309082, + "learning_rate": 2.9416145367888e-07, + "loss": 1.1618, + "num_input_tokens_seen": 46208, + "step": 80 + }, + { + "epoch": 0.012660113196306225, + "grad_norm": 6.674674987792969, + "learning_rate": 3.1277926720285975e-07, + "loss": 1.1754, + "num_input_tokens_seen": 48832, + "step": 85 + }, + { + "epoch": 0.013404825737265416, + "grad_norm": 11.940991401672363, + "learning_rate": 3.3139708072683946e-07, + "loss": 1.0431, + "num_input_tokens_seen": 51584, + "step": 90 + }, + { + "epoch": 0.014149538278224605, + "grad_norm": 8.062139511108398, + "learning_rate": 3.5001489425081923e-07, + "loss": 1.0679, + "num_input_tokens_seen": 54368, + "step": 95 + }, + { + "epoch": 0.014894250819183795, + "grad_norm": 7.98508882522583, + "learning_rate": 3.6863270777479894e-07, + "loss": 1.0747, + "num_input_tokens_seen": 57248, + "step": 100 + }, + { + "epoch": 0.015638963360142984, + "grad_norm": 13.409063339233398, + "learning_rate": 3.872505212987787e-07, + "loss": 1.3667, + "num_input_tokens_seen": 60352, + "step": 105 + }, + { + "epoch": 0.016383675901102176, + "grad_norm": 8.287819862365723, + "learning_rate": 4.058683348227585e-07, + "loss": 1.2789, + "num_input_tokens_seen": 63072, + "step": 110 + }, + { + "epoch": 0.017128388442061365, + "grad_norm": 15.549361228942871, + "learning_rate": 4.244861483467382e-07, + "loss": 1.4066, + "num_input_tokens_seen": 65952, + "step": 115 + }, + { + "epoch": 0.017873100983020553, + "grad_norm": 7.6186418533325195, + "learning_rate": 4.431039618707179e-07, + "loss": 0.9153, + "num_input_tokens_seen": 68672, + "step": 120 + }, + { + "epoch": 0.018617813523979745, + "grad_norm": 9.17758846282959, + "learning_rate": 4.617217753946977e-07, + "loss": 1.2409, + "num_input_tokens_seen": 71488, + "step": 125 + }, + { + "epoch": 0.019362526064938934, + "grad_norm": 11.263129234313965, + "learning_rate": 4.803395889186774e-07, + "loss": 1.1523, + "num_input_tokens_seen": 74304, + "step": 130 + }, + { + "epoch": 0.020107238605898123, + "grad_norm": 14.885274887084961, + "learning_rate": 4.989574024426572e-07, + "loss": 1.3362, + "num_input_tokens_seen": 77120, + "step": 135 + }, + { + "epoch": 0.02085195114685731, + "grad_norm": 9.998028755187988, + "learning_rate": 5.17575215966637e-07, + "loss": 1.0845, + "num_input_tokens_seen": 80000, + "step": 140 + }, + { + "epoch": 0.021596663687816504, + "grad_norm": 14.405006408691406, + "learning_rate": 5.361930294906167e-07, + "loss": 1.3448, + "num_input_tokens_seen": 82912, + "step": 145 + }, + { + "epoch": 0.022341376228775692, + "grad_norm": 9.569189071655273, + "learning_rate": 5.548108430145964e-07, + "loss": 1.1236, + "num_input_tokens_seen": 85824, + "step": 150 + }, + { + "epoch": 0.02308608876973488, + "grad_norm": 10.867964744567871, + "learning_rate": 5.734286565385761e-07, + "loss": 1.3071, + "num_input_tokens_seen": 88640, + "step": 155 + }, + { + "epoch": 0.023830801310694073, + "grad_norm": 13.226275444030762, + "learning_rate": 5.920464700625559e-07, + "loss": 1.2794, + "num_input_tokens_seen": 91584, + "step": 160 + }, + { + "epoch": 0.024575513851653262, + "grad_norm": 6.288146018981934, + "learning_rate": 6.106642835865357e-07, + "loss": 1.3167, + "num_input_tokens_seen": 94592, + "step": 165 + }, + { + "epoch": 0.02532022639261245, + "grad_norm": 13.753358840942383, + "learning_rate": 6.292820971105154e-07, + "loss": 1.342, + "num_input_tokens_seen": 97760, + "step": 170 + }, + { + "epoch": 0.026064938933571643, + "grad_norm": 9.274106979370117, + "learning_rate": 6.478999106344952e-07, + "loss": 1.2563, + "num_input_tokens_seen": 100960, + "step": 175 + }, + { + "epoch": 0.02680965147453083, + "grad_norm": 6.239742279052734, + "learning_rate": 6.665177241584749e-07, + "loss": 0.9274, + "num_input_tokens_seen": 104320, + "step": 180 + }, + { + "epoch": 0.02755436401549002, + "grad_norm": 10.177925109863281, + "learning_rate": 6.851355376824546e-07, + "loss": 0.9063, + "num_input_tokens_seen": 107008, + "step": 185 + }, + { + "epoch": 0.02829907655644921, + "grad_norm": 12.237645149230957, + "learning_rate": 7.037533512064343e-07, + "loss": 1.1456, + "num_input_tokens_seen": 110048, + "step": 190 + }, + { + "epoch": 0.0290437890974084, + "grad_norm": 6.673046588897705, + "learning_rate": 7.223711647304142e-07, + "loss": 0.9727, + "num_input_tokens_seen": 112896, + "step": 195 + }, + { + "epoch": 0.02978850163836759, + "grad_norm": 6.110115051269531, + "learning_rate": 7.409889782543939e-07, + "loss": 0.9092, + "num_input_tokens_seen": 116192, + "step": 200 + }, + { + "epoch": 0.03053321417932678, + "grad_norm": 6.690563678741455, + "learning_rate": 7.596067917783736e-07, + "loss": 1.1284, + "num_input_tokens_seen": 119200, + "step": 205 + }, + { + "epoch": 0.03127792672028597, + "grad_norm": 8.389464378356934, + "learning_rate": 7.782246053023533e-07, + "loss": 1.1307, + "num_input_tokens_seen": 122144, + "step": 210 + }, + { + "epoch": 0.032022639261245156, + "grad_norm": 13.39493465423584, + "learning_rate": 7.96842418826333e-07, + "loss": 1.3167, + "num_input_tokens_seen": 125312, + "step": 215 + }, + { + "epoch": 0.03276735180220435, + "grad_norm": 6.844549655914307, + "learning_rate": 8.154602323503128e-07, + "loss": 1.1237, + "num_input_tokens_seen": 128160, + "step": 220 + }, + { + "epoch": 0.03351206434316354, + "grad_norm": 18.376718521118164, + "learning_rate": 8.340780458742925e-07, + "loss": 1.1886, + "num_input_tokens_seen": 131168, + "step": 225 + }, + { + "epoch": 0.03425677688412273, + "grad_norm": 10.922321319580078, + "learning_rate": 8.526958593982724e-07, + "loss": 0.9799, + "num_input_tokens_seen": 133888, + "step": 230 + }, + { + "epoch": 0.03500148942508192, + "grad_norm": 6.057826042175293, + "learning_rate": 8.713136729222521e-07, + "loss": 0.8781, + "num_input_tokens_seen": 136608, + "step": 235 + }, + { + "epoch": 0.035746201966041107, + "grad_norm": 7.977631092071533, + "learning_rate": 8.899314864462318e-07, + "loss": 1.0926, + "num_input_tokens_seen": 139840, + "step": 240 + }, + { + "epoch": 0.036490914507000295, + "grad_norm": 7.061324119567871, + "learning_rate": 9.085492999702115e-07, + "loss": 1.0264, + "num_input_tokens_seen": 143072, + "step": 245 + }, + { + "epoch": 0.03723562704795949, + "grad_norm": 7.556655406951904, + "learning_rate": 9.271671134941912e-07, + "loss": 0.5904, + "num_input_tokens_seen": 145920, + "step": 250 + }, + { + "epoch": 0.03798033958891868, + "grad_norm": 7.220543384552002, + "learning_rate": 9.457849270181709e-07, + "loss": 0.866, + "num_input_tokens_seen": 149088, + "step": 255 + }, + { + "epoch": 0.03872505212987787, + "grad_norm": 6.447434425354004, + "learning_rate": 9.644027405421507e-07, + "loss": 1.0469, + "num_input_tokens_seen": 152704, + "step": 260 + }, + { + "epoch": 0.03946976467083706, + "grad_norm": 4.263437271118164, + "learning_rate": 9.830205540661306e-07, + "loss": 0.8658, + "num_input_tokens_seen": 155552, + "step": 265 + }, + { + "epoch": 0.040214477211796246, + "grad_norm": 7.677776336669922, + "learning_rate": 1.0016383675901103e-06, + "loss": 1.1373, + "num_input_tokens_seen": 158496, + "step": 270 + }, + { + "epoch": 0.040959189752755434, + "grad_norm": 3.6514954566955566, + "learning_rate": 1.02025618111409e-06, + "loss": 1.0503, + "num_input_tokens_seen": 161536, + "step": 275 + }, + { + "epoch": 0.04170390229371462, + "grad_norm": 10.927120208740234, + "learning_rate": 1.0388739946380697e-06, + "loss": 0.9215, + "num_input_tokens_seen": 164384, + "step": 280 + }, + { + "epoch": 0.04244861483467382, + "grad_norm": 7.750858783721924, + "learning_rate": 1.0574918081620494e-06, + "loss": 0.9587, + "num_input_tokens_seen": 167200, + "step": 285 + }, + { + "epoch": 0.04319332737563301, + "grad_norm": 7.500394344329834, + "learning_rate": 1.0761096216860292e-06, + "loss": 0.9346, + "num_input_tokens_seen": 170208, + "step": 290 + }, + { + "epoch": 0.043938039916592196, + "grad_norm": 9.411067008972168, + "learning_rate": 1.0947274352100089e-06, + "loss": 0.9133, + "num_input_tokens_seen": 173408, + "step": 295 + }, + { + "epoch": 0.044682752457551385, + "grad_norm": 9.661206245422363, + "learning_rate": 1.1133452487339888e-06, + "loss": 0.793, + "num_input_tokens_seen": 176064, + "step": 300 + }, + { + "epoch": 0.045427464998510574, + "grad_norm": 13.12051010131836, + "learning_rate": 1.1319630622579685e-06, + "loss": 0.7625, + "num_input_tokens_seen": 178848, + "step": 305 + }, + { + "epoch": 0.04617217753946976, + "grad_norm": 6.991497039794922, + "learning_rate": 1.1505808757819482e-06, + "loss": 0.8548, + "num_input_tokens_seen": 181856, + "step": 310 + }, + { + "epoch": 0.04691689008042895, + "grad_norm": 8.0050048828125, + "learning_rate": 1.169198689305928e-06, + "loss": 0.74, + "num_input_tokens_seen": 184896, + "step": 315 + }, + { + "epoch": 0.04766160262138815, + "grad_norm": 9.177217483520508, + "learning_rate": 1.1878165028299077e-06, + "loss": 0.8964, + "num_input_tokens_seen": 187808, + "step": 320 + }, + { + "epoch": 0.048406315162347335, + "grad_norm": 4.758383274078369, + "learning_rate": 1.2064343163538874e-06, + "loss": 0.6091, + "num_input_tokens_seen": 190880, + "step": 325 + }, + { + "epoch": 0.049151027703306524, + "grad_norm": 9.271036148071289, + "learning_rate": 1.2250521298778673e-06, + "loss": 1.0402, + "num_input_tokens_seen": 193536, + "step": 330 + }, + { + "epoch": 0.04989574024426571, + "grad_norm": 6.073648452758789, + "learning_rate": 1.243669943401847e-06, + "loss": 1.0405, + "num_input_tokens_seen": 196448, + "step": 335 + }, + { + "epoch": 0.0506404527852249, + "grad_norm": 6.750680446624756, + "learning_rate": 1.2622877569258267e-06, + "loss": 0.6042, + "num_input_tokens_seen": 199136, + "step": 340 + }, + { + "epoch": 0.05138516532618409, + "grad_norm": 7.1182966232299805, + "learning_rate": 1.2809055704498064e-06, + "loss": 0.996, + "num_input_tokens_seen": 202240, + "step": 345 + }, + { + "epoch": 0.052129877867143286, + "grad_norm": 8.134061813354492, + "learning_rate": 1.2995233839737862e-06, + "loss": 0.7972, + "num_input_tokens_seen": 204896, + "step": 350 + }, + { + "epoch": 0.052874590408102475, + "grad_norm": 5.578507900238037, + "learning_rate": 1.3181411974977659e-06, + "loss": 0.8037, + "num_input_tokens_seen": 207552, + "step": 355 + }, + { + "epoch": 0.05361930294906166, + "grad_norm": 4.748986721038818, + "learning_rate": 1.3367590110217456e-06, + "loss": 0.7322, + "num_input_tokens_seen": 210240, + "step": 360 + }, + { + "epoch": 0.05436401549002085, + "grad_norm": 5.4837870597839355, + "learning_rate": 1.3553768245457255e-06, + "loss": 0.6653, + "num_input_tokens_seen": 213024, + "step": 365 + }, + { + "epoch": 0.05510872803098004, + "grad_norm": 3.643625020980835, + "learning_rate": 1.3739946380697052e-06, + "loss": 0.6667, + "num_input_tokens_seen": 215840, + "step": 370 + }, + { + "epoch": 0.05585344057193923, + "grad_norm": 9.170262336730957, + "learning_rate": 1.392612451593685e-06, + "loss": 0.7343, + "num_input_tokens_seen": 218528, + "step": 375 + }, + { + "epoch": 0.05659815311289842, + "grad_norm": 8.485528945922852, + "learning_rate": 1.4112302651176647e-06, + "loss": 0.9279, + "num_input_tokens_seen": 221824, + "step": 380 + }, + { + "epoch": 0.057342865653857614, + "grad_norm": 4.238389492034912, + "learning_rate": 1.4298480786416444e-06, + "loss": 0.6518, + "num_input_tokens_seen": 224480, + "step": 385 + }, + { + "epoch": 0.0580875781948168, + "grad_norm": 4.563525199890137, + "learning_rate": 1.448465892165624e-06, + "loss": 0.6231, + "num_input_tokens_seen": 227296, + "step": 390 + }, + { + "epoch": 0.05883229073577599, + "grad_norm": 7.642597675323486, + "learning_rate": 1.4670837056896038e-06, + "loss": 0.8925, + "num_input_tokens_seen": 229920, + "step": 395 + }, + { + "epoch": 0.05957700327673518, + "grad_norm": 4.677399158477783, + "learning_rate": 1.4857015192135837e-06, + "loss": 0.6556, + "num_input_tokens_seen": 232512, + "step": 400 + }, + { + "epoch": 0.06032171581769437, + "grad_norm": 4.071849822998047, + "learning_rate": 1.5043193327375634e-06, + "loss": 0.799, + "num_input_tokens_seen": 235584, + "step": 405 + }, + { + "epoch": 0.06106642835865356, + "grad_norm": 7.11200475692749, + "learning_rate": 1.5229371462615432e-06, + "loss": 0.9613, + "num_input_tokens_seen": 238496, + "step": 410 + }, + { + "epoch": 0.06181114089961275, + "grad_norm": 3.341383934020996, + "learning_rate": 1.5415549597855229e-06, + "loss": 0.7163, + "num_input_tokens_seen": 241376, + "step": 415 + }, + { + "epoch": 0.06255585344057193, + "grad_norm": 5.622413158416748, + "learning_rate": 1.5601727733095026e-06, + "loss": 0.7431, + "num_input_tokens_seen": 244320, + "step": 420 + }, + { + "epoch": 0.06330056598153112, + "grad_norm": 7.113465785980225, + "learning_rate": 1.5787905868334823e-06, + "loss": 0.9822, + "num_input_tokens_seen": 247488, + "step": 425 + }, + { + "epoch": 0.06404527852249031, + "grad_norm": 7.674259185791016, + "learning_rate": 1.597408400357462e-06, + "loss": 1.0591, + "num_input_tokens_seen": 250208, + "step": 430 + }, + { + "epoch": 0.06478999106344951, + "grad_norm": 3.212587356567383, + "learning_rate": 1.6160262138814417e-06, + "loss": 0.803, + "num_input_tokens_seen": 253120, + "step": 435 + }, + { + "epoch": 0.0655347036044087, + "grad_norm": 5.438559055328369, + "learning_rate": 1.6346440274054214e-06, + "loss": 0.6733, + "num_input_tokens_seen": 256032, + "step": 440 + }, + { + "epoch": 0.06627941614536789, + "grad_norm": 6.124120712280273, + "learning_rate": 1.6532618409294012e-06, + "loss": 0.9301, + "num_input_tokens_seen": 258912, + "step": 445 + }, + { + "epoch": 0.06702412868632708, + "grad_norm": 6.268150329589844, + "learning_rate": 1.6718796544533813e-06, + "loss": 0.7574, + "num_input_tokens_seen": 261728, + "step": 450 + }, + { + "epoch": 0.06776884122728627, + "grad_norm": 3.686204433441162, + "learning_rate": 1.690497467977361e-06, + "loss": 0.6012, + "num_input_tokens_seen": 264608, + "step": 455 + }, + { + "epoch": 0.06851355376824546, + "grad_norm": 4.40387487411499, + "learning_rate": 1.7091152815013407e-06, + "loss": 0.7136, + "num_input_tokens_seen": 267744, + "step": 460 + }, + { + "epoch": 0.06925826630920465, + "grad_norm": 5.722039699554443, + "learning_rate": 1.7277330950253204e-06, + "loss": 0.7868, + "num_input_tokens_seen": 270912, + "step": 465 + }, + { + "epoch": 0.07000297885016384, + "grad_norm": 5.933197498321533, + "learning_rate": 1.7463509085493002e-06, + "loss": 0.9032, + "num_input_tokens_seen": 273824, + "step": 470 + }, + { + "epoch": 0.07074769139112302, + "grad_norm": 4.271512031555176, + "learning_rate": 1.7649687220732799e-06, + "loss": 0.7897, + "num_input_tokens_seen": 276960, + "step": 475 + }, + { + "epoch": 0.07149240393208221, + "grad_norm": 5.1438069343566895, + "learning_rate": 1.7835865355972596e-06, + "loss": 0.6496, + "num_input_tokens_seen": 279712, + "step": 480 + }, + { + "epoch": 0.0722371164730414, + "grad_norm": 7.929337978363037, + "learning_rate": 1.8022043491212393e-06, + "loss": 0.7436, + "num_input_tokens_seen": 282592, + "step": 485 + }, + { + "epoch": 0.07298182901400059, + "grad_norm": 5.440913200378418, + "learning_rate": 1.820822162645219e-06, + "loss": 0.9846, + "num_input_tokens_seen": 285472, + "step": 490 + }, + { + "epoch": 0.07372654155495978, + "grad_norm": 6.502635955810547, + "learning_rate": 1.8394399761691987e-06, + "loss": 0.6809, + "num_input_tokens_seen": 288544, + "step": 495 + }, + { + "epoch": 0.07447125409591898, + "grad_norm": 4.102325439453125, + "learning_rate": 1.8580577896931784e-06, + "loss": 0.7028, + "num_input_tokens_seen": 291552, + "step": 500 + }, + { + "epoch": 0.07521596663687817, + "grad_norm": 4.989837646484375, + "learning_rate": 1.8766756032171582e-06, + "loss": 0.7074, + "num_input_tokens_seen": 294432, + "step": 505 + }, + { + "epoch": 0.07596067917783736, + "grad_norm": 4.564352512359619, + "learning_rate": 1.8952934167411379e-06, + "loss": 0.7605, + "num_input_tokens_seen": 297344, + "step": 510 + }, + { + "epoch": 0.07670539171879655, + "grad_norm": 6.701633453369141, + "learning_rate": 1.913911230265118e-06, + "loss": 0.8859, + "num_input_tokens_seen": 300416, + "step": 515 + }, + { + "epoch": 0.07745010425975574, + "grad_norm": 3.6834170818328857, + "learning_rate": 1.9325290437890977e-06, + "loss": 0.7464, + "num_input_tokens_seen": 303520, + "step": 520 + }, + { + "epoch": 0.07819481680071493, + "grad_norm": 7.098754405975342, + "learning_rate": 1.9511468573130772e-06, + "loss": 0.6822, + "num_input_tokens_seen": 306208, + "step": 525 + }, + { + "epoch": 0.07893952934167411, + "grad_norm": 9.105101585388184, + "learning_rate": 1.969764670837057e-06, + "loss": 0.8026, + "num_input_tokens_seen": 309504, + "step": 530 + }, + { + "epoch": 0.0796842418826333, + "grad_norm": 5.209368705749512, + "learning_rate": 1.9883824843610367e-06, + "loss": 0.7687, + "num_input_tokens_seen": 312384, + "step": 535 + }, + { + "epoch": 0.08042895442359249, + "grad_norm": 2.7887208461761475, + "learning_rate": 2.0070002978850166e-06, + "loss": 0.8412, + "num_input_tokens_seen": 315072, + "step": 540 + }, + { + "epoch": 0.08117366696455168, + "grad_norm": 3.135200262069702, + "learning_rate": 2.025618111408996e-06, + "loss": 0.8303, + "num_input_tokens_seen": 318080, + "step": 545 + }, + { + "epoch": 0.08191837950551087, + "grad_norm": 5.7329840660095215, + "learning_rate": 2.044235924932976e-06, + "loss": 0.7719, + "num_input_tokens_seen": 320960, + "step": 550 + }, + { + "epoch": 0.08266309204647006, + "grad_norm": 6.926466464996338, + "learning_rate": 2.0628537384569555e-06, + "loss": 0.8099, + "num_input_tokens_seen": 324160, + "step": 555 + }, + { + "epoch": 0.08340780458742925, + "grad_norm": 5.635309219360352, + "learning_rate": 2.0814715519809354e-06, + "loss": 0.749, + "num_input_tokens_seen": 326784, + "step": 560 + }, + { + "epoch": 0.08415251712838845, + "grad_norm": 4.088613986968994, + "learning_rate": 2.100089365504915e-06, + "loss": 0.6714, + "num_input_tokens_seen": 329696, + "step": 565 + }, + { + "epoch": 0.08489722966934764, + "grad_norm": 7.652744293212891, + "learning_rate": 2.118707179028895e-06, + "loss": 0.7716, + "num_input_tokens_seen": 333280, + "step": 570 + }, + { + "epoch": 0.08564194221030683, + "grad_norm": 4.505173683166504, + "learning_rate": 2.1373249925528744e-06, + "loss": 0.8009, + "num_input_tokens_seen": 336160, + "step": 575 + }, + { + "epoch": 0.08638665475126601, + "grad_norm": 5.5360107421875, + "learning_rate": 2.1559428060768547e-06, + "loss": 0.6427, + "num_input_tokens_seen": 338880, + "step": 580 + }, + { + "epoch": 0.0871313672922252, + "grad_norm": 3.003453493118286, + "learning_rate": 2.1745606196008342e-06, + "loss": 0.7976, + "num_input_tokens_seen": 341824, + "step": 585 + }, + { + "epoch": 0.08787607983318439, + "grad_norm": 4.048456192016602, + "learning_rate": 2.193178433124814e-06, + "loss": 0.696, + "num_input_tokens_seen": 344672, + "step": 590 + }, + { + "epoch": 0.08862079237414358, + "grad_norm": 3.669048309326172, + "learning_rate": 2.2117962466487937e-06, + "loss": 0.6415, + "num_input_tokens_seen": 347424, + "step": 595 + }, + { + "epoch": 0.08936550491510277, + "grad_norm": 2.914979934692383, + "learning_rate": 2.2304140601727736e-06, + "loss": 0.7075, + "num_input_tokens_seen": 350272, + "step": 600 + }, + { + "epoch": 0.09011021745606196, + "grad_norm": 4.8059186935424805, + "learning_rate": 2.249031873696753e-06, + "loss": 0.7014, + "num_input_tokens_seen": 353248, + "step": 605 + }, + { + "epoch": 0.09085492999702115, + "grad_norm": 3.080714464187622, + "learning_rate": 2.267649687220733e-06, + "loss": 0.7308, + "num_input_tokens_seen": 356096, + "step": 610 + }, + { + "epoch": 0.09159964253798034, + "grad_norm": 5.71976900100708, + "learning_rate": 2.2862675007447125e-06, + "loss": 0.8127, + "num_input_tokens_seen": 358784, + "step": 615 + }, + { + "epoch": 0.09234435507893952, + "grad_norm": 4.764165878295898, + "learning_rate": 2.3048853142686924e-06, + "loss": 0.7339, + "num_input_tokens_seen": 361536, + "step": 620 + }, + { + "epoch": 0.09308906761989871, + "grad_norm": 6.870626449584961, + "learning_rate": 2.323503127792672e-06, + "loss": 0.81, + "num_input_tokens_seen": 364512, + "step": 625 + }, + { + "epoch": 0.0938337801608579, + "grad_norm": 3.554745674133301, + "learning_rate": 2.342120941316652e-06, + "loss": 0.5694, + "num_input_tokens_seen": 367424, + "step": 630 + }, + { + "epoch": 0.0945784927018171, + "grad_norm": 3.1852521896362305, + "learning_rate": 2.3607387548406314e-06, + "loss": 0.7141, + "num_input_tokens_seen": 370208, + "step": 635 + }, + { + "epoch": 0.0953232052427763, + "grad_norm": 5.33660888671875, + "learning_rate": 2.3793565683646113e-06, + "loss": 0.8454, + "num_input_tokens_seen": 373248, + "step": 640 + }, + { + "epoch": 0.09606791778373548, + "grad_norm": 4.557825088500977, + "learning_rate": 2.3979743818885912e-06, + "loss": 0.8899, + "num_input_tokens_seen": 376192, + "step": 645 + }, + { + "epoch": 0.09681263032469467, + "grad_norm": 3.6935510635375977, + "learning_rate": 2.416592195412571e-06, + "loss": 0.7172, + "num_input_tokens_seen": 378912, + "step": 650 + }, + { + "epoch": 0.09755734286565386, + "grad_norm": 5.232733726501465, + "learning_rate": 2.4352100089365507e-06, + "loss": 0.7742, + "num_input_tokens_seen": 381984, + "step": 655 + }, + { + "epoch": 0.09830205540661305, + "grad_norm": 6.194607734680176, + "learning_rate": 2.4538278224605306e-06, + "loss": 0.9877, + "num_input_tokens_seen": 384800, + "step": 660 + }, + { + "epoch": 0.09904676794757224, + "grad_norm": 3.2167389392852783, + "learning_rate": 2.47244563598451e-06, + "loss": 0.7495, + "num_input_tokens_seen": 387712, + "step": 665 + }, + { + "epoch": 0.09979148048853143, + "grad_norm": 4.092103481292725, + "learning_rate": 2.49106344950849e-06, + "loss": 0.6759, + "num_input_tokens_seen": 390656, + "step": 670 + }, + { + "epoch": 0.10053619302949061, + "grad_norm": 2.912137985229492, + "learning_rate": 2.5096812630324695e-06, + "loss": 0.8335, + "num_input_tokens_seen": 393504, + "step": 675 + }, + { + "epoch": 0.1012809055704498, + "grad_norm": 2.0730140209198, + "learning_rate": 2.5282990765564494e-06, + "loss": 0.6444, + "num_input_tokens_seen": 396256, + "step": 680 + }, + { + "epoch": 0.10202561811140899, + "grad_norm": 3.9909780025482178, + "learning_rate": 2.546916890080429e-06, + "loss": 0.6714, + "num_input_tokens_seen": 399328, + "step": 685 + }, + { + "epoch": 0.10277033065236818, + "grad_norm": 2.9248616695404053, + "learning_rate": 2.565534703604409e-06, + "loss": 0.7426, + "num_input_tokens_seen": 402080, + "step": 690 + }, + { + "epoch": 0.10351504319332737, + "grad_norm": 2.8347668647766113, + "learning_rate": 2.5841525171283884e-06, + "loss": 0.7907, + "num_input_tokens_seen": 404896, + "step": 695 + }, + { + "epoch": 0.10425975573428657, + "grad_norm": 3.7864692211151123, + "learning_rate": 2.6027703306523683e-06, + "loss": 0.8646, + "num_input_tokens_seen": 407552, + "step": 700 + }, + { + "epoch": 0.10500446827524576, + "grad_norm": 5.765258312225342, + "learning_rate": 2.621388144176348e-06, + "loss": 0.8856, + "num_input_tokens_seen": 410592, + "step": 705 + }, + { + "epoch": 0.10574918081620495, + "grad_norm": 5.1062541007995605, + "learning_rate": 2.6400059577003277e-06, + "loss": 0.771, + "num_input_tokens_seen": 413376, + "step": 710 + }, + { + "epoch": 0.10649389335716414, + "grad_norm": 2.813148021697998, + "learning_rate": 2.6586237712243077e-06, + "loss": 0.6754, + "num_input_tokens_seen": 416160, + "step": 715 + }, + { + "epoch": 0.10723860589812333, + "grad_norm": 3.8306455612182617, + "learning_rate": 2.6772415847482876e-06, + "loss": 0.7609, + "num_input_tokens_seen": 419168, + "step": 720 + }, + { + "epoch": 0.10798331843908252, + "grad_norm": 3.0408310890197754, + "learning_rate": 2.695859398272267e-06, + "loss": 0.6979, + "num_input_tokens_seen": 422112, + "step": 725 + }, + { + "epoch": 0.1087280309800417, + "grad_norm": 6.5481109619140625, + "learning_rate": 2.714477211796247e-06, + "loss": 0.9066, + "num_input_tokens_seen": 424960, + "step": 730 + }, + { + "epoch": 0.10947274352100089, + "grad_norm": 3.487739324569702, + "learning_rate": 2.7330950253202265e-06, + "loss": 0.7535, + "num_input_tokens_seen": 427872, + "step": 735 + }, + { + "epoch": 0.11021745606196008, + "grad_norm": 2.2289512157440186, + "learning_rate": 2.7517128388442064e-06, + "loss": 0.7038, + "num_input_tokens_seen": 431008, + "step": 740 + }, + { + "epoch": 0.11096216860291927, + "grad_norm": 7.011357307434082, + "learning_rate": 2.770330652368186e-06, + "loss": 0.7445, + "num_input_tokens_seen": 434016, + "step": 745 + }, + { + "epoch": 0.11170688114387846, + "grad_norm": 3.4955976009368896, + "learning_rate": 2.788948465892166e-06, + "loss": 0.6516, + "num_input_tokens_seen": 436736, + "step": 750 + }, + { + "epoch": 0.11245159368483765, + "grad_norm": 2.91294264793396, + "learning_rate": 2.8075662794161454e-06, + "loss": 0.7345, + "num_input_tokens_seen": 439776, + "step": 755 + }, + { + "epoch": 0.11319630622579684, + "grad_norm": 2.9869754314422607, + "learning_rate": 2.8261840929401253e-06, + "loss": 0.7247, + "num_input_tokens_seen": 442592, + "step": 760 + }, + { + "epoch": 0.11394101876675604, + "grad_norm": 4.076152801513672, + "learning_rate": 2.844801906464105e-06, + "loss": 0.709, + "num_input_tokens_seen": 445568, + "step": 765 + }, + { + "epoch": 0.11468573130771523, + "grad_norm": 5.667839050292969, + "learning_rate": 2.8634197199880847e-06, + "loss": 0.818, + "num_input_tokens_seen": 448480, + "step": 770 + }, + { + "epoch": 0.11543044384867442, + "grad_norm": 4.754634857177734, + "learning_rate": 2.8820375335120642e-06, + "loss": 0.7635, + "num_input_tokens_seen": 451168, + "step": 775 + }, + { + "epoch": 0.1161751563896336, + "grad_norm": 3.560652732849121, + "learning_rate": 2.9006553470360446e-06, + "loss": 0.7071, + "num_input_tokens_seen": 454496, + "step": 780 + }, + { + "epoch": 0.1169198689305928, + "grad_norm": 3.341341495513916, + "learning_rate": 2.919273160560024e-06, + "loss": 0.7652, + "num_input_tokens_seen": 457824, + "step": 785 + }, + { + "epoch": 0.11766458147155198, + "grad_norm": 2.8479666709899902, + "learning_rate": 2.937890974084004e-06, + "loss": 0.7412, + "num_input_tokens_seen": 460768, + "step": 790 + }, + { + "epoch": 0.11840929401251117, + "grad_norm": 3.305732488632202, + "learning_rate": 2.9565087876079835e-06, + "loss": 0.7525, + "num_input_tokens_seen": 463680, + "step": 795 + }, + { + "epoch": 0.11915400655347036, + "grad_norm": 3.7441744804382324, + "learning_rate": 2.9751266011319634e-06, + "loss": 0.8386, + "num_input_tokens_seen": 466624, + "step": 800 + }, + { + "epoch": 0.11989871909442955, + "grad_norm": 8.290502548217773, + "learning_rate": 2.993744414655943e-06, + "loss": 0.734, + "num_input_tokens_seen": 469568, + "step": 805 + }, + { + "epoch": 0.12064343163538874, + "grad_norm": 4.854254245758057, + "learning_rate": 3.012362228179923e-06, + "loss": 0.7521, + "num_input_tokens_seen": 472352, + "step": 810 + }, + { + "epoch": 0.12138814417634793, + "grad_norm": 5.44564962387085, + "learning_rate": 3.0309800417039024e-06, + "loss": 0.8272, + "num_input_tokens_seen": 475872, + "step": 815 + }, + { + "epoch": 0.12213285671730711, + "grad_norm": 3.100677728652954, + "learning_rate": 3.0495978552278823e-06, + "loss": 0.6034, + "num_input_tokens_seen": 478912, + "step": 820 + }, + { + "epoch": 0.1228775692582663, + "grad_norm": 8.532440185546875, + "learning_rate": 3.068215668751862e-06, + "loss": 0.7202, + "num_input_tokens_seen": 481600, + "step": 825 + }, + { + "epoch": 0.1236222817992255, + "grad_norm": 5.529086112976074, + "learning_rate": 3.0868334822758417e-06, + "loss": 0.7328, + "num_input_tokens_seen": 484576, + "step": 830 + }, + { + "epoch": 0.1243669943401847, + "grad_norm": 3.53167724609375, + "learning_rate": 3.1054512957998212e-06, + "loss": 0.6931, + "num_input_tokens_seen": 487520, + "step": 835 + }, + { + "epoch": 0.12511170688114387, + "grad_norm": 3.8304641246795654, + "learning_rate": 3.124069109323801e-06, + "loss": 0.6983, + "num_input_tokens_seen": 490496, + "step": 840 + }, + { + "epoch": 0.12585641942210307, + "grad_norm": 3.17417311668396, + "learning_rate": 3.142686922847781e-06, + "loss": 0.71, + "num_input_tokens_seen": 493408, + "step": 845 + }, + { + "epoch": 0.12660113196306225, + "grad_norm": 3.4440906047821045, + "learning_rate": 3.1613047363717606e-06, + "loss": 0.8181, + "num_input_tokens_seen": 496096, + "step": 850 + }, + { + "epoch": 0.12734584450402145, + "grad_norm": 3.175403118133545, + "learning_rate": 3.1799225498957405e-06, + "loss": 0.6536, + "num_input_tokens_seen": 498912, + "step": 855 + }, + { + "epoch": 0.12809055704498062, + "grad_norm": 4.3531060218811035, + "learning_rate": 3.19854036341972e-06, + "loss": 0.7174, + "num_input_tokens_seen": 501856, + "step": 860 + }, + { + "epoch": 0.12883526958593983, + "grad_norm": 3.8875298500061035, + "learning_rate": 3.2171581769437e-06, + "loss": 0.7349, + "num_input_tokens_seen": 504736, + "step": 865 + }, + { + "epoch": 0.12957998212689903, + "grad_norm": 2.7472589015960693, + "learning_rate": 3.2357759904676794e-06, + "loss": 0.7201, + "num_input_tokens_seen": 507456, + "step": 870 + }, + { + "epoch": 0.1303246946678582, + "grad_norm": 10.806256294250488, + "learning_rate": 3.2543938039916594e-06, + "loss": 0.8526, + "num_input_tokens_seen": 510432, + "step": 875 + }, + { + "epoch": 0.1310694072088174, + "grad_norm": 3.4850986003875732, + "learning_rate": 3.2730116175156393e-06, + "loss": 0.7576, + "num_input_tokens_seen": 513216, + "step": 880 + }, + { + "epoch": 0.13181411974977658, + "grad_norm": 4.85115385055542, + "learning_rate": 3.291629431039619e-06, + "loss": 0.7103, + "num_input_tokens_seen": 516256, + "step": 885 + }, + { + "epoch": 0.13255883229073578, + "grad_norm": 5.485177993774414, + "learning_rate": 3.3102472445635987e-06, + "loss": 0.7738, + "num_input_tokens_seen": 519296, + "step": 890 + }, + { + "epoch": 0.13330354483169496, + "grad_norm": 3.8094146251678467, + "learning_rate": 3.3288650580875782e-06, + "loss": 0.6948, + "num_input_tokens_seen": 522112, + "step": 895 + }, + { + "epoch": 0.13404825737265416, + "grad_norm": 2.4731554985046387, + "learning_rate": 3.347482871611558e-06, + "loss": 0.7708, + "num_input_tokens_seen": 525376, + "step": 900 + }, + { + "epoch": 0.13479296991361334, + "grad_norm": 4.6741228103637695, + "learning_rate": 3.3661006851355377e-06, + "loss": 0.713, + "num_input_tokens_seen": 528192, + "step": 905 + }, + { + "epoch": 0.13553768245457254, + "grad_norm": 2.880326747894287, + "learning_rate": 3.3847184986595176e-06, + "loss": 0.8305, + "num_input_tokens_seen": 531136, + "step": 910 + }, + { + "epoch": 0.1362823949955317, + "grad_norm": 4.945480823516846, + "learning_rate": 3.403336312183497e-06, + "loss": 0.8253, + "num_input_tokens_seen": 534144, + "step": 915 + }, + { + "epoch": 0.13702710753649092, + "grad_norm": 3.8040266036987305, + "learning_rate": 3.421954125707477e-06, + "loss": 0.7497, + "num_input_tokens_seen": 536832, + "step": 920 + }, + { + "epoch": 0.1377718200774501, + "grad_norm": 4.342973232269287, + "learning_rate": 3.4405719392314565e-06, + "loss": 0.7402, + "num_input_tokens_seen": 539328, + "step": 925 + }, + { + "epoch": 0.1385165326184093, + "grad_norm": 4.150696277618408, + "learning_rate": 3.4591897527554364e-06, + "loss": 0.7204, + "num_input_tokens_seen": 542208, + "step": 930 + }, + { + "epoch": 0.1392612451593685, + "grad_norm": 4.016871452331543, + "learning_rate": 3.477807566279416e-06, + "loss": 0.5514, + "num_input_tokens_seen": 545088, + "step": 935 + }, + { + "epoch": 0.14000595770032767, + "grad_norm": 4.529871463775635, + "learning_rate": 3.496425379803396e-06, + "loss": 0.7182, + "num_input_tokens_seen": 547776, + "step": 940 + }, + { + "epoch": 0.14075067024128687, + "grad_norm": 3.8733394145965576, + "learning_rate": 3.5150431933273762e-06, + "loss": 0.7178, + "num_input_tokens_seen": 550400, + "step": 945 + }, + { + "epoch": 0.14149538278224605, + "grad_norm": 5.651612281799316, + "learning_rate": 3.5336610068513553e-06, + "loss": 0.7161, + "num_input_tokens_seen": 553184, + "step": 950 + }, + { + "epoch": 0.14224009532320525, + "grad_norm": 7.635330677032471, + "learning_rate": 3.5522788203753356e-06, + "loss": 0.7852, + "num_input_tokens_seen": 556096, + "step": 955 + }, + { + "epoch": 0.14298480786416443, + "grad_norm": 3.8751416206359863, + "learning_rate": 3.570896633899315e-06, + "loss": 0.7308, + "num_input_tokens_seen": 558784, + "step": 960 + }, + { + "epoch": 0.14372952040512363, + "grad_norm": 5.166362285614014, + "learning_rate": 3.589514447423295e-06, + "loss": 0.8799, + "num_input_tokens_seen": 561760, + "step": 965 + }, + { + "epoch": 0.1444742329460828, + "grad_norm": 4.178103446960449, + "learning_rate": 3.6081322609472746e-06, + "loss": 0.7196, + "num_input_tokens_seen": 564544, + "step": 970 + }, + { + "epoch": 0.145218945487042, + "grad_norm": 2.6960041522979736, + "learning_rate": 3.6267500744712545e-06, + "loss": 0.6004, + "num_input_tokens_seen": 567200, + "step": 975 + }, + { + "epoch": 0.14596365802800118, + "grad_norm": 3.9615819454193115, + "learning_rate": 3.645367887995234e-06, + "loss": 0.6157, + "num_input_tokens_seen": 570240, + "step": 980 + }, + { + "epoch": 0.14670837056896038, + "grad_norm": 4.863311767578125, + "learning_rate": 3.663985701519214e-06, + "loss": 0.7004, + "num_input_tokens_seen": 572896, + "step": 985 + }, + { + "epoch": 0.14745308310991956, + "grad_norm": 4.069835186004639, + "learning_rate": 3.6826035150431934e-06, + "loss": 0.7115, + "num_input_tokens_seen": 575744, + "step": 990 + }, + { + "epoch": 0.14819779565087876, + "grad_norm": 3.2916951179504395, + "learning_rate": 3.7012213285671734e-06, + "loss": 0.6919, + "num_input_tokens_seen": 578848, + "step": 995 + }, + { + "epoch": 0.14894250819183796, + "grad_norm": 6.6495137214660645, + "learning_rate": 3.719839142091153e-06, + "loss": 0.8411, + "num_input_tokens_seen": 581504, + "step": 1000 + }, + { + "epoch": 0.14968722073279714, + "grad_norm": 7.229043006896973, + "learning_rate": 3.738456955615133e-06, + "loss": 0.7478, + "num_input_tokens_seen": 584256, + "step": 1005 + }, + { + "epoch": 0.15043193327375634, + "grad_norm": 8.43328857421875, + "learning_rate": 3.7570747691391127e-06, + "loss": 0.8086, + "num_input_tokens_seen": 586912, + "step": 1010 + }, + { + "epoch": 0.15117664581471552, + "grad_norm": 3.857494354248047, + "learning_rate": 3.7756925826630922e-06, + "loss": 0.6916, + "num_input_tokens_seen": 589920, + "step": 1015 + }, + { + "epoch": 0.15192135835567472, + "grad_norm": 4.77067232131958, + "learning_rate": 3.794310396187072e-06, + "loss": 0.7222, + "num_input_tokens_seen": 592992, + "step": 1020 + }, + { + "epoch": 0.1526660708966339, + "grad_norm": 4.233238697052002, + "learning_rate": 3.8129282097110517e-06, + "loss": 0.6661, + "num_input_tokens_seen": 595872, + "step": 1025 + }, + { + "epoch": 0.1534107834375931, + "grad_norm": 3.4719624519348145, + "learning_rate": 3.831546023235032e-06, + "loss": 0.7223, + "num_input_tokens_seen": 599008, + "step": 1030 + }, + { + "epoch": 0.15415549597855227, + "grad_norm": 5.505776405334473, + "learning_rate": 3.850163836759011e-06, + "loss": 0.7227, + "num_input_tokens_seen": 601792, + "step": 1035 + }, + { + "epoch": 0.15490020851951147, + "grad_norm": 4.668536186218262, + "learning_rate": 3.8687816502829914e-06, + "loss": 0.6208, + "num_input_tokens_seen": 604608, + "step": 1040 + }, + { + "epoch": 0.15564492106047065, + "grad_norm": 3.929109811782837, + "learning_rate": 3.8873994638069705e-06, + "loss": 0.7997, + "num_input_tokens_seen": 607168, + "step": 1045 + }, + { + "epoch": 0.15638963360142985, + "grad_norm": 3.7376718521118164, + "learning_rate": 3.9060172773309504e-06, + "loss": 0.7294, + "num_input_tokens_seen": 610048, + "step": 1050 + }, + { + "epoch": 0.15713434614238903, + "grad_norm": 5.3072052001953125, + "learning_rate": 3.92463509085493e-06, + "loss": 0.6709, + "num_input_tokens_seen": 612864, + "step": 1055 + }, + { + "epoch": 0.15787905868334823, + "grad_norm": 2.969412326812744, + "learning_rate": 3.94325290437891e-06, + "loss": 0.7238, + "num_input_tokens_seen": 616096, + "step": 1060 + }, + { + "epoch": 0.15862377122430743, + "grad_norm": 4.143185615539551, + "learning_rate": 3.961870717902889e-06, + "loss": 0.7193, + "num_input_tokens_seen": 618784, + "step": 1065 + }, + { + "epoch": 0.1593684837652666, + "grad_norm": 4.40792179107666, + "learning_rate": 3.980488531426869e-06, + "loss": 0.7104, + "num_input_tokens_seen": 621600, + "step": 1070 + }, + { + "epoch": 0.1601131963062258, + "grad_norm": 4.296742916107178, + "learning_rate": 3.999106344950849e-06, + "loss": 0.7398, + "num_input_tokens_seen": 624704, + "step": 1075 + }, + { + "epoch": 0.16085790884718498, + "grad_norm": 4.261825084686279, + "learning_rate": 4.017724158474829e-06, + "loss": 0.8088, + "num_input_tokens_seen": 627808, + "step": 1080 + }, + { + "epoch": 0.16160262138814419, + "grad_norm": 4.541808605194092, + "learning_rate": 4.036341971998809e-06, + "loss": 0.8075, + "num_input_tokens_seen": 630688, + "step": 1085 + }, + { + "epoch": 0.16234733392910336, + "grad_norm": 3.5159831047058105, + "learning_rate": 4.054959785522788e-06, + "loss": 0.7266, + "num_input_tokens_seen": 633600, + "step": 1090 + }, + { + "epoch": 0.16309204647006256, + "grad_norm": 2.379495143890381, + "learning_rate": 4.073577599046768e-06, + "loss": 0.7559, + "num_input_tokens_seen": 636672, + "step": 1095 + }, + { + "epoch": 0.16383675901102174, + "grad_norm": 7.567749500274658, + "learning_rate": 4.092195412570748e-06, + "loss": 0.7626, + "num_input_tokens_seen": 639168, + "step": 1100 + }, + { + "epoch": 0.16458147155198094, + "grad_norm": 4.199253559112549, + "learning_rate": 4.110813226094728e-06, + "loss": 0.7512, + "num_input_tokens_seen": 641984, + "step": 1105 + }, + { + "epoch": 0.16532618409294011, + "grad_norm": 4.1342573165893555, + "learning_rate": 4.129431039618707e-06, + "loss": 0.7192, + "num_input_tokens_seen": 645408, + "step": 1110 + }, + { + "epoch": 0.16607089663389932, + "grad_norm": 3.3130712509155273, + "learning_rate": 4.148048853142687e-06, + "loss": 0.8555, + "num_input_tokens_seen": 648032, + "step": 1115 + }, + { + "epoch": 0.1668156091748585, + "grad_norm": 3.9384493827819824, + "learning_rate": 4.166666666666667e-06, + "loss": 0.7054, + "num_input_tokens_seen": 651232, + "step": 1120 + }, + { + "epoch": 0.1675603217158177, + "grad_norm": 4.096521377563477, + "learning_rate": 4.185284480190647e-06, + "loss": 0.6966, + "num_input_tokens_seen": 654112, + "step": 1125 + }, + { + "epoch": 0.1683050342567769, + "grad_norm": 3.7833809852600098, + "learning_rate": 4.203902293714626e-06, + "loss": 0.6729, + "num_input_tokens_seen": 657184, + "step": 1130 + }, + { + "epoch": 0.16904974679773607, + "grad_norm": 4.247753620147705, + "learning_rate": 4.222520107238606e-06, + "loss": 0.7181, + "num_input_tokens_seen": 659936, + "step": 1135 + }, + { + "epoch": 0.16979445933869527, + "grad_norm": 3.4927144050598145, + "learning_rate": 4.241137920762586e-06, + "loss": 0.6583, + "num_input_tokens_seen": 662880, + "step": 1140 + }, + { + "epoch": 0.17053917187965445, + "grad_norm": 4.50593900680542, + "learning_rate": 4.259755734286566e-06, + "loss": 0.792, + "num_input_tokens_seen": 665760, + "step": 1145 + }, + { + "epoch": 0.17128388442061365, + "grad_norm": 6.08641242980957, + "learning_rate": 4.278373547810546e-06, + "loss": 0.8351, + "num_input_tokens_seen": 668672, + "step": 1150 + }, + { + "epoch": 0.17202859696157283, + "grad_norm": 8.265390396118164, + "learning_rate": 4.296991361334525e-06, + "loss": 0.7889, + "num_input_tokens_seen": 671424, + "step": 1155 + }, + { + "epoch": 0.17277330950253203, + "grad_norm": 7.26796817779541, + "learning_rate": 4.3156091748585054e-06, + "loss": 0.8382, + "num_input_tokens_seen": 674176, + "step": 1160 + }, + { + "epoch": 0.1735180220434912, + "grad_norm": 4.070652961730957, + "learning_rate": 4.3342269883824845e-06, + "loss": 0.7197, + "num_input_tokens_seen": 676992, + "step": 1165 + }, + { + "epoch": 0.1742627345844504, + "grad_norm": 3.4586827754974365, + "learning_rate": 4.3528448019064644e-06, + "loss": 0.674, + "num_input_tokens_seen": 679936, + "step": 1170 + }, + { + "epoch": 0.17500744712540958, + "grad_norm": 2.854335069656372, + "learning_rate": 4.3714626154304435e-06, + "loss": 0.612, + "num_input_tokens_seen": 682656, + "step": 1175 + }, + { + "epoch": 0.17575215966636878, + "grad_norm": 4.367297649383545, + "learning_rate": 4.390080428954424e-06, + "loss": 0.7526, + "num_input_tokens_seen": 685376, + "step": 1180 + }, + { + "epoch": 0.17649687220732796, + "grad_norm": 4.5636372566223145, + "learning_rate": 4.408698242478403e-06, + "loss": 0.7914, + "num_input_tokens_seen": 688000, + "step": 1185 + }, + { + "epoch": 0.17724158474828716, + "grad_norm": 3.611304521560669, + "learning_rate": 4.427316056002383e-06, + "loss": 0.6554, + "num_input_tokens_seen": 690752, + "step": 1190 + }, + { + "epoch": 0.17798629728924636, + "grad_norm": 7.332277297973633, + "learning_rate": 4.445933869526362e-06, + "loss": 0.6561, + "num_input_tokens_seen": 693696, + "step": 1195 + }, + { + "epoch": 0.17873100983020554, + "grad_norm": 3.0341997146606445, + "learning_rate": 4.464551683050343e-06, + "loss": 0.8017, + "num_input_tokens_seen": 696832, + "step": 1200 + }, + { + "epoch": 0.17947572237116474, + "grad_norm": 3.6951427459716797, + "learning_rate": 4.483169496574322e-06, + "loss": 0.8048, + "num_input_tokens_seen": 700192, + "step": 1205 + }, + { + "epoch": 0.18022043491212392, + "grad_norm": 6.446733474731445, + "learning_rate": 4.501787310098302e-06, + "loss": 0.7145, + "num_input_tokens_seen": 702976, + "step": 1210 + }, + { + "epoch": 0.18096514745308312, + "grad_norm": 4.406558513641357, + "learning_rate": 4.520405123622282e-06, + "loss": 0.6268, + "num_input_tokens_seen": 705696, + "step": 1215 + }, + { + "epoch": 0.1817098599940423, + "grad_norm": 3.4493765830993652, + "learning_rate": 4.539022937146262e-06, + "loss": 0.6908, + "num_input_tokens_seen": 708544, + "step": 1220 + }, + { + "epoch": 0.1824545725350015, + "grad_norm": 3.588956117630005, + "learning_rate": 4.557640750670242e-06, + "loss": 0.7162, + "num_input_tokens_seen": 711168, + "step": 1225 + }, + { + "epoch": 0.18319928507596067, + "grad_norm": 5.066405296325684, + "learning_rate": 4.576258564194221e-06, + "loss": 0.7855, + "num_input_tokens_seen": 714176, + "step": 1230 + }, + { + "epoch": 0.18394399761691987, + "grad_norm": 11.439751625061035, + "learning_rate": 4.594876377718201e-06, + "loss": 0.6927, + "num_input_tokens_seen": 716800, + "step": 1235 + }, + { + "epoch": 0.18468871015787905, + "grad_norm": 4.342942714691162, + "learning_rate": 4.613494191242181e-06, + "loss": 0.6863, + "num_input_tokens_seen": 719712, + "step": 1240 + }, + { + "epoch": 0.18543342269883825, + "grad_norm": 8.949456214904785, + "learning_rate": 4.632112004766161e-06, + "loss": 0.6874, + "num_input_tokens_seen": 722720, + "step": 1245 + }, + { + "epoch": 0.18617813523979743, + "grad_norm": 6.367798805236816, + "learning_rate": 4.65072981829014e-06, + "loss": 0.8331, + "num_input_tokens_seen": 725248, + "step": 1250 + }, + { + "epoch": 0.18692284778075663, + "grad_norm": 4.613927364349365, + "learning_rate": 4.66934763181412e-06, + "loss": 0.6915, + "num_input_tokens_seen": 728160, + "step": 1255 + }, + { + "epoch": 0.1876675603217158, + "grad_norm": 3.703494071960449, + "learning_rate": 4.6879654453381e-06, + "loss": 0.7238, + "num_input_tokens_seen": 731008, + "step": 1260 + }, + { + "epoch": 0.188412272862675, + "grad_norm": 3.6976912021636963, + "learning_rate": 4.70658325886208e-06, + "loss": 0.6573, + "num_input_tokens_seen": 733824, + "step": 1265 + }, + { + "epoch": 0.1891569854036342, + "grad_norm": 3.7201738357543945, + "learning_rate": 4.725201072386059e-06, + "loss": 0.6318, + "num_input_tokens_seen": 736640, + "step": 1270 + }, + { + "epoch": 0.18990169794459338, + "grad_norm": 5.148128509521484, + "learning_rate": 4.743818885910039e-06, + "loss": 0.6707, + "num_input_tokens_seen": 739488, + "step": 1275 + }, + { + "epoch": 0.1906464104855526, + "grad_norm": 5.609634876251221, + "learning_rate": 4.7624366994340194e-06, + "loss": 0.7405, + "num_input_tokens_seen": 742176, + "step": 1280 + }, + { + "epoch": 0.19139112302651176, + "grad_norm": 4.227792263031006, + "learning_rate": 4.7810545129579985e-06, + "loss": 0.5949, + "num_input_tokens_seen": 745184, + "step": 1285 + }, + { + "epoch": 0.19213583556747096, + "grad_norm": 5.110191345214844, + "learning_rate": 4.7996723264819784e-06, + "loss": 0.6681, + "num_input_tokens_seen": 748224, + "step": 1290 + }, + { + "epoch": 0.19288054810843014, + "grad_norm": 3.7108113765716553, + "learning_rate": 4.8182901400059575e-06, + "loss": 0.7054, + "num_input_tokens_seen": 750944, + "step": 1295 + }, + { + "epoch": 0.19362526064938934, + "grad_norm": 5.104458808898926, + "learning_rate": 4.836907953529938e-06, + "loss": 0.6661, + "num_input_tokens_seen": 753888, + "step": 1300 + }, + { + "epoch": 0.19436997319034852, + "grad_norm": 4.927037239074707, + "learning_rate": 4.855525767053917e-06, + "loss": 0.7805, + "num_input_tokens_seen": 756704, + "step": 1305 + }, + { + "epoch": 0.19511468573130772, + "grad_norm": 3.8360435962677, + "learning_rate": 4.874143580577897e-06, + "loss": 0.6147, + "num_input_tokens_seen": 759904, + "step": 1310 + }, + { + "epoch": 0.1958593982722669, + "grad_norm": 6.115900039672852, + "learning_rate": 4.892761394101876e-06, + "loss": 0.6913, + "num_input_tokens_seen": 762848, + "step": 1315 + }, + { + "epoch": 0.1966041108132261, + "grad_norm": 5.15281867980957, + "learning_rate": 4.911379207625857e-06, + "loss": 0.6274, + "num_input_tokens_seen": 765536, + "step": 1320 + }, + { + "epoch": 0.19734882335418527, + "grad_norm": 11.886588096618652, + "learning_rate": 4.929997021149836e-06, + "loss": 0.6771, + "num_input_tokens_seen": 768384, + "step": 1325 + }, + { + "epoch": 0.19809353589514447, + "grad_norm": 5.41392707824707, + "learning_rate": 4.948614834673816e-06, + "loss": 0.5998, + "num_input_tokens_seen": 771552, + "step": 1330 + }, + { + "epoch": 0.19883824843610368, + "grad_norm": 3.647545099258423, + "learning_rate": 4.967232648197795e-06, + "loss": 0.7178, + "num_input_tokens_seen": 774336, + "step": 1335 + }, + { + "epoch": 0.19958296097706285, + "grad_norm": 5.326066970825195, + "learning_rate": 4.985850461721776e-06, + "loss": 0.7107, + "num_input_tokens_seen": 777312, + "step": 1340 + }, + { + "epoch": 0.20032767351802205, + "grad_norm": 5.737568378448486, + "learning_rate": 5.004468275245756e-06, + "loss": 0.9109, + "num_input_tokens_seen": 780032, + "step": 1345 + }, + { + "epoch": 0.20107238605898123, + "grad_norm": 3.928309679031372, + "learning_rate": 5.023086088769735e-06, + "loss": 0.5458, + "num_input_tokens_seen": 782720, + "step": 1350 + }, + { + "epoch": 0.20181709859994043, + "grad_norm": 7.908411979675293, + "learning_rate": 5.041703902293715e-06, + "loss": 0.7233, + "num_input_tokens_seen": 785760, + "step": 1355 + }, + { + "epoch": 0.2025618111408996, + "grad_norm": 5.000067234039307, + "learning_rate": 5.060321715817695e-06, + "loss": 0.6715, + "num_input_tokens_seen": 788608, + "step": 1360 + }, + { + "epoch": 0.2033065236818588, + "grad_norm": 5.49289083480835, + "learning_rate": 5.078939529341675e-06, + "loss": 0.7934, + "num_input_tokens_seen": 791872, + "step": 1365 + }, + { + "epoch": 0.20405123622281798, + "grad_norm": 6.965632915496826, + "learning_rate": 5.097557342865654e-06, + "loss": 0.6949, + "num_input_tokens_seen": 795136, + "step": 1370 + }, + { + "epoch": 0.20479594876377719, + "grad_norm": 5.999640941619873, + "learning_rate": 5.116175156389634e-06, + "loss": 0.7604, + "num_input_tokens_seen": 798048, + "step": 1375 + }, + { + "epoch": 0.20554066130473636, + "grad_norm": 4.9657158851623535, + "learning_rate": 5.134792969913614e-06, + "loss": 0.6331, + "num_input_tokens_seen": 800576, + "step": 1380 + }, + { + "epoch": 0.20628537384569556, + "grad_norm": 4.690451622009277, + "learning_rate": 5.153410783437594e-06, + "loss": 0.7416, + "num_input_tokens_seen": 803616, + "step": 1385 + }, + { + "epoch": 0.20703008638665474, + "grad_norm": 5.231090068817139, + "learning_rate": 5.172028596961573e-06, + "loss": 0.9036, + "num_input_tokens_seen": 806304, + "step": 1390 + }, + { + "epoch": 0.20777479892761394, + "grad_norm": 7.394189357757568, + "learning_rate": 5.190646410485553e-06, + "loss": 0.8134, + "num_input_tokens_seen": 808992, + "step": 1395 + }, + { + "epoch": 0.20851951146857314, + "grad_norm": 4.099926948547363, + "learning_rate": 5.209264224009533e-06, + "loss": 0.67, + "num_input_tokens_seen": 811904, + "step": 1400 + }, + { + "epoch": 0.20926422400953232, + "grad_norm": 5.892171382904053, + "learning_rate": 5.2278820375335125e-06, + "loss": 0.7022, + "num_input_tokens_seen": 815072, + "step": 1405 + }, + { + "epoch": 0.21000893655049152, + "grad_norm": 11.202237129211426, + "learning_rate": 5.2464998510574924e-06, + "loss": 0.8943, + "num_input_tokens_seen": 818016, + "step": 1410 + }, + { + "epoch": 0.2107536490914507, + "grad_norm": 3.2510712146759033, + "learning_rate": 5.2651176645814715e-06, + "loss": 0.6466, + "num_input_tokens_seen": 820896, + "step": 1415 + }, + { + "epoch": 0.2114983616324099, + "grad_norm": 4.517914772033691, + "learning_rate": 5.2837354781054514e-06, + "loss": 0.6707, + "num_input_tokens_seen": 823520, + "step": 1420 + }, + { + "epoch": 0.21224307417336907, + "grad_norm": 3.585829734802246, + "learning_rate": 5.302353291629431e-06, + "loss": 0.7403, + "num_input_tokens_seen": 826144, + "step": 1425 + }, + { + "epoch": 0.21298778671432828, + "grad_norm": 3.9880173206329346, + "learning_rate": 5.320971105153411e-06, + "loss": 0.6748, + "num_input_tokens_seen": 828768, + "step": 1430 + }, + { + "epoch": 0.21373249925528745, + "grad_norm": 2.8282361030578613, + "learning_rate": 5.33958891867739e-06, + "loss": 0.761, + "num_input_tokens_seen": 831936, + "step": 1435 + }, + { + "epoch": 0.21447721179624665, + "grad_norm": 4.896012306213379, + "learning_rate": 5.35820673220137e-06, + "loss": 0.6772, + "num_input_tokens_seen": 835008, + "step": 1440 + }, + { + "epoch": 0.21522192433720583, + "grad_norm": 5.501242637634277, + "learning_rate": 5.37682454572535e-06, + "loss": 0.7713, + "num_input_tokens_seen": 837600, + "step": 1445 + }, + { + "epoch": 0.21596663687816503, + "grad_norm": 4.245748519897461, + "learning_rate": 5.39544235924933e-06, + "loss": 0.7443, + "num_input_tokens_seen": 840480, + "step": 1450 + }, + { + "epoch": 0.2167113494191242, + "grad_norm": 6.252084732055664, + "learning_rate": 5.414060172773309e-06, + "loss": 0.6932, + "num_input_tokens_seen": 843424, + "step": 1455 + }, + { + "epoch": 0.2174560619600834, + "grad_norm": 3.2149879932403564, + "learning_rate": 5.43267798629729e-06, + "loss": 0.6279, + "num_input_tokens_seen": 846272, + "step": 1460 + }, + { + "epoch": 0.2182007745010426, + "grad_norm": 5.627643585205078, + "learning_rate": 5.451295799821269e-06, + "loss": 0.7262, + "num_input_tokens_seen": 849216, + "step": 1465 + }, + { + "epoch": 0.21894548704200179, + "grad_norm": 4.683766841888428, + "learning_rate": 5.469913613345249e-06, + "loss": 0.8108, + "num_input_tokens_seen": 852032, + "step": 1470 + }, + { + "epoch": 0.219690199582961, + "grad_norm": 3.9969027042388916, + "learning_rate": 5.488531426869229e-06, + "loss": 0.6887, + "num_input_tokens_seen": 855072, + "step": 1475 + }, + { + "epoch": 0.22043491212392016, + "grad_norm": 5.060018539428711, + "learning_rate": 5.507149240393209e-06, + "loss": 0.7446, + "num_input_tokens_seen": 857760, + "step": 1480 + }, + { + "epoch": 0.22117962466487937, + "grad_norm": 4.666828155517578, + "learning_rate": 5.525767053917189e-06, + "loss": 0.8068, + "num_input_tokens_seen": 860928, + "step": 1485 + }, + { + "epoch": 0.22192433720583854, + "grad_norm": 5.8750319480896, + "learning_rate": 5.544384867441168e-06, + "loss": 0.7454, + "num_input_tokens_seen": 864032, + "step": 1490 + }, + { + "epoch": 0.22266904974679774, + "grad_norm": 4.438586711883545, + "learning_rate": 5.563002680965148e-06, + "loss": 0.9631, + "num_input_tokens_seen": 867040, + "step": 1495 + }, + { + "epoch": 0.22341376228775692, + "grad_norm": 3.958667755126953, + "learning_rate": 5.581620494489128e-06, + "loss": 0.7065, + "num_input_tokens_seen": 870048, + "step": 1500 + }, + { + "epoch": 0.22415847482871612, + "grad_norm": 8.571981430053711, + "learning_rate": 5.600238308013108e-06, + "loss": 0.7803, + "num_input_tokens_seen": 873632, + "step": 1505 + }, + { + "epoch": 0.2249031873696753, + "grad_norm": 3.4489738941192627, + "learning_rate": 5.618856121537087e-06, + "loss": 0.7369, + "num_input_tokens_seen": 876608, + "step": 1510 + }, + { + "epoch": 0.2256478999106345, + "grad_norm": 3.5389719009399414, + "learning_rate": 5.637473935061067e-06, + "loss": 0.7941, + "num_input_tokens_seen": 879424, + "step": 1515 + }, + { + "epoch": 0.22639261245159367, + "grad_norm": 2.5531117916107178, + "learning_rate": 5.656091748585047e-06, + "loss": 0.6208, + "num_input_tokens_seen": 882496, + "step": 1520 + }, + { + "epoch": 0.22713732499255287, + "grad_norm": 2.8457448482513428, + "learning_rate": 5.6747095621090265e-06, + "loss": 0.7128, + "num_input_tokens_seen": 885632, + "step": 1525 + }, + { + "epoch": 0.22788203753351208, + "grad_norm": 3.59098482131958, + "learning_rate": 5.693327375633006e-06, + "loss": 0.5632, + "num_input_tokens_seen": 888480, + "step": 1530 + }, + { + "epoch": 0.22862675007447125, + "grad_norm": 5.174984931945801, + "learning_rate": 5.7119451891569855e-06, + "loss": 0.6303, + "num_input_tokens_seen": 891392, + "step": 1535 + }, + { + "epoch": 0.22937146261543045, + "grad_norm": 4.89317512512207, + "learning_rate": 5.7305630026809654e-06, + "loss": 0.5718, + "num_input_tokens_seen": 894336, + "step": 1540 + }, + { + "epoch": 0.23011617515638963, + "grad_norm": 4.771053314208984, + "learning_rate": 5.749180816204945e-06, + "loss": 0.5616, + "num_input_tokens_seen": 897280, + "step": 1545 + }, + { + "epoch": 0.23086088769734883, + "grad_norm": 12.724658012390137, + "learning_rate": 5.767798629728925e-06, + "loss": 0.9463, + "num_input_tokens_seen": 900192, + "step": 1550 + }, + { + "epoch": 0.231605600238308, + "grad_norm": 4.074901580810547, + "learning_rate": 5.786416443252904e-06, + "loss": 0.6747, + "num_input_tokens_seen": 903232, + "step": 1555 + }, + { + "epoch": 0.2323503127792672, + "grad_norm": 6.157442569732666, + "learning_rate": 5.805034256776884e-06, + "loss": 0.6416, + "num_input_tokens_seen": 906080, + "step": 1560 + }, + { + "epoch": 0.23309502532022638, + "grad_norm": 4.066648483276367, + "learning_rate": 5.823652070300864e-06, + "loss": 0.6658, + "num_input_tokens_seen": 908896, + "step": 1565 + }, + { + "epoch": 0.2338397378611856, + "grad_norm": 5.681422710418701, + "learning_rate": 5.842269883824844e-06, + "loss": 0.7822, + "num_input_tokens_seen": 911840, + "step": 1570 + }, + { + "epoch": 0.23458445040214476, + "grad_norm": 6.5858635902404785, + "learning_rate": 5.860887697348823e-06, + "loss": 0.8105, + "num_input_tokens_seen": 914848, + "step": 1575 + }, + { + "epoch": 0.23532916294310396, + "grad_norm": 6.862659931182861, + "learning_rate": 5.879505510872803e-06, + "loss": 0.7656, + "num_input_tokens_seen": 917792, + "step": 1580 + }, + { + "epoch": 0.23607387548406314, + "grad_norm": 6.291182518005371, + "learning_rate": 5.898123324396783e-06, + "loss": 0.8031, + "num_input_tokens_seen": 920608, + "step": 1585 + }, + { + "epoch": 0.23681858802502234, + "grad_norm": 6.056919097900391, + "learning_rate": 5.916741137920763e-06, + "loss": 0.8282, + "num_input_tokens_seen": 923648, + "step": 1590 + }, + { + "epoch": 0.23756330056598154, + "grad_norm": 3.6727638244628906, + "learning_rate": 5.935358951444742e-06, + "loss": 0.6704, + "num_input_tokens_seen": 926528, + "step": 1595 + }, + { + "epoch": 0.23830801310694072, + "grad_norm": 4.744163513183594, + "learning_rate": 5.953976764968722e-06, + "loss": 0.6697, + "num_input_tokens_seen": 929184, + "step": 1600 + }, + { + "epoch": 0.23905272564789992, + "grad_norm": 8.802165031433105, + "learning_rate": 5.972594578492702e-06, + "loss": 0.7255, + "num_input_tokens_seen": 931872, + "step": 1605 + }, + { + "epoch": 0.2397974381888591, + "grad_norm": 6.779573440551758, + "learning_rate": 5.991212392016682e-06, + "loss": 0.6378, + "num_input_tokens_seen": 934688, + "step": 1610 + }, + { + "epoch": 0.2405421507298183, + "grad_norm": 5.733285427093506, + "learning_rate": 6.009830205540662e-06, + "loss": 0.607, + "num_input_tokens_seen": 938048, + "step": 1615 + }, + { + "epoch": 0.24128686327077747, + "grad_norm": 4.159451007843018, + "learning_rate": 6.028448019064641e-06, + "loss": 0.813, + "num_input_tokens_seen": 940864, + "step": 1620 + }, + { + "epoch": 0.24203157581173668, + "grad_norm": 3.446777582168579, + "learning_rate": 6.047065832588622e-06, + "loss": 0.7278, + "num_input_tokens_seen": 943808, + "step": 1625 + }, + { + "epoch": 0.24277628835269585, + "grad_norm": 7.94171667098999, + "learning_rate": 6.065683646112601e-06, + "loss": 0.581, + "num_input_tokens_seen": 946976, + "step": 1630 + }, + { + "epoch": 0.24352100089365505, + "grad_norm": 5.095869541168213, + "learning_rate": 6.084301459636581e-06, + "loss": 0.5808, + "num_input_tokens_seen": 949696, + "step": 1635 + }, + { + "epoch": 0.24426571343461423, + "grad_norm": 6.317439079284668, + "learning_rate": 6.10291927316056e-06, + "loss": 0.7803, + "num_input_tokens_seen": 952512, + "step": 1640 + }, + { + "epoch": 0.24501042597557343, + "grad_norm": 7.963269233703613, + "learning_rate": 6.1215370866845405e-06, + "loss": 0.7388, + "num_input_tokens_seen": 955200, + "step": 1645 + }, + { + "epoch": 0.2457551385165326, + "grad_norm": 6.172366142272949, + "learning_rate": 6.14015490020852e-06, + "loss": 0.5768, + "num_input_tokens_seen": 958048, + "step": 1650 + }, + { + "epoch": 0.2464998510574918, + "grad_norm": 4.201632022857666, + "learning_rate": 6.1587727137324995e-06, + "loss": 0.6266, + "num_input_tokens_seen": 961152, + "step": 1655 + }, + { + "epoch": 0.247244563598451, + "grad_norm": 6.349620342254639, + "learning_rate": 6.1773905272564794e-06, + "loss": 0.6659, + "num_input_tokens_seen": 964000, + "step": 1660 + }, + { + "epoch": 0.2479892761394102, + "grad_norm": 8.090203285217285, + "learning_rate": 6.196008340780459e-06, + "loss": 0.8112, + "num_input_tokens_seen": 966656, + "step": 1665 + }, + { + "epoch": 0.2487339886803694, + "grad_norm": 5.65048885345459, + "learning_rate": 6.2146261543044384e-06, + "loss": 0.8024, + "num_input_tokens_seen": 969760, + "step": 1670 + }, + { + "epoch": 0.24947870122132856, + "grad_norm": 5.5839056968688965, + "learning_rate": 6.233243967828418e-06, + "loss": 0.8954, + "num_input_tokens_seen": 972672, + "step": 1675 + }, + { + "epoch": 0.25022341376228774, + "grad_norm": 9.171768188476562, + "learning_rate": 6.251861781352398e-06, + "loss": 0.7499, + "num_input_tokens_seen": 975552, + "step": 1680 + }, + { + "epoch": 0.25096812630324694, + "grad_norm": 4.676169395446777, + "learning_rate": 6.270479594876379e-06, + "loss": 0.509, + "num_input_tokens_seen": 978304, + "step": 1685 + }, + { + "epoch": 0.25171283884420614, + "grad_norm": 4.775261402130127, + "learning_rate": 6.289097408400357e-06, + "loss": 0.608, + "num_input_tokens_seen": 981088, + "step": 1690 + }, + { + "epoch": 0.25245755138516535, + "grad_norm": 4.5279059410095215, + "learning_rate": 6.307715221924337e-06, + "loss": 0.7005, + "num_input_tokens_seen": 983712, + "step": 1695 + }, + { + "epoch": 0.2532022639261245, + "grad_norm": 4.397166728973389, + "learning_rate": 6.326333035448317e-06, + "loss": 0.8175, + "num_input_tokens_seen": 986784, + "step": 1700 + }, + { + "epoch": 0.2539469764670837, + "grad_norm": 8.163846969604492, + "learning_rate": 6.344950848972298e-06, + "loss": 0.728, + "num_input_tokens_seen": 989984, + "step": 1705 + }, + { + "epoch": 0.2546916890080429, + "grad_norm": 8.56224536895752, + "learning_rate": 6.363568662496276e-06, + "loss": 0.643, + "num_input_tokens_seen": 992896, + "step": 1710 + }, + { + "epoch": 0.2554364015490021, + "grad_norm": 12.519129753112793, + "learning_rate": 6.382186476020256e-06, + "loss": 0.787, + "num_input_tokens_seen": 995744, + "step": 1715 + }, + { + "epoch": 0.25618111408996125, + "grad_norm": 6.391804218292236, + "learning_rate": 6.400804289544236e-06, + "loss": 0.7367, + "num_input_tokens_seen": 998752, + "step": 1720 + }, + { + "epoch": 0.25692582663092045, + "grad_norm": 4.203317165374756, + "learning_rate": 6.419422103068217e-06, + "loss": 0.8213, + "num_input_tokens_seen": 1001792, + "step": 1725 + }, + { + "epoch": 0.25767053917187965, + "grad_norm": 6.472588062286377, + "learning_rate": 6.438039916592195e-06, + "loss": 0.7455, + "num_input_tokens_seen": 1004608, + "step": 1730 + }, + { + "epoch": 0.25841525171283886, + "grad_norm": 4.3876190185546875, + "learning_rate": 6.456657730116175e-06, + "loss": 0.6873, + "num_input_tokens_seen": 1007232, + "step": 1735 + }, + { + "epoch": 0.25915996425379806, + "grad_norm": 5.638927459716797, + "learning_rate": 6.475275543640155e-06, + "loss": 0.7692, + "num_input_tokens_seen": 1010272, + "step": 1740 + }, + { + "epoch": 0.2599046767947572, + "grad_norm": 4.0532450675964355, + "learning_rate": 6.493893357164136e-06, + "loss": 0.7252, + "num_input_tokens_seen": 1013024, + "step": 1745 + }, + { + "epoch": 0.2606493893357164, + "grad_norm": 5.079476356506348, + "learning_rate": 6.5125111706881156e-06, + "loss": 0.5701, + "num_input_tokens_seen": 1016256, + "step": 1750 + }, + { + "epoch": 0.2613941018766756, + "grad_norm": 5.046901702880859, + "learning_rate": 6.531128984212094e-06, + "loss": 0.6309, + "num_input_tokens_seen": 1019200, + "step": 1755 + }, + { + "epoch": 0.2621388144176348, + "grad_norm": 4.607880115509033, + "learning_rate": 6.549746797736074e-06, + "loss": 0.812, + "num_input_tokens_seen": 1021888, + "step": 1760 + }, + { + "epoch": 0.26288352695859396, + "grad_norm": 6.457152843475342, + "learning_rate": 6.5683646112600545e-06, + "loss": 0.8029, + "num_input_tokens_seen": 1024864, + "step": 1765 + }, + { + "epoch": 0.26362823949955316, + "grad_norm": 4.704416275024414, + "learning_rate": 6.5869824247840344e-06, + "loss": 0.5549, + "num_input_tokens_seen": 1027648, + "step": 1770 + }, + { + "epoch": 0.26437295204051237, + "grad_norm": 5.704776763916016, + "learning_rate": 6.605600238308013e-06, + "loss": 0.8032, + "num_input_tokens_seen": 1030496, + "step": 1775 + }, + { + "epoch": 0.26511766458147157, + "grad_norm": 3.6085000038146973, + "learning_rate": 6.624218051831993e-06, + "loss": 0.5858, + "num_input_tokens_seen": 1033728, + "step": 1780 + }, + { + "epoch": 0.2658623771224307, + "grad_norm": 12.632158279418945, + "learning_rate": 6.642835865355973e-06, + "loss": 0.8017, + "num_input_tokens_seen": 1036672, + "step": 1785 + }, + { + "epoch": 0.2666070896633899, + "grad_norm": 4.298497200012207, + "learning_rate": 6.661453678879953e-06, + "loss": 0.7291, + "num_input_tokens_seen": 1039808, + "step": 1790 + }, + { + "epoch": 0.2673518022043491, + "grad_norm": 7.823922157287598, + "learning_rate": 6.6800714924039315e-06, + "loss": 0.6712, + "num_input_tokens_seen": 1042720, + "step": 1795 + }, + { + "epoch": 0.2680965147453083, + "grad_norm": 5.826444149017334, + "learning_rate": 6.6986893059279114e-06, + "loss": 0.6852, + "num_input_tokens_seen": 1045536, + "step": 1800 + }, + { + "epoch": 0.2688412272862675, + "grad_norm": 6.1747541427612305, + "learning_rate": 6.717307119451892e-06, + "loss": 0.7103, + "num_input_tokens_seen": 1048384, + "step": 1805 + }, + { + "epoch": 0.2695859398272267, + "grad_norm": 8.024418830871582, + "learning_rate": 6.735924932975872e-06, + "loss": 0.8036, + "num_input_tokens_seen": 1051168, + "step": 1810 + }, + { + "epoch": 0.2703306523681859, + "grad_norm": 4.957547664642334, + "learning_rate": 6.754542746499852e-06, + "loss": 0.6791, + "num_input_tokens_seen": 1053952, + "step": 1815 + }, + { + "epoch": 0.2710753649091451, + "grad_norm": 4.829057693481445, + "learning_rate": 6.77316056002383e-06, + "loss": 0.6089, + "num_input_tokens_seen": 1057056, + "step": 1820 + }, + { + "epoch": 0.2718200774501043, + "grad_norm": 4.695064544677734, + "learning_rate": 6.791778373547811e-06, + "loss": 0.6824, + "num_input_tokens_seen": 1059968, + "step": 1825 + }, + { + "epoch": 0.2725647899910634, + "grad_norm": 7.652475833892822, + "learning_rate": 6.810396187071791e-06, + "loss": 0.7679, + "num_input_tokens_seen": 1063008, + "step": 1830 + }, + { + "epoch": 0.27330950253202263, + "grad_norm": 5.2379865646362305, + "learning_rate": 6.829014000595771e-06, + "loss": 0.6196, + "num_input_tokens_seen": 1065728, + "step": 1835 + }, + { + "epoch": 0.27405421507298183, + "grad_norm": 6.5681538581848145, + "learning_rate": 6.847631814119749e-06, + "loss": 0.6019, + "num_input_tokens_seen": 1068608, + "step": 1840 + }, + { + "epoch": 0.27479892761394104, + "grad_norm": 5.8657379150390625, + "learning_rate": 6.86624962764373e-06, + "loss": 0.5993, + "num_input_tokens_seen": 1071552, + "step": 1845 + }, + { + "epoch": 0.2755436401549002, + "grad_norm": 5.305788516998291, + "learning_rate": 6.88486744116771e-06, + "loss": 0.6483, + "num_input_tokens_seen": 1074400, + "step": 1850 + }, + { + "epoch": 0.2762883526958594, + "grad_norm": 7.883950233459473, + "learning_rate": 6.90348525469169e-06, + "loss": 0.5849, + "num_input_tokens_seen": 1077440, + "step": 1855 + }, + { + "epoch": 0.2770330652368186, + "grad_norm": 10.972616195678711, + "learning_rate": 6.922103068215669e-06, + "loss": 0.7938, + "num_input_tokens_seen": 1080576, + "step": 1860 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 7.974542617797852, + "learning_rate": 6.940720881739649e-06, + "loss": 0.6256, + "num_input_tokens_seen": 1083616, + "step": 1865 + }, + { + "epoch": 0.278522490318737, + "grad_norm": 10.90691089630127, + "learning_rate": 6.959338695263629e-06, + "loss": 0.8134, + "num_input_tokens_seen": 1086624, + "step": 1870 + }, + { + "epoch": 0.27926720285969614, + "grad_norm": 10.663524627685547, + "learning_rate": 6.977956508787609e-06, + "loss": 0.7763, + "num_input_tokens_seen": 1089632, + "step": 1875 + }, + { + "epoch": 0.28001191540065534, + "grad_norm": 6.502586364746094, + "learning_rate": 6.9965743223115886e-06, + "loss": 0.6827, + "num_input_tokens_seen": 1092512, + "step": 1880 + }, + { + "epoch": 0.28075662794161454, + "grad_norm": 7.859530925750732, + "learning_rate": 7.015192135835568e-06, + "loss": 0.6614, + "num_input_tokens_seen": 1095488, + "step": 1885 + }, + { + "epoch": 0.28150134048257375, + "grad_norm": 8.11622142791748, + "learning_rate": 7.033809949359548e-06, + "loss": 0.7045, + "num_input_tokens_seen": 1098176, + "step": 1890 + }, + { + "epoch": 0.2822460530235329, + "grad_norm": 5.949709415435791, + "learning_rate": 7.0524277628835275e-06, + "loss": 0.6022, + "num_input_tokens_seen": 1101312, + "step": 1895 + }, + { + "epoch": 0.2829907655644921, + "grad_norm": 8.090648651123047, + "learning_rate": 7.0710455764075074e-06, + "loss": 0.633, + "num_input_tokens_seen": 1104480, + "step": 1900 + }, + { + "epoch": 0.2837354781054513, + "grad_norm": 6.0801849365234375, + "learning_rate": 7.0896633899314865e-06, + "loss": 0.669, + "num_input_tokens_seen": 1107424, + "step": 1905 + }, + { + "epoch": 0.2844801906464105, + "grad_norm": 9.826885223388672, + "learning_rate": 7.1082812034554664e-06, + "loss": 0.6622, + "num_input_tokens_seen": 1110112, + "step": 1910 + }, + { + "epoch": 0.28522490318736965, + "grad_norm": 14.345170974731445, + "learning_rate": 7.126899016979446e-06, + "loss": 0.9668, + "num_input_tokens_seen": 1113120, + "step": 1915 + }, + { + "epoch": 0.28596961572832885, + "grad_norm": 7.2780070304870605, + "learning_rate": 7.145516830503426e-06, + "loss": 0.834, + "num_input_tokens_seen": 1115904, + "step": 1920 + }, + { + "epoch": 0.28671432826928805, + "grad_norm": 3.6503994464874268, + "learning_rate": 7.164134644027405e-06, + "loss": 0.5743, + "num_input_tokens_seen": 1118880, + "step": 1925 + }, + { + "epoch": 0.28745904081024726, + "grad_norm": 5.324588775634766, + "learning_rate": 7.182752457551385e-06, + "loss": 0.7336, + "num_input_tokens_seen": 1121952, + "step": 1930 + }, + { + "epoch": 0.28820375335120646, + "grad_norm": 5.276536464691162, + "learning_rate": 7.201370271075365e-06, + "loss": 0.5489, + "num_input_tokens_seen": 1124704, + "step": 1935 + }, + { + "epoch": 0.2889484658921656, + "grad_norm": 4.700926303863525, + "learning_rate": 7.219988084599345e-06, + "loss": 0.6982, + "num_input_tokens_seen": 1127456, + "step": 1940 + }, + { + "epoch": 0.2896931784331248, + "grad_norm": 10.147988319396973, + "learning_rate": 7.238605898123325e-06, + "loss": 0.7101, + "num_input_tokens_seen": 1130784, + "step": 1945 + }, + { + "epoch": 0.290437890974084, + "grad_norm": 6.884389877319336, + "learning_rate": 7.257223711647304e-06, + "loss": 0.6641, + "num_input_tokens_seen": 1133760, + "step": 1950 + }, + { + "epoch": 0.2911826035150432, + "grad_norm": 4.0454816818237305, + "learning_rate": 7.275841525171284e-06, + "loss": 0.7232, + "num_input_tokens_seen": 1136640, + "step": 1955 + }, + { + "epoch": 0.29192731605600236, + "grad_norm": 5.9688310623168945, + "learning_rate": 7.294459338695264e-06, + "loss": 0.6186, + "num_input_tokens_seen": 1139744, + "step": 1960 + }, + { + "epoch": 0.29267202859696156, + "grad_norm": 6.422449111938477, + "learning_rate": 7.313077152219244e-06, + "loss": 0.6067, + "num_input_tokens_seen": 1142688, + "step": 1965 + }, + { + "epoch": 0.29341674113792077, + "grad_norm": 5.452460289001465, + "learning_rate": 7.331694965743223e-06, + "loss": 0.7507, + "num_input_tokens_seen": 1145824, + "step": 1970 + }, + { + "epoch": 0.29416145367887997, + "grad_norm": 6.645854949951172, + "learning_rate": 7.350312779267203e-06, + "loss": 0.671, + "num_input_tokens_seen": 1148544, + "step": 1975 + }, + { + "epoch": 0.2949061662198391, + "grad_norm": 5.215625286102295, + "learning_rate": 7.368930592791183e-06, + "loss": 0.7425, + "num_input_tokens_seen": 1151328, + "step": 1980 + }, + { + "epoch": 0.2956508787607983, + "grad_norm": 4.9962239265441895, + "learning_rate": 7.387548406315163e-06, + "loss": 0.5964, + "num_input_tokens_seen": 1153952, + "step": 1985 + }, + { + "epoch": 0.2963955913017575, + "grad_norm": 4.321212291717529, + "learning_rate": 7.406166219839142e-06, + "loss": 0.6658, + "num_input_tokens_seen": 1156800, + "step": 1990 + }, + { + "epoch": 0.2971403038427167, + "grad_norm": 8.626856803894043, + "learning_rate": 7.424784033363122e-06, + "loss": 0.6409, + "num_input_tokens_seen": 1159808, + "step": 1995 + }, + { + "epoch": 0.2978850163836759, + "grad_norm": 6.70037317276001, + "learning_rate": 7.443401846887102e-06, + "loss": 0.6461, + "num_input_tokens_seen": 1162848, + "step": 2000 + }, + { + "epoch": 0.2986297289246351, + "grad_norm": 7.4241251945495605, + "learning_rate": 7.462019660411082e-06, + "loss": 0.7766, + "num_input_tokens_seen": 1166208, + "step": 2005 + }, + { + "epoch": 0.2993744414655943, + "grad_norm": 7.2298736572265625, + "learning_rate": 7.480637473935062e-06, + "loss": 0.9582, + "num_input_tokens_seen": 1168928, + "step": 2010 + }, + { + "epoch": 0.3001191540065535, + "grad_norm": 7.162229537963867, + "learning_rate": 7.499255287459041e-06, + "loss": 0.6962, + "num_input_tokens_seen": 1171712, + "step": 2015 + }, + { + "epoch": 0.3008638665475127, + "grad_norm": 4.625200271606445, + "learning_rate": 7.517873100983021e-06, + "loss": 0.5611, + "num_input_tokens_seen": 1174368, + "step": 2020 + }, + { + "epoch": 0.30160857908847183, + "grad_norm": 4.599929332733154, + "learning_rate": 7.5364909145070005e-06, + "loss": 0.5905, + "num_input_tokens_seen": 1177184, + "step": 2025 + }, + { + "epoch": 0.30235329162943103, + "grad_norm": 7.8717241287231445, + "learning_rate": 7.555108728030981e-06, + "loss": 0.7516, + "num_input_tokens_seen": 1179872, + "step": 2030 + }, + { + "epoch": 0.30309800417039023, + "grad_norm": 7.319043159484863, + "learning_rate": 7.5737265415549595e-06, + "loss": 0.8054, + "num_input_tokens_seen": 1182464, + "step": 2035 + }, + { + "epoch": 0.30384271671134944, + "grad_norm": 4.381505966186523, + "learning_rate": 7.5923443550789394e-06, + "loss": 0.8172, + "num_input_tokens_seen": 1185632, + "step": 2040 + }, + { + "epoch": 0.3045874292523086, + "grad_norm": 6.152384281158447, + "learning_rate": 7.610962168602919e-06, + "loss": 0.65, + "num_input_tokens_seen": 1188416, + "step": 2045 + }, + { + "epoch": 0.3053321417932678, + "grad_norm": 4.225789546966553, + "learning_rate": 7.6295799821269e-06, + "loss": 0.7271, + "num_input_tokens_seen": 1191648, + "step": 2050 + }, + { + "epoch": 0.306076854334227, + "grad_norm": 6.509397506713867, + "learning_rate": 7.64819779565088e-06, + "loss": 0.6954, + "num_input_tokens_seen": 1194432, + "step": 2055 + }, + { + "epoch": 0.3068215668751862, + "grad_norm": 4.845241069793701, + "learning_rate": 7.666815609174859e-06, + "loss": 0.663, + "num_input_tokens_seen": 1197248, + "step": 2060 + }, + { + "epoch": 0.3075662794161454, + "grad_norm": 4.863946914672852, + "learning_rate": 7.685433422698839e-06, + "loss": 0.7533, + "num_input_tokens_seen": 1200256, + "step": 2065 + }, + { + "epoch": 0.30831099195710454, + "grad_norm": 5.519295692443848, + "learning_rate": 7.704051236222819e-06, + "loss": 0.765, + "num_input_tokens_seen": 1203040, + "step": 2070 + }, + { + "epoch": 0.30905570449806374, + "grad_norm": 5.450803279876709, + "learning_rate": 7.722669049746799e-06, + "loss": 0.7001, + "num_input_tokens_seen": 1206080, + "step": 2075 + }, + { + "epoch": 0.30980041703902295, + "grad_norm": 11.859193801879883, + "learning_rate": 7.741286863270777e-06, + "loss": 0.6192, + "num_input_tokens_seen": 1208704, + "step": 2080 + }, + { + "epoch": 0.31054512957998215, + "grad_norm": 5.338890075683594, + "learning_rate": 7.759904676794757e-06, + "loss": 0.6325, + "num_input_tokens_seen": 1211424, + "step": 2085 + }, + { + "epoch": 0.3112898421209413, + "grad_norm": 4.981992721557617, + "learning_rate": 7.778522490318737e-06, + "loss": 0.5324, + "num_input_tokens_seen": 1214272, + "step": 2090 + }, + { + "epoch": 0.3120345546619005, + "grad_norm": 5.956925392150879, + "learning_rate": 7.797140303842717e-06, + "loss": 0.7774, + "num_input_tokens_seen": 1217280, + "step": 2095 + }, + { + "epoch": 0.3127792672028597, + "grad_norm": 5.703226566314697, + "learning_rate": 7.815758117366697e-06, + "loss": 0.6934, + "num_input_tokens_seen": 1220064, + "step": 2100 + }, + { + "epoch": 0.3135239797438189, + "grad_norm": 7.417707443237305, + "learning_rate": 7.834375930890677e-06, + "loss": 0.553, + "num_input_tokens_seen": 1222848, + "step": 2105 + }, + { + "epoch": 0.31426869228477805, + "grad_norm": 7.603117942810059, + "learning_rate": 7.852993744414657e-06, + "loss": 0.6961, + "num_input_tokens_seen": 1225760, + "step": 2110 + }, + { + "epoch": 0.31501340482573725, + "grad_norm": 5.83669376373291, + "learning_rate": 7.871611557938637e-06, + "loss": 0.7146, + "num_input_tokens_seen": 1228864, + "step": 2115 + }, + { + "epoch": 0.31575811736669646, + "grad_norm": 8.598518371582031, + "learning_rate": 7.890229371462615e-06, + "loss": 0.6969, + "num_input_tokens_seen": 1231712, + "step": 2120 + }, + { + "epoch": 0.31650282990765566, + "grad_norm": 6.249850273132324, + "learning_rate": 7.908847184986595e-06, + "loss": 0.615, + "num_input_tokens_seen": 1234720, + "step": 2125 + }, + { + "epoch": 0.31724754244861486, + "grad_norm": 4.321227550506592, + "learning_rate": 7.927464998510575e-06, + "loss": 0.6922, + "num_input_tokens_seen": 1237504, + "step": 2130 + }, + { + "epoch": 0.317992254989574, + "grad_norm": 3.974827766418457, + "learning_rate": 7.946082812034555e-06, + "loss": 0.7255, + "num_input_tokens_seen": 1240256, + "step": 2135 + }, + { + "epoch": 0.3187369675305332, + "grad_norm": 6.259387016296387, + "learning_rate": 7.964700625558536e-06, + "loss": 0.7943, + "num_input_tokens_seen": 1243264, + "step": 2140 + }, + { + "epoch": 0.3194816800714924, + "grad_norm": 4.505035400390625, + "learning_rate": 7.983318439082515e-06, + "loss": 0.7047, + "num_input_tokens_seen": 1246304, + "step": 2145 + }, + { + "epoch": 0.3202263926124516, + "grad_norm": 9.146924018859863, + "learning_rate": 8.001936252606494e-06, + "loss": 0.7241, + "num_input_tokens_seen": 1249184, + "step": 2150 + }, + { + "epoch": 0.32097110515341076, + "grad_norm": 5.914586544036865, + "learning_rate": 8.020554066130474e-06, + "loss": 0.7714, + "num_input_tokens_seen": 1252064, + "step": 2155 + }, + { + "epoch": 0.32171581769436997, + "grad_norm": 5.719528675079346, + "learning_rate": 8.039171879654454e-06, + "loss": 0.6725, + "num_input_tokens_seen": 1255200, + "step": 2160 + }, + { + "epoch": 0.32246053023532917, + "grad_norm": 4.729129791259766, + "learning_rate": 8.057789693178433e-06, + "loss": 0.6715, + "num_input_tokens_seen": 1258048, + "step": 2165 + }, + { + "epoch": 0.32320524277628837, + "grad_norm": 3.7402286529541016, + "learning_rate": 8.076407506702412e-06, + "loss": 0.6373, + "num_input_tokens_seen": 1261024, + "step": 2170 + }, + { + "epoch": 0.3239499553172475, + "grad_norm": 4.7270636558532715, + "learning_rate": 8.095025320226392e-06, + "loss": 0.6775, + "num_input_tokens_seen": 1263872, + "step": 2175 + }, + { + "epoch": 0.3246946678582067, + "grad_norm": 8.159414291381836, + "learning_rate": 8.113643133750374e-06, + "loss": 0.6783, + "num_input_tokens_seen": 1266912, + "step": 2180 + }, + { + "epoch": 0.3254393803991659, + "grad_norm": 11.339363098144531, + "learning_rate": 8.132260947274352e-06, + "loss": 0.6958, + "num_input_tokens_seen": 1269728, + "step": 2185 + }, + { + "epoch": 0.3261840929401251, + "grad_norm": 7.7016754150390625, + "learning_rate": 8.150878760798332e-06, + "loss": 0.6768, + "num_input_tokens_seen": 1272416, + "step": 2190 + }, + { + "epoch": 0.32692880548108433, + "grad_norm": 8.528070449829102, + "learning_rate": 8.169496574322312e-06, + "loss": 0.7732, + "num_input_tokens_seen": 1275552, + "step": 2195 + }, + { + "epoch": 0.3276735180220435, + "grad_norm": 6.768664360046387, + "learning_rate": 8.188114387846292e-06, + "loss": 0.6619, + "num_input_tokens_seen": 1278752, + "step": 2200 + }, + { + "epoch": 0.3284182305630027, + "grad_norm": 8.647138595581055, + "learning_rate": 8.206732201370272e-06, + "loss": 0.8571, + "num_input_tokens_seen": 1281728, + "step": 2205 + }, + { + "epoch": 0.3291629431039619, + "grad_norm": 8.11633586883545, + "learning_rate": 8.22535001489425e-06, + "loss": 0.6345, + "num_input_tokens_seen": 1284608, + "step": 2210 + }, + { + "epoch": 0.3299076556449211, + "grad_norm": 12.155244827270508, + "learning_rate": 8.24396782841823e-06, + "loss": 0.6281, + "num_input_tokens_seen": 1287648, + "step": 2215 + }, + { + "epoch": 0.33065236818588023, + "grad_norm": 6.9894890785217285, + "learning_rate": 8.262585641942212e-06, + "loss": 0.6718, + "num_input_tokens_seen": 1290496, + "step": 2220 + }, + { + "epoch": 0.33139708072683943, + "grad_norm": 4.895290374755859, + "learning_rate": 8.281203455466192e-06, + "loss": 0.4959, + "num_input_tokens_seen": 1293344, + "step": 2225 + }, + { + "epoch": 0.33214179326779864, + "grad_norm": 6.184462070465088, + "learning_rate": 8.29982126899017e-06, + "loss": 0.6333, + "num_input_tokens_seen": 1296192, + "step": 2230 + }, + { + "epoch": 0.33288650580875784, + "grad_norm": 10.489750862121582, + "learning_rate": 8.31843908251415e-06, + "loss": 0.7225, + "num_input_tokens_seen": 1298848, + "step": 2235 + }, + { + "epoch": 0.333631218349717, + "grad_norm": 9.461226463317871, + "learning_rate": 8.33705689603813e-06, + "loss": 0.8125, + "num_input_tokens_seen": 1301536, + "step": 2240 + }, + { + "epoch": 0.3343759308906762, + "grad_norm": 6.777676105499268, + "learning_rate": 8.35567470956211e-06, + "loss": 0.7063, + "num_input_tokens_seen": 1304448, + "step": 2245 + }, + { + "epoch": 0.3351206434316354, + "grad_norm": 6.649848937988281, + "learning_rate": 8.374292523086088e-06, + "loss": 0.6993, + "num_input_tokens_seen": 1307328, + "step": 2250 + }, + { + "epoch": 0.3358653559725946, + "grad_norm": 4.111466407775879, + "learning_rate": 8.392910336610068e-06, + "loss": 0.6156, + "num_input_tokens_seen": 1310368, + "step": 2255 + }, + { + "epoch": 0.3366100685135538, + "grad_norm": 5.087409019470215, + "learning_rate": 8.41152815013405e-06, + "loss": 0.7141, + "num_input_tokens_seen": 1313216, + "step": 2260 + }, + { + "epoch": 0.33735478105451294, + "grad_norm": 7.032345294952393, + "learning_rate": 8.43014596365803e-06, + "loss": 0.6993, + "num_input_tokens_seen": 1316256, + "step": 2265 + }, + { + "epoch": 0.33809949359547214, + "grad_norm": 8.278244972229004, + "learning_rate": 8.44876377718201e-06, + "loss": 0.7551, + "num_input_tokens_seen": 1318976, + "step": 2270 + }, + { + "epoch": 0.33884420613643135, + "grad_norm": 6.356421947479248, + "learning_rate": 8.467381590705988e-06, + "loss": 0.8564, + "num_input_tokens_seen": 1321632, + "step": 2275 + }, + { + "epoch": 0.33958891867739055, + "grad_norm": 3.836160898208618, + "learning_rate": 8.485999404229967e-06, + "loss": 0.6687, + "num_input_tokens_seen": 1324256, + "step": 2280 + }, + { + "epoch": 0.3403336312183497, + "grad_norm": 3.504909038543701, + "learning_rate": 8.504617217753947e-06, + "loss": 0.6858, + "num_input_tokens_seen": 1327008, + "step": 2285 + }, + { + "epoch": 0.3410783437593089, + "grad_norm": 7.494557857513428, + "learning_rate": 8.523235031277927e-06, + "loss": 0.7577, + "num_input_tokens_seen": 1330016, + "step": 2290 + }, + { + "epoch": 0.3418230563002681, + "grad_norm": 9.845422744750977, + "learning_rate": 8.541852844801907e-06, + "loss": 0.7116, + "num_input_tokens_seen": 1333024, + "step": 2295 + }, + { + "epoch": 0.3425677688412273, + "grad_norm": 11.47579288482666, + "learning_rate": 8.560470658325887e-06, + "loss": 0.6528, + "num_input_tokens_seen": 1335872, + "step": 2300 + }, + { + "epoch": 0.34331248138218645, + "grad_norm": 9.35091781616211, + "learning_rate": 8.579088471849867e-06, + "loss": 0.7516, + "num_input_tokens_seen": 1338560, + "step": 2305 + }, + { + "epoch": 0.34405719392314565, + "grad_norm": 4.11307954788208, + "learning_rate": 8.597706285373847e-06, + "loss": 0.6852, + "num_input_tokens_seen": 1341248, + "step": 2310 + }, + { + "epoch": 0.34480190646410486, + "grad_norm": 7.17581844329834, + "learning_rate": 8.616324098897825e-06, + "loss": 0.6583, + "num_input_tokens_seen": 1343936, + "step": 2315 + }, + { + "epoch": 0.34554661900506406, + "grad_norm": 6.491641044616699, + "learning_rate": 8.634941912421805e-06, + "loss": 0.6455, + "num_input_tokens_seen": 1346880, + "step": 2320 + }, + { + "epoch": 0.34629133154602326, + "grad_norm": 6.8474860191345215, + "learning_rate": 8.653559725945785e-06, + "loss": 0.6353, + "num_input_tokens_seen": 1349824, + "step": 2325 + }, + { + "epoch": 0.3470360440869824, + "grad_norm": 7.671634197235107, + "learning_rate": 8.672177539469765e-06, + "loss": 0.6618, + "num_input_tokens_seen": 1352576, + "step": 2330 + }, + { + "epoch": 0.3477807566279416, + "grad_norm": 8.535658836364746, + "learning_rate": 8.690795352993745e-06, + "loss": 0.6884, + "num_input_tokens_seen": 1355872, + "step": 2335 + }, + { + "epoch": 0.3485254691689008, + "grad_norm": 5.505020618438721, + "learning_rate": 8.709413166517725e-06, + "loss": 0.6204, + "num_input_tokens_seen": 1358816, + "step": 2340 + }, + { + "epoch": 0.34927018170986, + "grad_norm": 5.211563587188721, + "learning_rate": 8.728030980041705e-06, + "loss": 0.6557, + "num_input_tokens_seen": 1362144, + "step": 2345 + }, + { + "epoch": 0.35001489425081916, + "grad_norm": 6.278181076049805, + "learning_rate": 8.746648793565685e-06, + "loss": 0.5517, + "num_input_tokens_seen": 1365120, + "step": 2350 + }, + { + "epoch": 0.35075960679177837, + "grad_norm": 7.078400135040283, + "learning_rate": 8.765266607089665e-06, + "loss": 0.8255, + "num_input_tokens_seen": 1368192, + "step": 2355 + }, + { + "epoch": 0.35150431933273757, + "grad_norm": 7.570345878601074, + "learning_rate": 8.783884420613643e-06, + "loss": 0.7346, + "num_input_tokens_seen": 1371200, + "step": 2360 + }, + { + "epoch": 0.35224903187369677, + "grad_norm": 3.9990358352661133, + "learning_rate": 8.802502234137623e-06, + "loss": 0.6487, + "num_input_tokens_seen": 1374112, + "step": 2365 + }, + { + "epoch": 0.3529937444146559, + "grad_norm": 9.725452423095703, + "learning_rate": 8.821120047661603e-06, + "loss": 0.8774, + "num_input_tokens_seen": 1376640, + "step": 2370 + }, + { + "epoch": 0.3537384569556151, + "grad_norm": 8.726845741271973, + "learning_rate": 8.839737861185583e-06, + "loss": 0.7953, + "num_input_tokens_seen": 1379616, + "step": 2375 + }, + { + "epoch": 0.3544831694965743, + "grad_norm": 16.12211799621582, + "learning_rate": 8.858355674709563e-06, + "loss": 0.941, + "num_input_tokens_seen": 1382304, + "step": 2380 + }, + { + "epoch": 0.3552278820375335, + "grad_norm": 4.7066545486450195, + "learning_rate": 8.876973488233543e-06, + "loss": 0.6749, + "num_input_tokens_seen": 1385184, + "step": 2385 + }, + { + "epoch": 0.35597259457849273, + "grad_norm": 9.636001586914062, + "learning_rate": 8.895591301757522e-06, + "loss": 0.6785, + "num_input_tokens_seen": 1388224, + "step": 2390 + }, + { + "epoch": 0.3567173071194519, + "grad_norm": 5.943131923675537, + "learning_rate": 8.914209115281502e-06, + "loss": 0.6693, + "num_input_tokens_seen": 1391040, + "step": 2395 + }, + { + "epoch": 0.3574620196604111, + "grad_norm": 6.257038593292236, + "learning_rate": 8.93282692880548e-06, + "loss": 0.7053, + "num_input_tokens_seen": 1393792, + "step": 2400 + }, + { + "epoch": 0.3582067322013703, + "grad_norm": 6.247485637664795, + "learning_rate": 8.95144474232946e-06, + "loss": 0.7499, + "num_input_tokens_seen": 1396576, + "step": 2405 + }, + { + "epoch": 0.3589514447423295, + "grad_norm": 2.8711037635803223, + "learning_rate": 8.97006255585344e-06, + "loss": 0.7237, + "num_input_tokens_seen": 1399552, + "step": 2410 + }, + { + "epoch": 0.35969615728328863, + "grad_norm": 5.243406295776367, + "learning_rate": 8.98868036937742e-06, + "loss": 0.5462, + "num_input_tokens_seen": 1402368, + "step": 2415 + }, + { + "epoch": 0.36044086982424783, + "grad_norm": 6.734992504119873, + "learning_rate": 9.0072981829014e-06, + "loss": 0.668, + "num_input_tokens_seen": 1405344, + "step": 2420 + }, + { + "epoch": 0.36118558236520704, + "grad_norm": 5.759209156036377, + "learning_rate": 9.02591599642538e-06, + "loss": 0.6697, + "num_input_tokens_seen": 1408064, + "step": 2425 + }, + { + "epoch": 0.36193029490616624, + "grad_norm": 7.833540439605713, + "learning_rate": 9.04453380994936e-06, + "loss": 0.7015, + "num_input_tokens_seen": 1410944, + "step": 2430 + }, + { + "epoch": 0.3626750074471254, + "grad_norm": 2.672952175140381, + "learning_rate": 9.06315162347334e-06, + "loss": 0.679, + "num_input_tokens_seen": 1413856, + "step": 2435 + }, + { + "epoch": 0.3634197199880846, + "grad_norm": 10.38479995727539, + "learning_rate": 9.08176943699732e-06, + "loss": 0.659, + "num_input_tokens_seen": 1416864, + "step": 2440 + }, + { + "epoch": 0.3641644325290438, + "grad_norm": 7.404048919677734, + "learning_rate": 9.100387250521298e-06, + "loss": 0.6984, + "num_input_tokens_seen": 1419680, + "step": 2445 + }, + { + "epoch": 0.364909145070003, + "grad_norm": 5.611001491546631, + "learning_rate": 9.119005064045278e-06, + "loss": 0.6694, + "num_input_tokens_seen": 1422656, + "step": 2450 + }, + { + "epoch": 0.3656538576109622, + "grad_norm": 4.0814924240112305, + "learning_rate": 9.137622877569258e-06, + "loss": 0.6768, + "num_input_tokens_seen": 1425504, + "step": 2455 + }, + { + "epoch": 0.36639857015192134, + "grad_norm": 16.65565299987793, + "learning_rate": 9.156240691093238e-06, + "loss": 0.9282, + "num_input_tokens_seen": 1428704, + "step": 2460 + }, + { + "epoch": 0.36714328269288055, + "grad_norm": 8.93142032623291, + "learning_rate": 9.174858504617218e-06, + "loss": 0.858, + "num_input_tokens_seen": 1431744, + "step": 2465 + }, + { + "epoch": 0.36788799523383975, + "grad_norm": 6.452106952667236, + "learning_rate": 9.193476318141198e-06, + "loss": 0.7574, + "num_input_tokens_seen": 1434880, + "step": 2470 + }, + { + "epoch": 0.36863270777479895, + "grad_norm": 8.506200790405273, + "learning_rate": 9.212094131665178e-06, + "loss": 0.6614, + "num_input_tokens_seen": 1437728, + "step": 2475 + }, + { + "epoch": 0.3693774203157581, + "grad_norm": 3.8967082500457764, + "learning_rate": 9.230711945189158e-06, + "loss": 0.6218, + "num_input_tokens_seen": 1440672, + "step": 2480 + }, + { + "epoch": 0.3701221328567173, + "grad_norm": 4.989303112030029, + "learning_rate": 9.249329758713138e-06, + "loss": 0.6598, + "num_input_tokens_seen": 1443424, + "step": 2485 + }, + { + "epoch": 0.3708668453976765, + "grad_norm": 4.147903919219971, + "learning_rate": 9.267947572237116e-06, + "loss": 0.7236, + "num_input_tokens_seen": 1446368, + "step": 2490 + }, + { + "epoch": 0.3716115579386357, + "grad_norm": 4.212974548339844, + "learning_rate": 9.286565385761096e-06, + "loss": 0.6217, + "num_input_tokens_seen": 1449184, + "step": 2495 + }, + { + "epoch": 0.37235627047959485, + "grad_norm": 12.594098091125488, + "learning_rate": 9.305183199285077e-06, + "loss": 0.6284, + "num_input_tokens_seen": 1452096, + "step": 2500 + }, + { + "epoch": 0.37310098302055406, + "grad_norm": 12.862348556518555, + "learning_rate": 9.323801012809057e-06, + "loss": 0.8327, + "num_input_tokens_seen": 1455040, + "step": 2505 + }, + { + "epoch": 0.37384569556151326, + "grad_norm": 7.051909923553467, + "learning_rate": 9.342418826333036e-06, + "loss": 0.6333, + "num_input_tokens_seen": 1457984, + "step": 2510 + }, + { + "epoch": 0.37459040810247246, + "grad_norm": 17.062023162841797, + "learning_rate": 9.361036639857016e-06, + "loss": 0.665, + "num_input_tokens_seen": 1460896, + "step": 2515 + }, + { + "epoch": 0.3753351206434316, + "grad_norm": 7.026362895965576, + "learning_rate": 9.379654453380995e-06, + "loss": 0.5086, + "num_input_tokens_seen": 1463776, + "step": 2520 + }, + { + "epoch": 0.3760798331843908, + "grad_norm": 6.829153060913086, + "learning_rate": 9.398272266904975e-06, + "loss": 0.7674, + "num_input_tokens_seen": 1466464, + "step": 2525 + }, + { + "epoch": 0.37682454572535, + "grad_norm": 9.46269702911377, + "learning_rate": 9.416890080428954e-06, + "loss": 0.6392, + "num_input_tokens_seen": 1469472, + "step": 2530 + }, + { + "epoch": 0.3775692582663092, + "grad_norm": 9.520059585571289, + "learning_rate": 9.435507893952934e-06, + "loss": 0.6424, + "num_input_tokens_seen": 1472256, + "step": 2535 + }, + { + "epoch": 0.3783139708072684, + "grad_norm": 5.246187686920166, + "learning_rate": 9.454125707476915e-06, + "loss": 0.6139, + "num_input_tokens_seen": 1475392, + "step": 2540 + }, + { + "epoch": 0.37905868334822757, + "grad_norm": 3.8253445625305176, + "learning_rate": 9.472743521000895e-06, + "loss": 0.6513, + "num_input_tokens_seen": 1477856, + "step": 2545 + }, + { + "epoch": 0.37980339588918677, + "grad_norm": 11.698685646057129, + "learning_rate": 9.491361334524875e-06, + "loss": 0.7555, + "num_input_tokens_seen": 1480736, + "step": 2550 + }, + { + "epoch": 0.38054810843014597, + "grad_norm": 5.393768310546875, + "learning_rate": 9.509979148048853e-06, + "loss": 0.7699, + "num_input_tokens_seen": 1483680, + "step": 2555 + }, + { + "epoch": 0.3812928209711052, + "grad_norm": 4.996095180511475, + "learning_rate": 9.528596961572833e-06, + "loss": 0.7395, + "num_input_tokens_seen": 1486560, + "step": 2560 + }, + { + "epoch": 0.3820375335120643, + "grad_norm": 6.994109153747559, + "learning_rate": 9.547214775096813e-06, + "loss": 0.7138, + "num_input_tokens_seen": 1489376, + "step": 2565 + }, + { + "epoch": 0.3827822460530235, + "grad_norm": 5.492467880249023, + "learning_rate": 9.565832588620793e-06, + "loss": 0.6484, + "num_input_tokens_seen": 1492448, + "step": 2570 + }, + { + "epoch": 0.3835269585939827, + "grad_norm": 5.083649635314941, + "learning_rate": 9.584450402144771e-06, + "loss": 0.6859, + "num_input_tokens_seen": 1495424, + "step": 2575 + }, + { + "epoch": 0.38427167113494193, + "grad_norm": 6.558760643005371, + "learning_rate": 9.603068215668753e-06, + "loss": 0.7025, + "num_input_tokens_seen": 1498368, + "step": 2580 + }, + { + "epoch": 0.3850163836759011, + "grad_norm": 7.164266109466553, + "learning_rate": 9.621686029192733e-06, + "loss": 0.6367, + "num_input_tokens_seen": 1501152, + "step": 2585 + }, + { + "epoch": 0.3857610962168603, + "grad_norm": 6.666920185089111, + "learning_rate": 9.640303842716713e-06, + "loss": 0.7616, + "num_input_tokens_seen": 1504096, + "step": 2590 + }, + { + "epoch": 0.3865058087578195, + "grad_norm": 9.386176109313965, + "learning_rate": 9.658921656240691e-06, + "loss": 0.7824, + "num_input_tokens_seen": 1507136, + "step": 2595 + }, + { + "epoch": 0.3872505212987787, + "grad_norm": 7.2049994468688965, + "learning_rate": 9.677539469764671e-06, + "loss": 0.6308, + "num_input_tokens_seen": 1509888, + "step": 2600 + }, + { + "epoch": 0.3879952338397379, + "grad_norm": 6.100470542907715, + "learning_rate": 9.69615728328865e-06, + "loss": 0.7132, + "num_input_tokens_seen": 1512960, + "step": 2605 + }, + { + "epoch": 0.38873994638069703, + "grad_norm": 4.258932113647461, + "learning_rate": 9.71477509681263e-06, + "loss": 0.7371, + "num_input_tokens_seen": 1515648, + "step": 2610 + }, + { + "epoch": 0.38948465892165623, + "grad_norm": 3.85831356048584, + "learning_rate": 9.73339291033661e-06, + "loss": 0.6536, + "num_input_tokens_seen": 1518432, + "step": 2615 + }, + { + "epoch": 0.39022937146261544, + "grad_norm": 7.0201096534729, + "learning_rate": 9.75201072386059e-06, + "loss": 0.7652, + "num_input_tokens_seen": 1521152, + "step": 2620 + }, + { + "epoch": 0.39097408400357464, + "grad_norm": 9.793900489807129, + "learning_rate": 9.77062853738457e-06, + "loss": 0.7137, + "num_input_tokens_seen": 1523744, + "step": 2625 + }, + { + "epoch": 0.3917187965445338, + "grad_norm": 8.175745964050293, + "learning_rate": 9.78924635090855e-06, + "loss": 0.665, + "num_input_tokens_seen": 1526720, + "step": 2630 + }, + { + "epoch": 0.392463509085493, + "grad_norm": 6.570956707000732, + "learning_rate": 9.80786416443253e-06, + "loss": 0.5688, + "num_input_tokens_seen": 1529792, + "step": 2635 + }, + { + "epoch": 0.3932082216264522, + "grad_norm": 4.837464332580566, + "learning_rate": 9.826481977956509e-06, + "loss": 0.712, + "num_input_tokens_seen": 1532544, + "step": 2640 + }, + { + "epoch": 0.3939529341674114, + "grad_norm": 5.959856033325195, + "learning_rate": 9.845099791480489e-06, + "loss": 0.7112, + "num_input_tokens_seen": 1535616, + "step": 2645 + }, + { + "epoch": 0.39469764670837054, + "grad_norm": 12.668920516967773, + "learning_rate": 9.863717605004468e-06, + "loss": 0.7286, + "num_input_tokens_seen": 1539008, + "step": 2650 + }, + { + "epoch": 0.39544235924932974, + "grad_norm": 6.036080360412598, + "learning_rate": 9.882335418528448e-06, + "loss": 0.6745, + "num_input_tokens_seen": 1541792, + "step": 2655 + }, + { + "epoch": 0.39618707179028895, + "grad_norm": 6.096039295196533, + "learning_rate": 9.900953232052428e-06, + "loss": 0.6615, + "num_input_tokens_seen": 1544864, + "step": 2660 + }, + { + "epoch": 0.39693178433124815, + "grad_norm": 4.672317028045654, + "learning_rate": 9.919571045576408e-06, + "loss": 0.6747, + "num_input_tokens_seen": 1547904, + "step": 2665 + }, + { + "epoch": 0.39767649687220735, + "grad_norm": 5.80328369140625, + "learning_rate": 9.938188859100388e-06, + "loss": 0.6305, + "num_input_tokens_seen": 1550688, + "step": 2670 + }, + { + "epoch": 0.3984212094131665, + "grad_norm": 7.826707363128662, + "learning_rate": 9.956806672624368e-06, + "loss": 0.6713, + "num_input_tokens_seen": 1553824, + "step": 2675 + }, + { + "epoch": 0.3991659219541257, + "grad_norm": 5.374536991119385, + "learning_rate": 9.975424486148348e-06, + "loss": 0.5161, + "num_input_tokens_seen": 1556576, + "step": 2680 + }, + { + "epoch": 0.3999106344950849, + "grad_norm": 7.033294200897217, + "learning_rate": 9.994042299672326e-06, + "loss": 0.6303, + "num_input_tokens_seen": 1559296, + "step": 2685 + }, + { + "epoch": 0.4006553470360441, + "grad_norm": 6.531309604644775, + "learning_rate": 1.0012660113196306e-05, + "loss": 0.6971, + "num_input_tokens_seen": 1562240, + "step": 2690 + }, + { + "epoch": 0.40140005957700325, + "grad_norm": 7.678210258483887, + "learning_rate": 1.0031277926720286e-05, + "loss": 0.5904, + "num_input_tokens_seen": 1565088, + "step": 2695 + }, + { + "epoch": 0.40214477211796246, + "grad_norm": 5.280368328094482, + "learning_rate": 1.0049895740244266e-05, + "loss": 0.656, + "num_input_tokens_seen": 1568000, + "step": 2700 + }, + { + "epoch": 0.40288948465892166, + "grad_norm": 10.846343040466309, + "learning_rate": 1.0068513553768246e-05, + "loss": 0.7777, + "num_input_tokens_seen": 1570912, + "step": 2705 + }, + { + "epoch": 0.40363419719988086, + "grad_norm": 19.250574111938477, + "learning_rate": 1.0087131367292226e-05, + "loss": 0.7816, + "num_input_tokens_seen": 1573888, + "step": 2710 + }, + { + "epoch": 0.40437890974084, + "grad_norm": 14.101375579833984, + "learning_rate": 1.0105749180816206e-05, + "loss": 0.7147, + "num_input_tokens_seen": 1576672, + "step": 2715 + }, + { + "epoch": 0.4051236222817992, + "grad_norm": 5.275883674621582, + "learning_rate": 1.0124366994340186e-05, + "loss": 0.5474, + "num_input_tokens_seen": 1579456, + "step": 2720 + }, + { + "epoch": 0.4058683348227584, + "grad_norm": 4.974694728851318, + "learning_rate": 1.0142984807864164e-05, + "loss": 0.7251, + "num_input_tokens_seen": 1581952, + "step": 2725 + }, + { + "epoch": 0.4066130473637176, + "grad_norm": 7.858240604400635, + "learning_rate": 1.0161602621388144e-05, + "loss": 0.657, + "num_input_tokens_seen": 1584864, + "step": 2730 + }, + { + "epoch": 0.4073577599046768, + "grad_norm": 7.096179485321045, + "learning_rate": 1.0180220434912124e-05, + "loss": 0.7365, + "num_input_tokens_seen": 1588160, + "step": 2735 + }, + { + "epoch": 0.40810247244563597, + "grad_norm": 12.990878105163574, + "learning_rate": 1.0198838248436104e-05, + "loss": 0.7556, + "num_input_tokens_seen": 1590880, + "step": 2740 + }, + { + "epoch": 0.40884718498659517, + "grad_norm": 5.617076396942139, + "learning_rate": 1.0217456061960085e-05, + "loss": 0.7406, + "num_input_tokens_seen": 1593600, + "step": 2745 + }, + { + "epoch": 0.40959189752755437, + "grad_norm": 7.684675216674805, + "learning_rate": 1.0236073875484064e-05, + "loss": 0.8826, + "num_input_tokens_seen": 1596512, + "step": 2750 + }, + { + "epoch": 0.4103366100685136, + "grad_norm": 4.783047199249268, + "learning_rate": 1.0254691689008044e-05, + "loss": 0.6347, + "num_input_tokens_seen": 1599296, + "step": 2755 + }, + { + "epoch": 0.4110813226094727, + "grad_norm": 7.1263957023620605, + "learning_rate": 1.0273309502532023e-05, + "loss": 0.6868, + "num_input_tokens_seen": 1602208, + "step": 2760 + }, + { + "epoch": 0.4118260351504319, + "grad_norm": 7.235676288604736, + "learning_rate": 1.0291927316056003e-05, + "loss": 0.7353, + "num_input_tokens_seen": 1605184, + "step": 2765 + }, + { + "epoch": 0.4125707476913911, + "grad_norm": 6.571208477020264, + "learning_rate": 1.0310545129579982e-05, + "loss": 0.6722, + "num_input_tokens_seen": 1608032, + "step": 2770 + }, + { + "epoch": 0.41331546023235033, + "grad_norm": 7.379067897796631, + "learning_rate": 1.0329162943103962e-05, + "loss": 0.7214, + "num_input_tokens_seen": 1611104, + "step": 2775 + }, + { + "epoch": 0.4140601727733095, + "grad_norm": 9.207716941833496, + "learning_rate": 1.0347780756627941e-05, + "loss": 0.7364, + "num_input_tokens_seen": 1614048, + "step": 2780 + }, + { + "epoch": 0.4148048853142687, + "grad_norm": 9.111557960510254, + "learning_rate": 1.0366398570151923e-05, + "loss": 0.8323, + "num_input_tokens_seen": 1617056, + "step": 2785 + }, + { + "epoch": 0.4155495978552279, + "grad_norm": 4.336118698120117, + "learning_rate": 1.0385016383675901e-05, + "loss": 0.7618, + "num_input_tokens_seen": 1619680, + "step": 2790 + }, + { + "epoch": 0.4162943103961871, + "grad_norm": 4.712620735168457, + "learning_rate": 1.0403634197199881e-05, + "loss": 0.683, + "num_input_tokens_seen": 1622816, + "step": 2795 + }, + { + "epoch": 0.4170390229371463, + "grad_norm": 5.307152271270752, + "learning_rate": 1.0422252010723861e-05, + "loss": 0.7641, + "num_input_tokens_seen": 1625664, + "step": 2800 + }, + { + "epoch": 0.41778373547810543, + "grad_norm": 3.775773763656616, + "learning_rate": 1.0440869824247841e-05, + "loss": 0.7152, + "num_input_tokens_seen": 1628384, + "step": 2805 + }, + { + "epoch": 0.41852844801906464, + "grad_norm": 4.502519607543945, + "learning_rate": 1.0459487637771821e-05, + "loss": 0.6084, + "num_input_tokens_seen": 1631616, + "step": 2810 + }, + { + "epoch": 0.41927316056002384, + "grad_norm": 9.057684898376465, + "learning_rate": 1.04781054512958e-05, + "loss": 0.6904, + "num_input_tokens_seen": 1634304, + "step": 2815 + }, + { + "epoch": 0.42001787310098304, + "grad_norm": 7.4951934814453125, + "learning_rate": 1.049672326481978e-05, + "loss": 0.6478, + "num_input_tokens_seen": 1637312, + "step": 2820 + }, + { + "epoch": 0.4207625856419422, + "grad_norm": 19.7974796295166, + "learning_rate": 1.051534107834376e-05, + "loss": 0.7495, + "num_input_tokens_seen": 1640192, + "step": 2825 + }, + { + "epoch": 0.4215072981829014, + "grad_norm": 5.667826175689697, + "learning_rate": 1.053395889186774e-05, + "loss": 0.7379, + "num_input_tokens_seen": 1643008, + "step": 2830 + }, + { + "epoch": 0.4222520107238606, + "grad_norm": 19.36119270324707, + "learning_rate": 1.0552576705391719e-05, + "loss": 0.8693, + "num_input_tokens_seen": 1645696, + "step": 2835 + }, + { + "epoch": 0.4229967232648198, + "grad_norm": 7.852826118469238, + "learning_rate": 1.0571194518915699e-05, + "loss": 0.7489, + "num_input_tokens_seen": 1648448, + "step": 2840 + }, + { + "epoch": 0.42374143580577894, + "grad_norm": 7.081023693084717, + "learning_rate": 1.0589812332439679e-05, + "loss": 0.7749, + "num_input_tokens_seen": 1651392, + "step": 2845 + }, + { + "epoch": 0.42448614834673815, + "grad_norm": 4.226819038391113, + "learning_rate": 1.0608430145963659e-05, + "loss": 0.7086, + "num_input_tokens_seen": 1654240, + "step": 2850 + }, + { + "epoch": 0.42523086088769735, + "grad_norm": 5.133233547210693, + "learning_rate": 1.0627047959487637e-05, + "loss": 0.6217, + "num_input_tokens_seen": 1657088, + "step": 2855 + }, + { + "epoch": 0.42597557342865655, + "grad_norm": 10.703008651733398, + "learning_rate": 1.0645665773011617e-05, + "loss": 0.7277, + "num_input_tokens_seen": 1659744, + "step": 2860 + }, + { + "epoch": 0.42672028596961575, + "grad_norm": 4.682960033416748, + "learning_rate": 1.0664283586535598e-05, + "loss": 0.6812, + "num_input_tokens_seen": 1662656, + "step": 2865 + }, + { + "epoch": 0.4274649985105749, + "grad_norm": 4.980742931365967, + "learning_rate": 1.0682901400059578e-05, + "loss": 0.6383, + "num_input_tokens_seen": 1665408, + "step": 2870 + }, + { + "epoch": 0.4282097110515341, + "grad_norm": 5.778203964233398, + "learning_rate": 1.0701519213583558e-05, + "loss": 0.5172, + "num_input_tokens_seen": 1668160, + "step": 2875 + }, + { + "epoch": 0.4289544235924933, + "grad_norm": 8.580055236816406, + "learning_rate": 1.0720137027107537e-05, + "loss": 0.7279, + "num_input_tokens_seen": 1671232, + "step": 2880 + }, + { + "epoch": 0.4296991361334525, + "grad_norm": 11.498860359191895, + "learning_rate": 1.0738754840631517e-05, + "loss": 0.6307, + "num_input_tokens_seen": 1674336, + "step": 2885 + }, + { + "epoch": 0.43044384867441166, + "grad_norm": 5.702898979187012, + "learning_rate": 1.0757372654155496e-05, + "loss": 0.7972, + "num_input_tokens_seen": 1677184, + "step": 2890 + }, + { + "epoch": 0.43118856121537086, + "grad_norm": 10.917574882507324, + "learning_rate": 1.0775990467679476e-05, + "loss": 0.8635, + "num_input_tokens_seen": 1679904, + "step": 2895 + }, + { + "epoch": 0.43193327375633006, + "grad_norm": 7.599544525146484, + "learning_rate": 1.0794608281203456e-05, + "loss": 0.633, + "num_input_tokens_seen": 1682656, + "step": 2900 + }, + { + "epoch": 0.43267798629728926, + "grad_norm": 6.22849702835083, + "learning_rate": 1.0813226094727436e-05, + "loss": 0.6222, + "num_input_tokens_seen": 1685440, + "step": 2905 + }, + { + "epoch": 0.4334226988382484, + "grad_norm": 6.376546382904053, + "learning_rate": 1.0831843908251416e-05, + "loss": 0.7166, + "num_input_tokens_seen": 1688288, + "step": 2910 + }, + { + "epoch": 0.4341674113792076, + "grad_norm": 4.958022117614746, + "learning_rate": 1.0850461721775396e-05, + "loss": 0.5439, + "num_input_tokens_seen": 1691264, + "step": 2915 + }, + { + "epoch": 0.4349121239201668, + "grad_norm": 5.127017974853516, + "learning_rate": 1.0869079535299374e-05, + "loss": 0.7129, + "num_input_tokens_seen": 1694240, + "step": 2920 + }, + { + "epoch": 0.435656836461126, + "grad_norm": 18.449949264526367, + "learning_rate": 1.0887697348823354e-05, + "loss": 0.7334, + "num_input_tokens_seen": 1697152, + "step": 2925 + }, + { + "epoch": 0.4364015490020852, + "grad_norm": 5.361304759979248, + "learning_rate": 1.0906315162347334e-05, + "loss": 0.6296, + "num_input_tokens_seen": 1699936, + "step": 2930 + }, + { + "epoch": 0.43714626154304437, + "grad_norm": 7.455173969268799, + "learning_rate": 1.0924932975871314e-05, + "loss": 0.6725, + "num_input_tokens_seen": 1702880, + "step": 2935 + }, + { + "epoch": 0.43789097408400357, + "grad_norm": 4.939868450164795, + "learning_rate": 1.0943550789395294e-05, + "loss": 0.6637, + "num_input_tokens_seen": 1705664, + "step": 2940 + }, + { + "epoch": 0.4386356866249628, + "grad_norm": 5.675046443939209, + "learning_rate": 1.0962168602919274e-05, + "loss": 0.6478, + "num_input_tokens_seen": 1708416, + "step": 2945 + }, + { + "epoch": 0.439380399165922, + "grad_norm": 6.856414318084717, + "learning_rate": 1.0980786416443254e-05, + "loss": 0.5258, + "num_input_tokens_seen": 1711424, + "step": 2950 + }, + { + "epoch": 0.4401251117068811, + "grad_norm": 9.080936431884766, + "learning_rate": 1.0999404229967234e-05, + "loss": 0.9414, + "num_input_tokens_seen": 1714208, + "step": 2955 + }, + { + "epoch": 0.4408698242478403, + "grad_norm": 4.385567665100098, + "learning_rate": 1.1018022043491214e-05, + "loss": 0.6444, + "num_input_tokens_seen": 1716992, + "step": 2960 + }, + { + "epoch": 0.4416145367887995, + "grad_norm": 12.748610496520996, + "learning_rate": 1.1036639857015192e-05, + "loss": 0.9815, + "num_input_tokens_seen": 1719840, + "step": 2965 + }, + { + "epoch": 0.44235924932975873, + "grad_norm": 10.832447052001953, + "learning_rate": 1.1055257670539172e-05, + "loss": 0.6757, + "num_input_tokens_seen": 1722464, + "step": 2970 + }, + { + "epoch": 0.4431039618707179, + "grad_norm": 4.982848167419434, + "learning_rate": 1.1073875484063152e-05, + "loss": 0.6321, + "num_input_tokens_seen": 1725216, + "step": 2975 + }, + { + "epoch": 0.4438486744116771, + "grad_norm": 5.799880027770996, + "learning_rate": 1.1092493297587132e-05, + "loss": 0.7762, + "num_input_tokens_seen": 1728000, + "step": 2980 + }, + { + "epoch": 0.4445933869526363, + "grad_norm": 6.719766616821289, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.6389, + "num_input_tokens_seen": 1730912, + "step": 2985 + }, + { + "epoch": 0.4453380994935955, + "grad_norm": 5.4439849853515625, + "learning_rate": 1.1129728924635092e-05, + "loss": 0.6689, + "num_input_tokens_seen": 1733600, + "step": 2990 + }, + { + "epoch": 0.4460828120345547, + "grad_norm": 10.613160133361816, + "learning_rate": 1.1148346738159071e-05, + "loss": 0.5399, + "num_input_tokens_seen": 1736480, + "step": 2995 + }, + { + "epoch": 0.44682752457551383, + "grad_norm": 5.815305709838867, + "learning_rate": 1.1166964551683051e-05, + "loss": 0.5669, + "num_input_tokens_seen": 1739264, + "step": 3000 + }, + { + "epoch": 0.44757223711647304, + "grad_norm": 6.918760776519775, + "learning_rate": 1.1185582365207031e-05, + "loss": 0.6507, + "num_input_tokens_seen": 1742336, + "step": 3005 + }, + { + "epoch": 0.44831694965743224, + "grad_norm": 6.284045696258545, + "learning_rate": 1.120420017873101e-05, + "loss": 0.6872, + "num_input_tokens_seen": 1745216, + "step": 3010 + }, + { + "epoch": 0.44906166219839144, + "grad_norm": 11.237996101379395, + "learning_rate": 1.122281799225499e-05, + "loss": 0.781, + "num_input_tokens_seen": 1748192, + "step": 3015 + }, + { + "epoch": 0.4498063747393506, + "grad_norm": 19.39266586303711, + "learning_rate": 1.124143580577897e-05, + "loss": 0.9747, + "num_input_tokens_seen": 1750944, + "step": 3020 + }, + { + "epoch": 0.4505510872803098, + "grad_norm": 10.310136795043945, + "learning_rate": 1.126005361930295e-05, + "loss": 0.6629, + "num_input_tokens_seen": 1753856, + "step": 3025 + }, + { + "epoch": 0.451295799821269, + "grad_norm": 8.701903343200684, + "learning_rate": 1.127867143282693e-05, + "loss": 0.6627, + "num_input_tokens_seen": 1756896, + "step": 3030 + }, + { + "epoch": 0.4520405123622282, + "grad_norm": 8.247201919555664, + "learning_rate": 1.129728924635091e-05, + "loss": 0.7388, + "num_input_tokens_seen": 1760032, + "step": 3035 + }, + { + "epoch": 0.45278522490318734, + "grad_norm": 8.74451732635498, + "learning_rate": 1.1315907059874889e-05, + "loss": 0.6431, + "num_input_tokens_seen": 1762848, + "step": 3040 + }, + { + "epoch": 0.45352993744414655, + "grad_norm": 7.169740676879883, + "learning_rate": 1.1334524873398869e-05, + "loss": 0.7241, + "num_input_tokens_seen": 1765760, + "step": 3045 + }, + { + "epoch": 0.45427464998510575, + "grad_norm": 4.269694805145264, + "learning_rate": 1.1353142686922847e-05, + "loss": 0.592, + "num_input_tokens_seen": 1768608, + "step": 3050 + }, + { + "epoch": 0.45501936252606495, + "grad_norm": 6.2934889793396, + "learning_rate": 1.1371760500446827e-05, + "loss": 0.729, + "num_input_tokens_seen": 1771808, + "step": 3055 + }, + { + "epoch": 0.45576407506702415, + "grad_norm": 8.290600776672363, + "learning_rate": 1.1390378313970807e-05, + "loss": 0.6005, + "num_input_tokens_seen": 1774656, + "step": 3060 + }, + { + "epoch": 0.4565087876079833, + "grad_norm": 5.087130546569824, + "learning_rate": 1.1408996127494787e-05, + "loss": 0.7877, + "num_input_tokens_seen": 1777568, + "step": 3065 + }, + { + "epoch": 0.4572535001489425, + "grad_norm": 7.323131084442139, + "learning_rate": 1.1427613941018769e-05, + "loss": 0.7914, + "num_input_tokens_seen": 1780672, + "step": 3070 + }, + { + "epoch": 0.4579982126899017, + "grad_norm": 6.190136432647705, + "learning_rate": 1.1446231754542747e-05, + "loss": 0.6163, + "num_input_tokens_seen": 1783424, + "step": 3075 + }, + { + "epoch": 0.4587429252308609, + "grad_norm": 12.39527702331543, + "learning_rate": 1.1464849568066727e-05, + "loss": 0.6992, + "num_input_tokens_seen": 1786176, + "step": 3080 + }, + { + "epoch": 0.45948763777182006, + "grad_norm": 22.108617782592773, + "learning_rate": 1.1483467381590707e-05, + "loss": 0.8996, + "num_input_tokens_seen": 1788896, + "step": 3085 + }, + { + "epoch": 0.46023235031277926, + "grad_norm": 5.151708126068115, + "learning_rate": 1.1502085195114687e-05, + "loss": 0.8595, + "num_input_tokens_seen": 1791840, + "step": 3090 + }, + { + "epoch": 0.46097706285373846, + "grad_norm": 5.434630870819092, + "learning_rate": 1.1520703008638665e-05, + "loss": 0.5474, + "num_input_tokens_seen": 1794752, + "step": 3095 + }, + { + "epoch": 0.46172177539469766, + "grad_norm": 7.366402626037598, + "learning_rate": 1.1539320822162645e-05, + "loss": 0.6348, + "num_input_tokens_seen": 1797568, + "step": 3100 + }, + { + "epoch": 0.4624664879356568, + "grad_norm": 6.68189811706543, + "learning_rate": 1.1557938635686626e-05, + "loss": 0.84, + "num_input_tokens_seen": 1800768, + "step": 3105 + }, + { + "epoch": 0.463211200476616, + "grad_norm": 11.85738468170166, + "learning_rate": 1.1576556449210606e-05, + "loss": 0.7101, + "num_input_tokens_seen": 1803744, + "step": 3110 + }, + { + "epoch": 0.4639559130175752, + "grad_norm": 9.552521705627441, + "learning_rate": 1.1595174262734585e-05, + "loss": 0.6717, + "num_input_tokens_seen": 1806624, + "step": 3115 + }, + { + "epoch": 0.4647006255585344, + "grad_norm": 10.951202392578125, + "learning_rate": 1.1613792076258565e-05, + "loss": 0.828, + "num_input_tokens_seen": 1809600, + "step": 3120 + }, + { + "epoch": 0.4654453380994936, + "grad_norm": 11.104909896850586, + "learning_rate": 1.1632409889782545e-05, + "loss": 0.6971, + "num_input_tokens_seen": 1812480, + "step": 3125 + }, + { + "epoch": 0.46619005064045277, + "grad_norm": 6.632997512817383, + "learning_rate": 1.1651027703306524e-05, + "loss": 0.6242, + "num_input_tokens_seen": 1815328, + "step": 3130 + }, + { + "epoch": 0.46693476318141197, + "grad_norm": 5.392765522003174, + "learning_rate": 1.1669645516830504e-05, + "loss": 0.7664, + "num_input_tokens_seen": 1818368, + "step": 3135 + }, + { + "epoch": 0.4676794757223712, + "grad_norm": 6.406505584716797, + "learning_rate": 1.1688263330354483e-05, + "loss": 0.6773, + "num_input_tokens_seen": 1821312, + "step": 3140 + }, + { + "epoch": 0.4684241882633304, + "grad_norm": 4.172749996185303, + "learning_rate": 1.1706881143878464e-05, + "loss": 0.7166, + "num_input_tokens_seen": 1824640, + "step": 3145 + }, + { + "epoch": 0.4691689008042895, + "grad_norm": 6.134106159210205, + "learning_rate": 1.1725498957402444e-05, + "loss": 0.6002, + "num_input_tokens_seen": 1827392, + "step": 3150 + }, + { + "epoch": 0.4699136133452487, + "grad_norm": 5.026341915130615, + "learning_rate": 1.1744116770926424e-05, + "loss": 0.7628, + "num_input_tokens_seen": 1830080, + "step": 3155 + }, + { + "epoch": 0.47065832588620793, + "grad_norm": 2.7661499977111816, + "learning_rate": 1.1762734584450402e-05, + "loss": 0.7691, + "num_input_tokens_seen": 1833024, + "step": 3160 + }, + { + "epoch": 0.47140303842716713, + "grad_norm": 6.210758686065674, + "learning_rate": 1.1781352397974382e-05, + "loss": 0.7036, + "num_input_tokens_seen": 1835680, + "step": 3165 + }, + { + "epoch": 0.4721477509681263, + "grad_norm": 3.9920156002044678, + "learning_rate": 1.1799970211498362e-05, + "loss": 0.6602, + "num_input_tokens_seen": 1838464, + "step": 3170 + }, + { + "epoch": 0.4728924635090855, + "grad_norm": 5.494725704193115, + "learning_rate": 1.1818588025022342e-05, + "loss": 0.6873, + "num_input_tokens_seen": 1841728, + "step": 3175 + }, + { + "epoch": 0.4736371760500447, + "grad_norm": 4.574413299560547, + "learning_rate": 1.183720583854632e-05, + "loss": 0.694, + "num_input_tokens_seen": 1844576, + "step": 3180 + }, + { + "epoch": 0.4743818885910039, + "grad_norm": 3.251973867416382, + "learning_rate": 1.1855823652070302e-05, + "loss": 0.725, + "num_input_tokens_seen": 1847328, + "step": 3185 + }, + { + "epoch": 0.4751266011319631, + "grad_norm": 5.903323173522949, + "learning_rate": 1.1874441465594282e-05, + "loss": 0.7216, + "num_input_tokens_seen": 1850240, + "step": 3190 + }, + { + "epoch": 0.47587131367292224, + "grad_norm": 3.73075532913208, + "learning_rate": 1.1893059279118262e-05, + "loss": 0.6005, + "num_input_tokens_seen": 1853216, + "step": 3195 + }, + { + "epoch": 0.47661602621388144, + "grad_norm": 5.794747352600098, + "learning_rate": 1.1911677092642242e-05, + "loss": 0.5401, + "num_input_tokens_seen": 1856000, + "step": 3200 + }, + { + "epoch": 0.47736073875484064, + "grad_norm": 5.75193977355957, + "learning_rate": 1.193029490616622e-05, + "loss": 0.6845, + "num_input_tokens_seen": 1858912, + "step": 3205 + }, + { + "epoch": 0.47810545129579984, + "grad_norm": 5.801257133483887, + "learning_rate": 1.19489127196902e-05, + "loss": 0.7349, + "num_input_tokens_seen": 1862240, + "step": 3210 + }, + { + "epoch": 0.478850163836759, + "grad_norm": 19.279394149780273, + "learning_rate": 1.196753053321418e-05, + "loss": 0.8585, + "num_input_tokens_seen": 1864896, + "step": 3215 + }, + { + "epoch": 0.4795948763777182, + "grad_norm": 10.081915855407715, + "learning_rate": 1.198614834673816e-05, + "loss": 0.7113, + "num_input_tokens_seen": 1867904, + "step": 3220 + }, + { + "epoch": 0.4803395889186774, + "grad_norm": 8.756119728088379, + "learning_rate": 1.200476616026214e-05, + "loss": 0.7082, + "num_input_tokens_seen": 1870496, + "step": 3225 + }, + { + "epoch": 0.4810843014596366, + "grad_norm": 5.047640800476074, + "learning_rate": 1.202338397378612e-05, + "loss": 0.5399, + "num_input_tokens_seen": 1873152, + "step": 3230 + }, + { + "epoch": 0.48182901400059575, + "grad_norm": 4.2024688720703125, + "learning_rate": 1.20420017873101e-05, + "loss": 0.6395, + "num_input_tokens_seen": 1876096, + "step": 3235 + }, + { + "epoch": 0.48257372654155495, + "grad_norm": 5.848716735839844, + "learning_rate": 1.206061960083408e-05, + "loss": 0.7054, + "num_input_tokens_seen": 1879168, + "step": 3240 + }, + { + "epoch": 0.48331843908251415, + "grad_norm": 6.029223442077637, + "learning_rate": 1.2079237414358058e-05, + "loss": 0.6064, + "num_input_tokens_seen": 1882304, + "step": 3245 + }, + { + "epoch": 0.48406315162347335, + "grad_norm": 9.93999195098877, + "learning_rate": 1.2097855227882038e-05, + "loss": 0.7087, + "num_input_tokens_seen": 1885120, + "step": 3250 + }, + { + "epoch": 0.48480786416443256, + "grad_norm": 4.736778259277344, + "learning_rate": 1.2116473041406018e-05, + "loss": 0.622, + "num_input_tokens_seen": 1887968, + "step": 3255 + }, + { + "epoch": 0.4855525767053917, + "grad_norm": 8.373746871948242, + "learning_rate": 1.2135090854929997e-05, + "loss": 0.8116, + "num_input_tokens_seen": 1890688, + "step": 3260 + }, + { + "epoch": 0.4862972892463509, + "grad_norm": 6.220398426055908, + "learning_rate": 1.2153708668453977e-05, + "loss": 0.4603, + "num_input_tokens_seen": 1893408, + "step": 3265 + }, + { + "epoch": 0.4870420017873101, + "grad_norm": 43.285804748535156, + "learning_rate": 1.2172326481977957e-05, + "loss": 0.4725, + "num_input_tokens_seen": 1896288, + "step": 3270 + }, + { + "epoch": 0.4877867143282693, + "grad_norm": 12.573589324951172, + "learning_rate": 1.2190944295501937e-05, + "loss": 1.0182, + "num_input_tokens_seen": 1899296, + "step": 3275 + }, + { + "epoch": 0.48853142686922846, + "grad_norm": 16.495800018310547, + "learning_rate": 1.2209562109025917e-05, + "loss": 0.7142, + "num_input_tokens_seen": 1902176, + "step": 3280 + }, + { + "epoch": 0.48927613941018766, + "grad_norm": 12.490256309509277, + "learning_rate": 1.2228179922549897e-05, + "loss": 0.5823, + "num_input_tokens_seen": 1904992, + "step": 3285 + }, + { + "epoch": 0.49002085195114686, + "grad_norm": 9.13528823852539, + "learning_rate": 1.2246797736073875e-05, + "loss": 0.7443, + "num_input_tokens_seen": 1907936, + "step": 3290 + }, + { + "epoch": 0.49076556449210607, + "grad_norm": 6.014569282531738, + "learning_rate": 1.2265415549597855e-05, + "loss": 0.5885, + "num_input_tokens_seen": 1911072, + "step": 3295 + }, + { + "epoch": 0.4915102770330652, + "grad_norm": 9.02782917022705, + "learning_rate": 1.2284033363121835e-05, + "loss": 0.728, + "num_input_tokens_seen": 1913792, + "step": 3300 + }, + { + "epoch": 0.4922549895740244, + "grad_norm": 6.530007362365723, + "learning_rate": 1.2302651176645815e-05, + "loss": 0.658, + "num_input_tokens_seen": 1916864, + "step": 3305 + }, + { + "epoch": 0.4929997021149836, + "grad_norm": 4.135145664215088, + "learning_rate": 1.2321268990169795e-05, + "loss": 0.5509, + "num_input_tokens_seen": 1920128, + "step": 3310 + }, + { + "epoch": 0.4937444146559428, + "grad_norm": 9.976263999938965, + "learning_rate": 1.2339886803693775e-05, + "loss": 0.6228, + "num_input_tokens_seen": 1922976, + "step": 3315 + }, + { + "epoch": 0.494489127196902, + "grad_norm": 8.959957122802734, + "learning_rate": 1.2358504617217755e-05, + "loss": 0.5179, + "num_input_tokens_seen": 1925952, + "step": 3320 + }, + { + "epoch": 0.49523383973786117, + "grad_norm": 16.707561492919922, + "learning_rate": 1.2377122430741735e-05, + "loss": 0.8293, + "num_input_tokens_seen": 1928928, + "step": 3325 + }, + { + "epoch": 0.4959785522788204, + "grad_norm": 8.722095489501953, + "learning_rate": 1.2395740244265713e-05, + "loss": 0.6587, + "num_input_tokens_seen": 1931808, + "step": 3330 + }, + { + "epoch": 0.4967232648197796, + "grad_norm": 9.196575164794922, + "learning_rate": 1.2414358057789693e-05, + "loss": 0.6507, + "num_input_tokens_seen": 1934528, + "step": 3335 + }, + { + "epoch": 0.4974679773607388, + "grad_norm": 17.223142623901367, + "learning_rate": 1.2432975871313673e-05, + "loss": 0.8396, + "num_input_tokens_seen": 1937568, + "step": 3340 + }, + { + "epoch": 0.4982126899016979, + "grad_norm": 8.71137809753418, + "learning_rate": 1.2451593684837653e-05, + "loss": 0.8858, + "num_input_tokens_seen": 1940832, + "step": 3345 + }, + { + "epoch": 0.4989574024426571, + "grad_norm": 4.081054210662842, + "learning_rate": 1.2470211498361634e-05, + "loss": 0.5917, + "num_input_tokens_seen": 1943520, + "step": 3350 + }, + { + "epoch": 0.49970211498361633, + "grad_norm": 9.244853019714355, + "learning_rate": 1.2488829311885613e-05, + "loss": 0.6619, + "num_input_tokens_seen": 1946240, + "step": 3355 + }, + { + "epoch": 0.5004468275245755, + "grad_norm": 4.412874698638916, + "learning_rate": 1.2507447125409594e-05, + "loss": 0.5674, + "num_input_tokens_seen": 1949248, + "step": 3360 + }, + { + "epoch": 0.5011915400655347, + "grad_norm": 5.132126808166504, + "learning_rate": 1.252606493893357e-05, + "loss": 0.6146, + "num_input_tokens_seen": 1952160, + "step": 3365 + }, + { + "epoch": 0.5019362526064939, + "grad_norm": 6.257622718811035, + "learning_rate": 1.254468275245755e-05, + "loss": 0.5728, + "num_input_tokens_seen": 1955040, + "step": 3370 + }, + { + "epoch": 0.5026809651474531, + "grad_norm": 4.833962917327881, + "learning_rate": 1.256330056598153e-05, + "loss": 0.779, + "num_input_tokens_seen": 1957888, + "step": 3375 + }, + { + "epoch": 0.5034256776884123, + "grad_norm": 14.554532051086426, + "learning_rate": 1.258191837950551e-05, + "loss": 0.707, + "num_input_tokens_seen": 1960992, + "step": 3380 + }, + { + "epoch": 0.5041703902293715, + "grad_norm": 12.424259185791016, + "learning_rate": 1.260053619302949e-05, + "loss": 0.55, + "num_input_tokens_seen": 1963968, + "step": 3385 + }, + { + "epoch": 0.5049151027703307, + "grad_norm": 9.645299911499023, + "learning_rate": 1.2619154006553472e-05, + "loss": 0.8192, + "num_input_tokens_seen": 1966848, + "step": 3390 + }, + { + "epoch": 0.5056598153112899, + "grad_norm": 5.94127082824707, + "learning_rate": 1.2637771820077452e-05, + "loss": 0.6152, + "num_input_tokens_seen": 1969536, + "step": 3395 + }, + { + "epoch": 0.506404527852249, + "grad_norm": 10.100982666015625, + "learning_rate": 1.2656389633601432e-05, + "loss": 0.7087, + "num_input_tokens_seen": 1972704, + "step": 3400 + }, + { + "epoch": 0.5071492403932082, + "grad_norm": 7.057208061218262, + "learning_rate": 1.2675007447125412e-05, + "loss": 0.6898, + "num_input_tokens_seen": 1975552, + "step": 3405 + }, + { + "epoch": 0.5078939529341674, + "grad_norm": 6.112754821777344, + "learning_rate": 1.2693625260649388e-05, + "loss": 0.7849, + "num_input_tokens_seen": 1978624, + "step": 3410 + }, + { + "epoch": 0.5086386654751266, + "grad_norm": 7.23696231842041, + "learning_rate": 1.2712243074173368e-05, + "loss": 0.7183, + "num_input_tokens_seen": 1981824, + "step": 3415 + }, + { + "epoch": 0.5093833780160858, + "grad_norm": 6.1846771240234375, + "learning_rate": 1.2730860887697348e-05, + "loss": 0.7571, + "num_input_tokens_seen": 1984736, + "step": 3420 + }, + { + "epoch": 0.510128090557045, + "grad_norm": 7.05345344543457, + "learning_rate": 1.2749478701221328e-05, + "loss": 0.756, + "num_input_tokens_seen": 1987552, + "step": 3425 + }, + { + "epoch": 0.5108728030980042, + "grad_norm": 9.011363983154297, + "learning_rate": 1.276809651474531e-05, + "loss": 0.7049, + "num_input_tokens_seen": 1990880, + "step": 3430 + }, + { + "epoch": 0.5116175156389634, + "grad_norm": 9.624495506286621, + "learning_rate": 1.278671432826929e-05, + "loss": 0.6492, + "num_input_tokens_seen": 1994048, + "step": 3435 + }, + { + "epoch": 0.5123622281799225, + "grad_norm": 5.973958492279053, + "learning_rate": 1.280533214179327e-05, + "loss": 0.6028, + "num_input_tokens_seen": 1997792, + "step": 3440 + }, + { + "epoch": 0.5131069407208817, + "grad_norm": 5.396010398864746, + "learning_rate": 1.282394995531725e-05, + "loss": 0.6615, + "num_input_tokens_seen": 2000800, + "step": 3445 + }, + { + "epoch": 0.5138516532618409, + "grad_norm": 7.393359661102295, + "learning_rate": 1.2842567768841226e-05, + "loss": 0.634, + "num_input_tokens_seen": 2003904, + "step": 3450 + }, + { + "epoch": 0.5145963658028001, + "grad_norm": 6.928417205810547, + "learning_rate": 1.2861185582365206e-05, + "loss": 0.7966, + "num_input_tokens_seen": 2006944, + "step": 3455 + }, + { + "epoch": 0.5153410783437593, + "grad_norm": 8.052879333496094, + "learning_rate": 1.2879803395889186e-05, + "loss": 0.661, + "num_input_tokens_seen": 2010496, + "step": 3460 + }, + { + "epoch": 0.5160857908847185, + "grad_norm": 5.980714797973633, + "learning_rate": 1.2898421209413166e-05, + "loss": 0.6873, + "num_input_tokens_seen": 2013408, + "step": 3465 + }, + { + "epoch": 0.5168305034256777, + "grad_norm": 9.96666431427002, + "learning_rate": 1.2917039022937148e-05, + "loss": 0.7084, + "num_input_tokens_seen": 2016032, + "step": 3470 + }, + { + "epoch": 0.5175752159666369, + "grad_norm": 13.751755714416504, + "learning_rate": 1.2935656836461127e-05, + "loss": 0.6944, + "num_input_tokens_seen": 2018752, + "step": 3475 + }, + { + "epoch": 0.5183199285075961, + "grad_norm": 8.746589660644531, + "learning_rate": 1.2954274649985107e-05, + "loss": 0.7907, + "num_input_tokens_seen": 2021600, + "step": 3480 + }, + { + "epoch": 0.5190646410485552, + "grad_norm": 12.643209457397461, + "learning_rate": 1.2972892463509087e-05, + "loss": 0.8082, + "num_input_tokens_seen": 2024544, + "step": 3485 + }, + { + "epoch": 0.5198093535895144, + "grad_norm": 6.988947868347168, + "learning_rate": 1.2991510277033067e-05, + "loss": 0.778, + "num_input_tokens_seen": 2027552, + "step": 3490 + }, + { + "epoch": 0.5205540661304736, + "grad_norm": 4.197910785675049, + "learning_rate": 1.3010128090557044e-05, + "loss": 0.6248, + "num_input_tokens_seen": 2030656, + "step": 3495 + }, + { + "epoch": 0.5212987786714328, + "grad_norm": 4.720814228057861, + "learning_rate": 1.3028745904081024e-05, + "loss": 0.7026, + "num_input_tokens_seen": 2033600, + "step": 3500 + }, + { + "epoch": 0.522043491212392, + "grad_norm": 4.502511501312256, + "learning_rate": 1.3047363717605005e-05, + "loss": 0.6897, + "num_input_tokens_seen": 2036544, + "step": 3505 + }, + { + "epoch": 0.5227882037533512, + "grad_norm": 5.327757835388184, + "learning_rate": 1.3065981531128985e-05, + "loss": 0.6061, + "num_input_tokens_seen": 2039392, + "step": 3510 + }, + { + "epoch": 0.5235329162943104, + "grad_norm": 6.05517578125, + "learning_rate": 1.3084599344652965e-05, + "loss": 0.7028, + "num_input_tokens_seen": 2042080, + "step": 3515 + }, + { + "epoch": 0.5242776288352696, + "grad_norm": 10.311328887939453, + "learning_rate": 1.3103217158176945e-05, + "loss": 0.7189, + "num_input_tokens_seen": 2044992, + "step": 3520 + }, + { + "epoch": 0.5250223413762288, + "grad_norm": 6.062792778015137, + "learning_rate": 1.3121834971700925e-05, + "loss": 0.7385, + "num_input_tokens_seen": 2047776, + "step": 3525 + }, + { + "epoch": 0.5257670539171879, + "grad_norm": 4.1465840339660645, + "learning_rate": 1.3140452785224905e-05, + "loss": 0.6774, + "num_input_tokens_seen": 2050816, + "step": 3530 + }, + { + "epoch": 0.5265117664581471, + "grad_norm": 6.178119659423828, + "learning_rate": 1.3159070598748885e-05, + "loss": 0.7314, + "num_input_tokens_seen": 2053952, + "step": 3535 + }, + { + "epoch": 0.5272564789991063, + "grad_norm": 5.806377410888672, + "learning_rate": 1.3177688412272861e-05, + "loss": 0.649, + "num_input_tokens_seen": 2056896, + "step": 3540 + }, + { + "epoch": 0.5280011915400655, + "grad_norm": 5.787143230438232, + "learning_rate": 1.3196306225796843e-05, + "loss": 0.6511, + "num_input_tokens_seen": 2059776, + "step": 3545 + }, + { + "epoch": 0.5287459040810247, + "grad_norm": 5.065536975860596, + "learning_rate": 1.3214924039320823e-05, + "loss": 0.6617, + "num_input_tokens_seen": 2062880, + "step": 3550 + }, + { + "epoch": 0.5294906166219839, + "grad_norm": 4.319667816162109, + "learning_rate": 1.3233541852844803e-05, + "loss": 0.6682, + "num_input_tokens_seen": 2065856, + "step": 3555 + }, + { + "epoch": 0.5302353291629431, + "grad_norm": 4.322238922119141, + "learning_rate": 1.3252159666368783e-05, + "loss": 0.7425, + "num_input_tokens_seen": 2068704, + "step": 3560 + }, + { + "epoch": 0.5309800417039023, + "grad_norm": 6.0274977684021, + "learning_rate": 1.3270777479892763e-05, + "loss": 0.6277, + "num_input_tokens_seen": 2071488, + "step": 3565 + }, + { + "epoch": 0.5317247542448614, + "grad_norm": 8.042961120605469, + "learning_rate": 1.3289395293416743e-05, + "loss": 0.7199, + "num_input_tokens_seen": 2074432, + "step": 3570 + }, + { + "epoch": 0.5324694667858206, + "grad_norm": 3.437458038330078, + "learning_rate": 1.3308013106940723e-05, + "loss": 0.6071, + "num_input_tokens_seen": 2077120, + "step": 3575 + }, + { + "epoch": 0.5332141793267798, + "grad_norm": 7.377723693847656, + "learning_rate": 1.33266309204647e-05, + "loss": 0.6444, + "num_input_tokens_seen": 2079968, + "step": 3580 + }, + { + "epoch": 0.533958891867739, + "grad_norm": 4.294750213623047, + "learning_rate": 1.334524873398868e-05, + "loss": 0.6063, + "num_input_tokens_seen": 2082784, + "step": 3585 + }, + { + "epoch": 0.5347036044086982, + "grad_norm": 4.012420177459717, + "learning_rate": 1.336386654751266e-05, + "loss": 0.5577, + "num_input_tokens_seen": 2085920, + "step": 3590 + }, + { + "epoch": 0.5354483169496574, + "grad_norm": 10.419921875, + "learning_rate": 1.338248436103664e-05, + "loss": 0.6994, + "num_input_tokens_seen": 2088608, + "step": 3595 + }, + { + "epoch": 0.5361930294906166, + "grad_norm": 6.020255088806152, + "learning_rate": 1.340110217456062e-05, + "loss": 0.6861, + "num_input_tokens_seen": 2091584, + "step": 3600 + }, + { + "epoch": 0.5369377420315758, + "grad_norm": 9.167801856994629, + "learning_rate": 1.34197199880846e-05, + "loss": 0.6381, + "num_input_tokens_seen": 2094272, + "step": 3605 + }, + { + "epoch": 0.537682454572535, + "grad_norm": 9.846222877502441, + "learning_rate": 1.343833780160858e-05, + "loss": 0.5996, + "num_input_tokens_seen": 2097344, + "step": 3610 + }, + { + "epoch": 0.5384271671134941, + "grad_norm": 8.60977554321289, + "learning_rate": 1.345695561513256e-05, + "loss": 0.5358, + "num_input_tokens_seen": 2100192, + "step": 3615 + }, + { + "epoch": 0.5391718796544533, + "grad_norm": 14.883769989013672, + "learning_rate": 1.347557342865654e-05, + "loss": 0.6525, + "num_input_tokens_seen": 2103136, + "step": 3620 + }, + { + "epoch": 0.5399165921954125, + "grad_norm": 6.133451461791992, + "learning_rate": 1.3494191242180519e-05, + "loss": 0.9127, + "num_input_tokens_seen": 2106272, + "step": 3625 + }, + { + "epoch": 0.5406613047363718, + "grad_norm": 5.152815818786621, + "learning_rate": 1.3512809055704498e-05, + "loss": 0.453, + "num_input_tokens_seen": 2109440, + "step": 3630 + }, + { + "epoch": 0.541406017277331, + "grad_norm": 10.931153297424316, + "learning_rate": 1.3531426869228478e-05, + "loss": 0.7638, + "num_input_tokens_seen": 2112352, + "step": 3635 + }, + { + "epoch": 0.5421507298182902, + "grad_norm": 7.203112602233887, + "learning_rate": 1.3550044682752458e-05, + "loss": 0.5561, + "num_input_tokens_seen": 2115200, + "step": 3640 + }, + { + "epoch": 0.5428954423592494, + "grad_norm": 7.24666690826416, + "learning_rate": 1.3568662496276438e-05, + "loss": 0.7983, + "num_input_tokens_seen": 2118112, + "step": 3645 + }, + { + "epoch": 0.5436401549002086, + "grad_norm": 6.460424900054932, + "learning_rate": 1.3587280309800418e-05, + "loss": 0.617, + "num_input_tokens_seen": 2121024, + "step": 3650 + }, + { + "epoch": 0.5443848674411678, + "grad_norm": 10.6987886428833, + "learning_rate": 1.3605898123324398e-05, + "loss": 0.6809, + "num_input_tokens_seen": 2123680, + "step": 3655 + }, + { + "epoch": 0.5451295799821269, + "grad_norm": 5.294222354888916, + "learning_rate": 1.3624515936848378e-05, + "loss": 0.6942, + "num_input_tokens_seen": 2126784, + "step": 3660 + }, + { + "epoch": 0.5458742925230861, + "grad_norm": 6.038636207580566, + "learning_rate": 1.3643133750372358e-05, + "loss": 0.7233, + "num_input_tokens_seen": 2129504, + "step": 3665 + }, + { + "epoch": 0.5466190050640453, + "grad_norm": 6.656161308288574, + "learning_rate": 1.3661751563896336e-05, + "loss": 0.7416, + "num_input_tokens_seen": 2132416, + "step": 3670 + }, + { + "epoch": 0.5473637176050045, + "grad_norm": 9.168909072875977, + "learning_rate": 1.3680369377420316e-05, + "loss": 0.6461, + "num_input_tokens_seen": 2135264, + "step": 3675 + }, + { + "epoch": 0.5481084301459637, + "grad_norm": 3.4932634830474854, + "learning_rate": 1.3698987190944296e-05, + "loss": 0.8017, + "num_input_tokens_seen": 2138496, + "step": 3680 + }, + { + "epoch": 0.5488531426869229, + "grad_norm": 5.363718032836914, + "learning_rate": 1.3717605004468276e-05, + "loss": 0.6224, + "num_input_tokens_seen": 2141344, + "step": 3685 + }, + { + "epoch": 0.5495978552278821, + "grad_norm": 4.3254499435424805, + "learning_rate": 1.3736222817992256e-05, + "loss": 0.7116, + "num_input_tokens_seen": 2144256, + "step": 3690 + }, + { + "epoch": 0.5503425677688413, + "grad_norm": 6.20891809463501, + "learning_rate": 1.3754840631516236e-05, + "loss": 0.6396, + "num_input_tokens_seen": 2147040, + "step": 3695 + }, + { + "epoch": 0.5510872803098004, + "grad_norm": 3.3602938652038574, + "learning_rate": 1.3773458445040216e-05, + "loss": 0.6622, + "num_input_tokens_seen": 2150144, + "step": 3700 + }, + { + "epoch": 0.5518319928507596, + "grad_norm": 3.6178858280181885, + "learning_rate": 1.3792076258564196e-05, + "loss": 0.5652, + "num_input_tokens_seen": 2152864, + "step": 3705 + }, + { + "epoch": 0.5525767053917188, + "grad_norm": 4.333006858825684, + "learning_rate": 1.3810694072088174e-05, + "loss": 0.7579, + "num_input_tokens_seen": 2156032, + "step": 3710 + }, + { + "epoch": 0.553321417932678, + "grad_norm": 4.580514430999756, + "learning_rate": 1.3829311885612154e-05, + "loss": 0.5975, + "num_input_tokens_seen": 2158848, + "step": 3715 + }, + { + "epoch": 0.5540661304736372, + "grad_norm": 8.343399047851562, + "learning_rate": 1.3847929699136134e-05, + "loss": 0.6901, + "num_input_tokens_seen": 2161664, + "step": 3720 + }, + { + "epoch": 0.5548108430145964, + "grad_norm": 9.403820037841797, + "learning_rate": 1.3866547512660114e-05, + "loss": 0.6474, + "num_input_tokens_seen": 2164544, + "step": 3725 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 3.2401485443115234, + "learning_rate": 1.3885165326184094e-05, + "loss": 0.7883, + "num_input_tokens_seen": 2167168, + "step": 3730 + }, + { + "epoch": 0.5563002680965148, + "grad_norm": 3.7699620723724365, + "learning_rate": 1.3903783139708073e-05, + "loss": 0.6118, + "num_input_tokens_seen": 2169952, + "step": 3735 + }, + { + "epoch": 0.557044980637474, + "grad_norm": 5.539346218109131, + "learning_rate": 1.3922400953232053e-05, + "loss": 0.7325, + "num_input_tokens_seen": 2173376, + "step": 3740 + }, + { + "epoch": 0.5577896931784331, + "grad_norm": 6.356539249420166, + "learning_rate": 1.3941018766756033e-05, + "loss": 0.7374, + "num_input_tokens_seen": 2176032, + "step": 3745 + }, + { + "epoch": 0.5585344057193923, + "grad_norm": 6.166562080383301, + "learning_rate": 1.3959636580280013e-05, + "loss": 0.6254, + "num_input_tokens_seen": 2179136, + "step": 3750 + }, + { + "epoch": 0.5592791182603515, + "grad_norm": 4.421483993530273, + "learning_rate": 1.3978254393803992e-05, + "loss": 0.6224, + "num_input_tokens_seen": 2181920, + "step": 3755 + }, + { + "epoch": 0.5600238308013107, + "grad_norm": 4.687982559204102, + "learning_rate": 1.3996872207327971e-05, + "loss": 0.7511, + "num_input_tokens_seen": 2186080, + "step": 3760 + }, + { + "epoch": 0.5607685433422699, + "grad_norm": 5.640780925750732, + "learning_rate": 1.4015490020851951e-05, + "loss": 0.7493, + "num_input_tokens_seen": 2189120, + "step": 3765 + }, + { + "epoch": 0.5615132558832291, + "grad_norm": 4.920597553253174, + "learning_rate": 1.4034107834375931e-05, + "loss": 0.5897, + "num_input_tokens_seen": 2191744, + "step": 3770 + }, + { + "epoch": 0.5622579684241883, + "grad_norm": 5.826193332672119, + "learning_rate": 1.4052725647899911e-05, + "loss": 0.7681, + "num_input_tokens_seen": 2194816, + "step": 3775 + }, + { + "epoch": 0.5630026809651475, + "grad_norm": 10.978670120239258, + "learning_rate": 1.4071343461423891e-05, + "loss": 0.6599, + "num_input_tokens_seen": 2197312, + "step": 3780 + }, + { + "epoch": 0.5637473935061067, + "grad_norm": 3.9129092693328857, + "learning_rate": 1.4089961274947871e-05, + "loss": 0.5713, + "num_input_tokens_seen": 2200448, + "step": 3785 + }, + { + "epoch": 0.5644921060470658, + "grad_norm": 8.327595710754395, + "learning_rate": 1.4108579088471851e-05, + "loss": 0.8252, + "num_input_tokens_seen": 2203392, + "step": 3790 + }, + { + "epoch": 0.565236818588025, + "grad_norm": 3.483940601348877, + "learning_rate": 1.412719690199583e-05, + "loss": 0.5102, + "num_input_tokens_seen": 2206112, + "step": 3795 + }, + { + "epoch": 0.5659815311289842, + "grad_norm": 9.206193923950195, + "learning_rate": 1.4145814715519809e-05, + "loss": 0.7887, + "num_input_tokens_seen": 2209216, + "step": 3800 + }, + { + "epoch": 0.5667262436699434, + "grad_norm": 4.9205641746521, + "learning_rate": 1.4164432529043789e-05, + "loss": 0.744, + "num_input_tokens_seen": 2211840, + "step": 3805 + }, + { + "epoch": 0.5674709562109026, + "grad_norm": 4.24164342880249, + "learning_rate": 1.4183050342567769e-05, + "loss": 0.6134, + "num_input_tokens_seen": 2214848, + "step": 3810 + }, + { + "epoch": 0.5682156687518618, + "grad_norm": 5.837770938873291, + "learning_rate": 1.4201668156091749e-05, + "loss": 0.6613, + "num_input_tokens_seen": 2217696, + "step": 3815 + }, + { + "epoch": 0.568960381292821, + "grad_norm": 6.014416694641113, + "learning_rate": 1.4220285969615729e-05, + "loss": 0.7119, + "num_input_tokens_seen": 2220480, + "step": 3820 + }, + { + "epoch": 0.5697050938337802, + "grad_norm": 5.42681360244751, + "learning_rate": 1.4238903783139709e-05, + "loss": 0.6886, + "num_input_tokens_seen": 2223200, + "step": 3825 + }, + { + "epoch": 0.5704498063747393, + "grad_norm": 10.101863861083984, + "learning_rate": 1.4257521596663689e-05, + "loss": 0.7906, + "num_input_tokens_seen": 2226048, + "step": 3830 + }, + { + "epoch": 0.5711945189156985, + "grad_norm": 6.422499656677246, + "learning_rate": 1.4276139410187669e-05, + "loss": 0.8421, + "num_input_tokens_seen": 2229024, + "step": 3835 + }, + { + "epoch": 0.5719392314566577, + "grad_norm": 3.4628090858459473, + "learning_rate": 1.4294757223711647e-05, + "loss": 0.7467, + "num_input_tokens_seen": 2232032, + "step": 3840 + }, + { + "epoch": 0.5726839439976169, + "grad_norm": 4.480306625366211, + "learning_rate": 1.4313375037235627e-05, + "loss": 0.6569, + "num_input_tokens_seen": 2234752, + "step": 3845 + }, + { + "epoch": 0.5734286565385761, + "grad_norm": 5.347010135650635, + "learning_rate": 1.4331992850759607e-05, + "loss": 0.5667, + "num_input_tokens_seen": 2237664, + "step": 3850 + }, + { + "epoch": 0.5741733690795353, + "grad_norm": 5.716709613800049, + "learning_rate": 1.4350610664283587e-05, + "loss": 0.6606, + "num_input_tokens_seen": 2240480, + "step": 3855 + }, + { + "epoch": 0.5749180816204945, + "grad_norm": 11.639281272888184, + "learning_rate": 1.4369228477807567e-05, + "loss": 0.5931, + "num_input_tokens_seen": 2243264, + "step": 3860 + }, + { + "epoch": 0.5756627941614537, + "grad_norm": 13.988441467285156, + "learning_rate": 1.4387846291331546e-05, + "loss": 0.6804, + "num_input_tokens_seen": 2246304, + "step": 3865 + }, + { + "epoch": 0.5764075067024129, + "grad_norm": 5.296210765838623, + "learning_rate": 1.4406464104855526e-05, + "loss": 0.6043, + "num_input_tokens_seen": 2249216, + "step": 3870 + }, + { + "epoch": 0.577152219243372, + "grad_norm": 11.802421569824219, + "learning_rate": 1.4425081918379506e-05, + "loss": 0.7326, + "num_input_tokens_seen": 2251904, + "step": 3875 + }, + { + "epoch": 0.5778969317843312, + "grad_norm": 7.180326461791992, + "learning_rate": 1.4443699731903488e-05, + "loss": 0.7494, + "num_input_tokens_seen": 2254880, + "step": 3880 + }, + { + "epoch": 0.5786416443252904, + "grad_norm": 8.32532787322998, + "learning_rate": 1.4462317545427465e-05, + "loss": 0.6677, + "num_input_tokens_seen": 2257888, + "step": 3885 + }, + { + "epoch": 0.5793863568662496, + "grad_norm": 8.334999084472656, + "learning_rate": 1.4480935358951444e-05, + "loss": 0.6836, + "num_input_tokens_seen": 2260736, + "step": 3890 + }, + { + "epoch": 0.5801310694072088, + "grad_norm": 8.102336883544922, + "learning_rate": 1.4499553172475424e-05, + "loss": 0.8824, + "num_input_tokens_seen": 2263968, + "step": 3895 + }, + { + "epoch": 0.580875781948168, + "grad_norm": 4.109306812286377, + "learning_rate": 1.4518170985999404e-05, + "loss": 0.6216, + "num_input_tokens_seen": 2266560, + "step": 3900 + }, + { + "epoch": 0.5816204944891272, + "grad_norm": 5.9384307861328125, + "learning_rate": 1.4536788799523384e-05, + "loss": 0.6052, + "num_input_tokens_seen": 2269376, + "step": 3905 + }, + { + "epoch": 0.5823652070300864, + "grad_norm": 4.487612724304199, + "learning_rate": 1.4555406613047364e-05, + "loss": 0.6531, + "num_input_tokens_seen": 2272320, + "step": 3910 + }, + { + "epoch": 0.5831099195710456, + "grad_norm": 9.027956008911133, + "learning_rate": 1.4574024426571346e-05, + "loss": 0.6911, + "num_input_tokens_seen": 2275328, + "step": 3915 + }, + { + "epoch": 0.5838546321120047, + "grad_norm": 6.568814277648926, + "learning_rate": 1.4592642240095326e-05, + "loss": 0.7766, + "num_input_tokens_seen": 2278240, + "step": 3920 + }, + { + "epoch": 0.5845993446529639, + "grad_norm": 7.367393493652344, + "learning_rate": 1.4611260053619302e-05, + "loss": 0.7741, + "num_input_tokens_seen": 2281504, + "step": 3925 + }, + { + "epoch": 0.5853440571939231, + "grad_norm": 3.861494302749634, + "learning_rate": 1.4629877867143282e-05, + "loss": 0.6095, + "num_input_tokens_seen": 2284320, + "step": 3930 + }, + { + "epoch": 0.5860887697348823, + "grad_norm": 3.6018776893615723, + "learning_rate": 1.4648495680667262e-05, + "loss": 0.6819, + "num_input_tokens_seen": 2287232, + "step": 3935 + }, + { + "epoch": 0.5868334822758415, + "grad_norm": 9.159882545471191, + "learning_rate": 1.4667113494191242e-05, + "loss": 0.7502, + "num_input_tokens_seen": 2290176, + "step": 3940 + }, + { + "epoch": 0.5875781948168007, + "grad_norm": 8.791894912719727, + "learning_rate": 1.4685731307715222e-05, + "loss": 0.6566, + "num_input_tokens_seen": 2292832, + "step": 3945 + }, + { + "epoch": 0.5883229073577599, + "grad_norm": 5.334659576416016, + "learning_rate": 1.4704349121239202e-05, + "loss": 0.7481, + "num_input_tokens_seen": 2295712, + "step": 3950 + }, + { + "epoch": 0.5890676198987191, + "grad_norm": 9.19884967803955, + "learning_rate": 1.4722966934763183e-05, + "loss": 0.7638, + "num_input_tokens_seen": 2298432, + "step": 3955 + }, + { + "epoch": 0.5898123324396782, + "grad_norm": 5.635380744934082, + "learning_rate": 1.4741584748287163e-05, + "loss": 0.64, + "num_input_tokens_seen": 2301408, + "step": 3960 + }, + { + "epoch": 0.5905570449806374, + "grad_norm": 6.7485198974609375, + "learning_rate": 1.4760202561811143e-05, + "loss": 0.6958, + "num_input_tokens_seen": 2304512, + "step": 3965 + }, + { + "epoch": 0.5913017575215966, + "grad_norm": 10.565450668334961, + "learning_rate": 1.477882037533512e-05, + "loss": 0.6482, + "num_input_tokens_seen": 2307584, + "step": 3970 + }, + { + "epoch": 0.5920464700625558, + "grad_norm": 7.0765838623046875, + "learning_rate": 1.47974381888591e-05, + "loss": 0.8006, + "num_input_tokens_seen": 2310336, + "step": 3975 + }, + { + "epoch": 0.592791182603515, + "grad_norm": 8.128246307373047, + "learning_rate": 1.481605600238308e-05, + "loss": 0.6007, + "num_input_tokens_seen": 2313600, + "step": 3980 + }, + { + "epoch": 0.5935358951444742, + "grad_norm": 4.259963035583496, + "learning_rate": 1.483467381590706e-05, + "loss": 0.6414, + "num_input_tokens_seen": 2316448, + "step": 3985 + }, + { + "epoch": 0.5942806076854334, + "grad_norm": 9.388568878173828, + "learning_rate": 1.485329162943104e-05, + "loss": 0.8485, + "num_input_tokens_seen": 2319648, + "step": 3990 + }, + { + "epoch": 0.5950253202263927, + "grad_norm": 4.601771354675293, + "learning_rate": 1.4871909442955021e-05, + "loss": 0.6882, + "num_input_tokens_seen": 2322496, + "step": 3995 + }, + { + "epoch": 0.5957700327673519, + "grad_norm": 7.376682758331299, + "learning_rate": 1.4890527256479001e-05, + "loss": 0.7479, + "num_input_tokens_seen": 2325568, + "step": 4000 + }, + { + "epoch": 0.596514745308311, + "grad_norm": 4.532130718231201, + "learning_rate": 1.4909145070002981e-05, + "loss": 0.7032, + "num_input_tokens_seen": 2328512, + "step": 4005 + }, + { + "epoch": 0.5972594578492701, + "grad_norm": 4.345248222351074, + "learning_rate": 1.4927762883526961e-05, + "loss": 0.7906, + "num_input_tokens_seen": 2331584, + "step": 4010 + }, + { + "epoch": 0.5980041703902294, + "grad_norm": 6.271090984344482, + "learning_rate": 1.4946380697050938e-05, + "loss": 0.6899, + "num_input_tokens_seen": 2334528, + "step": 4015 + }, + { + "epoch": 0.5987488829311886, + "grad_norm": 7.649844169616699, + "learning_rate": 1.4964998510574917e-05, + "loss": 0.8384, + "num_input_tokens_seen": 2337600, + "step": 4020 + }, + { + "epoch": 0.5994935954721478, + "grad_norm": 8.63271427154541, + "learning_rate": 1.4983616324098897e-05, + "loss": 0.6599, + "num_input_tokens_seen": 2340480, + "step": 4025 + }, + { + "epoch": 0.600238308013107, + "grad_norm": 5.973481178283691, + "learning_rate": 1.5002234137622877e-05, + "loss": 0.7664, + "num_input_tokens_seen": 2343232, + "step": 4030 + }, + { + "epoch": 0.6009830205540662, + "grad_norm": 4.548530101776123, + "learning_rate": 1.5020851951146859e-05, + "loss": 0.6779, + "num_input_tokens_seen": 2345920, + "step": 4035 + }, + { + "epoch": 0.6017277330950254, + "grad_norm": 5.790647506713867, + "learning_rate": 1.5039469764670839e-05, + "loss": 0.7613, + "num_input_tokens_seen": 2349248, + "step": 4040 + }, + { + "epoch": 0.6024724456359845, + "grad_norm": 3.8885269165039062, + "learning_rate": 1.5058087578194819e-05, + "loss": 0.5671, + "num_input_tokens_seen": 2352224, + "step": 4045 + }, + { + "epoch": 0.6032171581769437, + "grad_norm": 4.004682540893555, + "learning_rate": 1.5076705391718799e-05, + "loss": 0.518, + "num_input_tokens_seen": 2355040, + "step": 4050 + }, + { + "epoch": 0.6039618707179029, + "grad_norm": 3.8949434757232666, + "learning_rate": 1.5095323205242775e-05, + "loss": 0.7075, + "num_input_tokens_seen": 2357728, + "step": 4055 + }, + { + "epoch": 0.6047065832588621, + "grad_norm": 3.744589328765869, + "learning_rate": 1.5113941018766755e-05, + "loss": 0.691, + "num_input_tokens_seen": 2360672, + "step": 4060 + }, + { + "epoch": 0.6054512957998213, + "grad_norm": 6.858578681945801, + "learning_rate": 1.5132558832290735e-05, + "loss": 0.6898, + "num_input_tokens_seen": 2363424, + "step": 4065 + }, + { + "epoch": 0.6061960083407805, + "grad_norm": 5.901797771453857, + "learning_rate": 1.5151176645814715e-05, + "loss": 0.6257, + "num_input_tokens_seen": 2366368, + "step": 4070 + }, + { + "epoch": 0.6069407208817397, + "grad_norm": 5.691039562225342, + "learning_rate": 1.5169794459338697e-05, + "loss": 0.7648, + "num_input_tokens_seen": 2369312, + "step": 4075 + }, + { + "epoch": 0.6076854334226989, + "grad_norm": 7.155378341674805, + "learning_rate": 1.5188412272862677e-05, + "loss": 0.5453, + "num_input_tokens_seen": 2372352, + "step": 4080 + }, + { + "epoch": 0.6084301459636581, + "grad_norm": 5.804598331451416, + "learning_rate": 1.5207030086386656e-05, + "loss": 0.5228, + "num_input_tokens_seen": 2375264, + "step": 4085 + }, + { + "epoch": 0.6091748585046172, + "grad_norm": 8.494084358215332, + "learning_rate": 1.5225647899910636e-05, + "loss": 0.6214, + "num_input_tokens_seen": 2377984, + "step": 4090 + }, + { + "epoch": 0.6099195710455764, + "grad_norm": 7.307822227478027, + "learning_rate": 1.5244265713434616e-05, + "loss": 0.6884, + "num_input_tokens_seen": 2381120, + "step": 4095 + }, + { + "epoch": 0.6106642835865356, + "grad_norm": 18.41765022277832, + "learning_rate": 1.5262883526958593e-05, + "loss": 0.8576, + "num_input_tokens_seen": 2383744, + "step": 4100 + }, + { + "epoch": 0.6114089961274948, + "grad_norm": 5.322934627532959, + "learning_rate": 1.5281501340482574e-05, + "loss": 0.5319, + "num_input_tokens_seen": 2386432, + "step": 4105 + }, + { + "epoch": 0.612153708668454, + "grad_norm": 7.289163589477539, + "learning_rate": 1.5300119154006553e-05, + "loss": 0.7601, + "num_input_tokens_seen": 2389312, + "step": 4110 + }, + { + "epoch": 0.6128984212094132, + "grad_norm": 4.677017688751221, + "learning_rate": 1.5318736967530534e-05, + "loss": 0.6554, + "num_input_tokens_seen": 2392608, + "step": 4115 + }, + { + "epoch": 0.6136431337503724, + "grad_norm": 11.213264465332031, + "learning_rate": 1.5337354781054513e-05, + "loss": 0.5395, + "num_input_tokens_seen": 2395296, + "step": 4120 + }, + { + "epoch": 0.6143878462913316, + "grad_norm": 7.245213508605957, + "learning_rate": 1.5355972594578494e-05, + "loss": 0.5465, + "num_input_tokens_seen": 2398112, + "step": 4125 + }, + { + "epoch": 0.6151325588322908, + "grad_norm": 4.8818464279174805, + "learning_rate": 1.5374590408102472e-05, + "loss": 0.6946, + "num_input_tokens_seen": 2400704, + "step": 4130 + }, + { + "epoch": 0.6158772713732499, + "grad_norm": 6.526327133178711, + "learning_rate": 1.5393208221626454e-05, + "loss": 0.5321, + "num_input_tokens_seen": 2403392, + "step": 4135 + }, + { + "epoch": 0.6166219839142091, + "grad_norm": 5.687621593475342, + "learning_rate": 1.5411826035150436e-05, + "loss": 0.524, + "num_input_tokens_seen": 2406432, + "step": 4140 + }, + { + "epoch": 0.6173666964551683, + "grad_norm": 8.32625961303711, + "learning_rate": 1.543044384867441e-05, + "loss": 0.7399, + "num_input_tokens_seen": 2409376, + "step": 4145 + }, + { + "epoch": 0.6181114089961275, + "grad_norm": 14.914022445678711, + "learning_rate": 1.5449061662198392e-05, + "loss": 0.8324, + "num_input_tokens_seen": 2412864, + "step": 4150 + }, + { + "epoch": 0.6188561215370867, + "grad_norm": 10.604575157165527, + "learning_rate": 1.546767947572237e-05, + "loss": 0.7606, + "num_input_tokens_seen": 2415744, + "step": 4155 + }, + { + "epoch": 0.6196008340780459, + "grad_norm": 9.917621612548828, + "learning_rate": 1.5486297289246352e-05, + "loss": 0.7688, + "num_input_tokens_seen": 2418560, + "step": 4160 + }, + { + "epoch": 0.6203455466190051, + "grad_norm": 6.567759037017822, + "learning_rate": 1.550491510277033e-05, + "loss": 0.6139, + "num_input_tokens_seen": 2421440, + "step": 4165 + }, + { + "epoch": 0.6210902591599643, + "grad_norm": 5.966513156890869, + "learning_rate": 1.5523532916294312e-05, + "loss": 0.7516, + "num_input_tokens_seen": 2424640, + "step": 4170 + }, + { + "epoch": 0.6218349717009234, + "grad_norm": 6.697953224182129, + "learning_rate": 1.554215072981829e-05, + "loss": 0.7258, + "num_input_tokens_seen": 2427488, + "step": 4175 + }, + { + "epoch": 0.6225796842418826, + "grad_norm": 9.668734550476074, + "learning_rate": 1.5560768543342272e-05, + "loss": 0.6075, + "num_input_tokens_seen": 2430208, + "step": 4180 + }, + { + "epoch": 0.6233243967828418, + "grad_norm": 8.021125793457031, + "learning_rate": 1.557938635686625e-05, + "loss": 0.6871, + "num_input_tokens_seen": 2432864, + "step": 4185 + }, + { + "epoch": 0.624069109323801, + "grad_norm": 7.878726005554199, + "learning_rate": 1.5598004170390228e-05, + "loss": 0.6759, + "num_input_tokens_seen": 2435840, + "step": 4190 + }, + { + "epoch": 0.6248138218647602, + "grad_norm": 15.81725025177002, + "learning_rate": 1.561662198391421e-05, + "loss": 0.6742, + "num_input_tokens_seen": 2438560, + "step": 4195 + }, + { + "epoch": 0.6255585344057194, + "grad_norm": 5.357878684997559, + "learning_rate": 1.5635239797438188e-05, + "loss": 0.6982, + "num_input_tokens_seen": 2441376, + "step": 4200 + }, + { + "epoch": 0.6263032469466786, + "grad_norm": 6.64098596572876, + "learning_rate": 1.565385761096217e-05, + "loss": 0.7313, + "num_input_tokens_seen": 2444288, + "step": 4205 + }, + { + "epoch": 0.6270479594876378, + "grad_norm": 8.60537338256836, + "learning_rate": 1.5672475424486148e-05, + "loss": 0.6454, + "num_input_tokens_seen": 2447168, + "step": 4210 + }, + { + "epoch": 0.627792672028597, + "grad_norm": 2.8321824073791504, + "learning_rate": 1.569109323801013e-05, + "loss": 0.617, + "num_input_tokens_seen": 2450240, + "step": 4215 + }, + { + "epoch": 0.6285373845695561, + "grad_norm": 5.931424140930176, + "learning_rate": 1.570971105153411e-05, + "loss": 0.7659, + "num_input_tokens_seen": 2453024, + "step": 4220 + }, + { + "epoch": 0.6292820971105153, + "grad_norm": 4.853425979614258, + "learning_rate": 1.572832886505809e-05, + "loss": 0.6774, + "num_input_tokens_seen": 2455872, + "step": 4225 + }, + { + "epoch": 0.6300268096514745, + "grad_norm": 4.697386741638184, + "learning_rate": 1.5746946678582068e-05, + "loss": 0.6786, + "num_input_tokens_seen": 2458624, + "step": 4230 + }, + { + "epoch": 0.6307715221924337, + "grad_norm": 6.067267894744873, + "learning_rate": 1.5765564492106046e-05, + "loss": 0.7309, + "num_input_tokens_seen": 2461312, + "step": 4235 + }, + { + "epoch": 0.6315162347333929, + "grad_norm": 5.88886022567749, + "learning_rate": 1.5784182305630027e-05, + "loss": 0.8002, + "num_input_tokens_seen": 2464352, + "step": 4240 + }, + { + "epoch": 0.6322609472743521, + "grad_norm": 4.343058109283447, + "learning_rate": 1.5802800119154006e-05, + "loss": 0.7207, + "num_input_tokens_seen": 2467040, + "step": 4245 + }, + { + "epoch": 0.6330056598153113, + "grad_norm": 14.663269996643066, + "learning_rate": 1.5821417932677987e-05, + "loss": 0.6339, + "num_input_tokens_seen": 2470112, + "step": 4250 + }, + { + "epoch": 0.6337503723562705, + "grad_norm": 4.468962669372559, + "learning_rate": 1.5840035746201966e-05, + "loss": 0.7237, + "num_input_tokens_seen": 2472768, + "step": 4255 + }, + { + "epoch": 0.6344950848972297, + "grad_norm": 3.7524573802948, + "learning_rate": 1.5858653559725947e-05, + "loss": 0.6379, + "num_input_tokens_seen": 2475936, + "step": 4260 + }, + { + "epoch": 0.6352397974381888, + "grad_norm": 5.4355340003967285, + "learning_rate": 1.587727137324993e-05, + "loss": 0.6915, + "num_input_tokens_seen": 2478688, + "step": 4265 + }, + { + "epoch": 0.635984509979148, + "grad_norm": 4.425848484039307, + "learning_rate": 1.5895889186773907e-05, + "loss": 0.7229, + "num_input_tokens_seen": 2481728, + "step": 4270 + }, + { + "epoch": 0.6367292225201072, + "grad_norm": 8.394963264465332, + "learning_rate": 1.5914507000297885e-05, + "loss": 0.6041, + "num_input_tokens_seen": 2484416, + "step": 4275 + }, + { + "epoch": 0.6374739350610664, + "grad_norm": 5.269087791442871, + "learning_rate": 1.5933124813821863e-05, + "loss": 0.6263, + "num_input_tokens_seen": 2487296, + "step": 4280 + }, + { + "epoch": 0.6382186476020256, + "grad_norm": 11.436367988586426, + "learning_rate": 1.5951742627345845e-05, + "loss": 0.776, + "num_input_tokens_seen": 2490336, + "step": 4285 + }, + { + "epoch": 0.6389633601429848, + "grad_norm": 6.556698322296143, + "learning_rate": 1.5970360440869823e-05, + "loss": 0.7149, + "num_input_tokens_seen": 2493600, + "step": 4290 + }, + { + "epoch": 0.639708072683944, + "grad_norm": 5.045993804931641, + "learning_rate": 1.5988978254393805e-05, + "loss": 0.5709, + "num_input_tokens_seen": 2496352, + "step": 4295 + }, + { + "epoch": 0.6404527852249032, + "grad_norm": 7.095966339111328, + "learning_rate": 1.6007596067917787e-05, + "loss": 0.7099, + "num_input_tokens_seen": 2499296, + "step": 4300 + }, + { + "epoch": 0.6411974977658623, + "grad_norm": 6.410251617431641, + "learning_rate": 1.6026213881441765e-05, + "loss": 0.5969, + "num_input_tokens_seen": 2502080, + "step": 4305 + }, + { + "epoch": 0.6419422103068215, + "grad_norm": 6.237090587615967, + "learning_rate": 1.6044831694965746e-05, + "loss": 0.6048, + "num_input_tokens_seen": 2504832, + "step": 4310 + }, + { + "epoch": 0.6426869228477807, + "grad_norm": 3.245898723602295, + "learning_rate": 1.606344950848972e-05, + "loss": 0.5826, + "num_input_tokens_seen": 2507776, + "step": 4315 + }, + { + "epoch": 0.6434316353887399, + "grad_norm": 3.3262081146240234, + "learning_rate": 1.6082067322013703e-05, + "loss": 0.6808, + "num_input_tokens_seen": 2510720, + "step": 4320 + }, + { + "epoch": 0.6441763479296991, + "grad_norm": 5.021090507507324, + "learning_rate": 1.610068513553768e-05, + "loss": 0.5799, + "num_input_tokens_seen": 2513600, + "step": 4325 + }, + { + "epoch": 0.6449210604706583, + "grad_norm": 4.894305229187012, + "learning_rate": 1.6119302949061663e-05, + "loss": 0.6946, + "num_input_tokens_seen": 2516576, + "step": 4330 + }, + { + "epoch": 0.6456657730116175, + "grad_norm": 5.151403903961182, + "learning_rate": 1.6137920762585644e-05, + "loss": 0.7297, + "num_input_tokens_seen": 2519296, + "step": 4335 + }, + { + "epoch": 0.6464104855525767, + "grad_norm": 10.951947212219238, + "learning_rate": 1.6156538576109623e-05, + "loss": 0.7145, + "num_input_tokens_seen": 2522112, + "step": 4340 + }, + { + "epoch": 0.6471551980935359, + "grad_norm": 7.8822550773620605, + "learning_rate": 1.6175156389633604e-05, + "loss": 0.7483, + "num_input_tokens_seen": 2525280, + "step": 4345 + }, + { + "epoch": 0.647899910634495, + "grad_norm": 4.65081262588501, + "learning_rate": 1.6193774203157582e-05, + "loss": 0.7761, + "num_input_tokens_seen": 2528288, + "step": 4350 + }, + { + "epoch": 0.6486446231754542, + "grad_norm": 9.446943283081055, + "learning_rate": 1.6212392016681564e-05, + "loss": 0.8533, + "num_input_tokens_seen": 2531040, + "step": 4355 + }, + { + "epoch": 0.6493893357164134, + "grad_norm": 2.2164571285247803, + "learning_rate": 1.623100983020554e-05, + "loss": 0.7357, + "num_input_tokens_seen": 2533952, + "step": 4360 + }, + { + "epoch": 0.6501340482573726, + "grad_norm": 3.089479446411133, + "learning_rate": 1.624962764372952e-05, + "loss": 0.5815, + "num_input_tokens_seen": 2536864, + "step": 4365 + }, + { + "epoch": 0.6508787607983318, + "grad_norm": 3.187958002090454, + "learning_rate": 1.62682454572535e-05, + "loss": 0.5752, + "num_input_tokens_seen": 2539552, + "step": 4370 + }, + { + "epoch": 0.651623473339291, + "grad_norm": 4.719966888427734, + "learning_rate": 1.628686327077748e-05, + "loss": 0.6837, + "num_input_tokens_seen": 2542208, + "step": 4375 + }, + { + "epoch": 0.6523681858802503, + "grad_norm": 3.7752015590667725, + "learning_rate": 1.6305481084301462e-05, + "loss": 0.7524, + "num_input_tokens_seen": 2545152, + "step": 4380 + }, + { + "epoch": 0.6531128984212095, + "grad_norm": 6.594426155090332, + "learning_rate": 1.632409889782544e-05, + "loss": 0.7809, + "num_input_tokens_seen": 2548032, + "step": 4385 + }, + { + "epoch": 0.6538576109621687, + "grad_norm": 4.809311866760254, + "learning_rate": 1.6342716711349422e-05, + "loss": 0.6854, + "num_input_tokens_seen": 2550976, + "step": 4390 + }, + { + "epoch": 0.6546023235031277, + "grad_norm": 5.153758525848389, + "learning_rate": 1.63613345248734e-05, + "loss": 0.6981, + "num_input_tokens_seen": 2554336, + "step": 4395 + }, + { + "epoch": 0.655347036044087, + "grad_norm": 10.075767517089844, + "learning_rate": 1.637995233839738e-05, + "loss": 0.7461, + "num_input_tokens_seen": 2557504, + "step": 4400 + }, + { + "epoch": 0.6560917485850462, + "grad_norm": 7.942836761474609, + "learning_rate": 1.6398570151921357e-05, + "loss": 0.6743, + "num_input_tokens_seen": 2560288, + "step": 4405 + }, + { + "epoch": 0.6568364611260054, + "grad_norm": 5.927549362182617, + "learning_rate": 1.6417187965445338e-05, + "loss": 0.6685, + "num_input_tokens_seen": 2563136, + "step": 4410 + }, + { + "epoch": 0.6575811736669646, + "grad_norm": 5.778417587280273, + "learning_rate": 1.643580577896932e-05, + "loss": 0.5547, + "num_input_tokens_seen": 2566144, + "step": 4415 + }, + { + "epoch": 0.6583258862079238, + "grad_norm": 4.513813018798828, + "learning_rate": 1.6454423592493298e-05, + "loss": 0.5959, + "num_input_tokens_seen": 2568800, + "step": 4420 + }, + { + "epoch": 0.659070598748883, + "grad_norm": 7.3920063972473145, + "learning_rate": 1.647304140601728e-05, + "loss": 0.7216, + "num_input_tokens_seen": 2571840, + "step": 4425 + }, + { + "epoch": 0.6598153112898422, + "grad_norm": 6.2078423500061035, + "learning_rate": 1.6491659219541258e-05, + "loss": 0.6654, + "num_input_tokens_seen": 2574688, + "step": 4430 + }, + { + "epoch": 0.6605600238308013, + "grad_norm": 5.138099670410156, + "learning_rate": 1.651027703306524e-05, + "loss": 0.6754, + "num_input_tokens_seen": 2577376, + "step": 4435 + }, + { + "epoch": 0.6613047363717605, + "grad_norm": 9.616948127746582, + "learning_rate": 1.6528894846589218e-05, + "loss": 0.7805, + "num_input_tokens_seen": 2580448, + "step": 4440 + }, + { + "epoch": 0.6620494489127197, + "grad_norm": 5.410340785980225, + "learning_rate": 1.6547512660113196e-05, + "loss": 0.6887, + "num_input_tokens_seen": 2583168, + "step": 4445 + }, + { + "epoch": 0.6627941614536789, + "grad_norm": 6.241281032562256, + "learning_rate": 1.6566130473637174e-05, + "loss": 0.65, + "num_input_tokens_seen": 2585984, + "step": 4450 + }, + { + "epoch": 0.6635388739946381, + "grad_norm": 3.2669191360473633, + "learning_rate": 1.6584748287161156e-05, + "loss": 0.9295, + "num_input_tokens_seen": 2588928, + "step": 4455 + }, + { + "epoch": 0.6642835865355973, + "grad_norm": 5.421036720275879, + "learning_rate": 1.6603366100685137e-05, + "loss": 0.6916, + "num_input_tokens_seen": 2591584, + "step": 4460 + }, + { + "epoch": 0.6650282990765565, + "grad_norm": 7.053751468658447, + "learning_rate": 1.6621983914209116e-05, + "loss": 0.7145, + "num_input_tokens_seen": 2594496, + "step": 4465 + }, + { + "epoch": 0.6657730116175157, + "grad_norm": 5.37930965423584, + "learning_rate": 1.6640601727733097e-05, + "loss": 0.7004, + "num_input_tokens_seen": 2597408, + "step": 4470 + }, + { + "epoch": 0.6665177241584749, + "grad_norm": 7.251705169677734, + "learning_rate": 1.6659219541257075e-05, + "loss": 0.546, + "num_input_tokens_seen": 2600320, + "step": 4475 + }, + { + "epoch": 0.667262436699434, + "grad_norm": 3.6238160133361816, + "learning_rate": 1.6677837354781057e-05, + "loss": 0.5762, + "num_input_tokens_seen": 2603072, + "step": 4480 + }, + { + "epoch": 0.6680071492403932, + "grad_norm": 3.5749146938323975, + "learning_rate": 1.6696455168305035e-05, + "loss": 0.9108, + "num_input_tokens_seen": 2605728, + "step": 4485 + }, + { + "epoch": 0.6687518617813524, + "grad_norm": 7.863435745239258, + "learning_rate": 1.6715072981829014e-05, + "loss": 0.5766, + "num_input_tokens_seen": 2608320, + "step": 4490 + }, + { + "epoch": 0.6694965743223116, + "grad_norm": 10.37212085723877, + "learning_rate": 1.6733690795352995e-05, + "loss": 0.6795, + "num_input_tokens_seen": 2611168, + "step": 4495 + }, + { + "epoch": 0.6702412868632708, + "grad_norm": 7.4695234298706055, + "learning_rate": 1.6752308608876973e-05, + "loss": 0.6201, + "num_input_tokens_seen": 2614016, + "step": 4500 + }, + { + "epoch": 0.67098599940423, + "grad_norm": 6.426466464996338, + "learning_rate": 1.6770926422400955e-05, + "loss": 0.6952, + "num_input_tokens_seen": 2616864, + "step": 4505 + }, + { + "epoch": 0.6717307119451892, + "grad_norm": 5.279740333557129, + "learning_rate": 1.6789544235924933e-05, + "loss": 0.8006, + "num_input_tokens_seen": 2619808, + "step": 4510 + }, + { + "epoch": 0.6724754244861484, + "grad_norm": 6.641730785369873, + "learning_rate": 1.6808162049448915e-05, + "loss": 0.6527, + "num_input_tokens_seen": 2622464, + "step": 4515 + }, + { + "epoch": 0.6732201370271076, + "grad_norm": 4.667259216308594, + "learning_rate": 1.6826779862972893e-05, + "loss": 0.6563, + "num_input_tokens_seen": 2625216, + "step": 4520 + }, + { + "epoch": 0.6739648495680667, + "grad_norm": 6.256143093109131, + "learning_rate": 1.6845397676496875e-05, + "loss": 0.6822, + "num_input_tokens_seen": 2628096, + "step": 4525 + }, + { + "epoch": 0.6747095621090259, + "grad_norm": 5.165081024169922, + "learning_rate": 1.6864015490020853e-05, + "loss": 0.7743, + "num_input_tokens_seen": 2631072, + "step": 4530 + }, + { + "epoch": 0.6754542746499851, + "grad_norm": 3.8855783939361572, + "learning_rate": 1.688263330354483e-05, + "loss": 0.6206, + "num_input_tokens_seen": 2634016, + "step": 4535 + }, + { + "epoch": 0.6761989871909443, + "grad_norm": 4.585870265960693, + "learning_rate": 1.6901251117068813e-05, + "loss": 0.6163, + "num_input_tokens_seen": 2637376, + "step": 4540 + }, + { + "epoch": 0.6769436997319035, + "grad_norm": 6.8759260177612305, + "learning_rate": 1.691986893059279e-05, + "loss": 0.6561, + "num_input_tokens_seen": 2640192, + "step": 4545 + }, + { + "epoch": 0.6776884122728627, + "grad_norm": 4.8580756187438965, + "learning_rate": 1.6938486744116773e-05, + "loss": 0.7016, + "num_input_tokens_seen": 2643040, + "step": 4550 + }, + { + "epoch": 0.6784331248138219, + "grad_norm": 6.060779094696045, + "learning_rate": 1.695710455764075e-05, + "loss": 0.5584, + "num_input_tokens_seen": 2645856, + "step": 4555 + }, + { + "epoch": 0.6791778373547811, + "grad_norm": 13.365805625915527, + "learning_rate": 1.6975722371164733e-05, + "loss": 0.8073, + "num_input_tokens_seen": 2648928, + "step": 4560 + }, + { + "epoch": 0.6799225498957402, + "grad_norm": 8.253595352172852, + "learning_rate": 1.699434018468871e-05, + "loss": 0.6493, + "num_input_tokens_seen": 2651648, + "step": 4565 + }, + { + "epoch": 0.6806672624366994, + "grad_norm": 4.679676532745361, + "learning_rate": 1.7012957998212692e-05, + "loss": 0.6037, + "num_input_tokens_seen": 2654528, + "step": 4570 + }, + { + "epoch": 0.6814119749776586, + "grad_norm": 7.76223087310791, + "learning_rate": 1.703157581173667e-05, + "loss": 0.6446, + "num_input_tokens_seen": 2657376, + "step": 4575 + }, + { + "epoch": 0.6821566875186178, + "grad_norm": 12.932477951049805, + "learning_rate": 1.705019362526065e-05, + "loss": 0.7246, + "num_input_tokens_seen": 2660160, + "step": 4580 + }, + { + "epoch": 0.682901400059577, + "grad_norm": 4.016125202178955, + "learning_rate": 1.706881143878463e-05, + "loss": 0.623, + "num_input_tokens_seen": 2663520, + "step": 4585 + }, + { + "epoch": 0.6836461126005362, + "grad_norm": 8.566823959350586, + "learning_rate": 1.708742925230861e-05, + "loss": 0.635, + "num_input_tokens_seen": 2666272, + "step": 4590 + }, + { + "epoch": 0.6843908251414954, + "grad_norm": 6.359520435333252, + "learning_rate": 1.710604706583259e-05, + "loss": 0.6925, + "num_input_tokens_seen": 2669248, + "step": 4595 + }, + { + "epoch": 0.6851355376824546, + "grad_norm": 16.22835922241211, + "learning_rate": 1.712466487935657e-05, + "loss": 0.4973, + "num_input_tokens_seen": 2672064, + "step": 4600 + }, + { + "epoch": 0.6858802502234138, + "grad_norm": 8.306657791137695, + "learning_rate": 1.714328269288055e-05, + "loss": 0.7138, + "num_input_tokens_seen": 2675008, + "step": 4605 + }, + { + "epoch": 0.6866249627643729, + "grad_norm": 7.691039562225342, + "learning_rate": 1.716190050640453e-05, + "loss": 0.6883, + "num_input_tokens_seen": 2678080, + "step": 4610 + }, + { + "epoch": 0.6873696753053321, + "grad_norm": 5.446042060852051, + "learning_rate": 1.718051831992851e-05, + "loss": 0.7876, + "num_input_tokens_seen": 2680960, + "step": 4615 + }, + { + "epoch": 0.6881143878462913, + "grad_norm": 6.587165832519531, + "learning_rate": 1.7199136133452488e-05, + "loss": 0.679, + "num_input_tokens_seen": 2683904, + "step": 4620 + }, + { + "epoch": 0.6888591003872505, + "grad_norm": 10.104375839233398, + "learning_rate": 1.7217753946976467e-05, + "loss": 0.6107, + "num_input_tokens_seen": 2686880, + "step": 4625 + }, + { + "epoch": 0.6896038129282097, + "grad_norm": 7.982815265655518, + "learning_rate": 1.7236371760500448e-05, + "loss": 0.65, + "num_input_tokens_seen": 2689536, + "step": 4630 + }, + { + "epoch": 0.6903485254691689, + "grad_norm": 11.700895309448242, + "learning_rate": 1.7254989574024426e-05, + "loss": 0.6951, + "num_input_tokens_seen": 2692384, + "step": 4635 + }, + { + "epoch": 0.6910932380101281, + "grad_norm": 16.953807830810547, + "learning_rate": 1.7273607387548408e-05, + "loss": 0.8068, + "num_input_tokens_seen": 2695424, + "step": 4640 + }, + { + "epoch": 0.6918379505510873, + "grad_norm": 5.64668083190918, + "learning_rate": 1.7292225201072386e-05, + "loss": 0.747, + "num_input_tokens_seen": 2698464, + "step": 4645 + }, + { + "epoch": 0.6925826630920465, + "grad_norm": 8.799254417419434, + "learning_rate": 1.7310843014596368e-05, + "loss": 0.5177, + "num_input_tokens_seen": 2701184, + "step": 4650 + }, + { + "epoch": 0.6933273756330056, + "grad_norm": 14.11662483215332, + "learning_rate": 1.7329460828120346e-05, + "loss": 0.6433, + "num_input_tokens_seen": 2704160, + "step": 4655 + }, + { + "epoch": 0.6940720881739648, + "grad_norm": 6.2813262939453125, + "learning_rate": 1.7348078641644328e-05, + "loss": 0.575, + "num_input_tokens_seen": 2706944, + "step": 4660 + }, + { + "epoch": 0.694816800714924, + "grad_norm": 5.663827896118164, + "learning_rate": 1.7366696455168306e-05, + "loss": 0.7345, + "num_input_tokens_seen": 2709824, + "step": 4665 + }, + { + "epoch": 0.6955615132558832, + "grad_norm": 15.062468528747559, + "learning_rate": 1.7385314268692284e-05, + "loss": 0.8231, + "num_input_tokens_seen": 2712448, + "step": 4670 + }, + { + "epoch": 0.6963062257968424, + "grad_norm": 5.409207344055176, + "learning_rate": 1.7403932082216266e-05, + "loss": 0.6853, + "num_input_tokens_seen": 2715456, + "step": 4675 + }, + { + "epoch": 0.6970509383378016, + "grad_norm": 5.887481212615967, + "learning_rate": 1.7422549895740244e-05, + "loss": 0.6136, + "num_input_tokens_seen": 2718080, + "step": 4680 + }, + { + "epoch": 0.6977956508787608, + "grad_norm": 3.99128794670105, + "learning_rate": 1.7441167709264226e-05, + "loss": 0.6125, + "num_input_tokens_seen": 2720928, + "step": 4685 + }, + { + "epoch": 0.69854036341972, + "grad_norm": 11.07091999053955, + "learning_rate": 1.7459785522788204e-05, + "loss": 0.6998, + "num_input_tokens_seen": 2724096, + "step": 4690 + }, + { + "epoch": 0.6992850759606791, + "grad_norm": 6.873650550842285, + "learning_rate": 1.7478403336312185e-05, + "loss": 0.6168, + "num_input_tokens_seen": 2727264, + "step": 4695 + }, + { + "epoch": 0.7000297885016383, + "grad_norm": 5.5583815574646, + "learning_rate": 1.7497021149836164e-05, + "loss": 0.5299, + "num_input_tokens_seen": 2729760, + "step": 4700 + }, + { + "epoch": 0.7007745010425975, + "grad_norm": 6.663151264190674, + "learning_rate": 1.7515638963360142e-05, + "loss": 0.7093, + "num_input_tokens_seen": 2732416, + "step": 4705 + }, + { + "epoch": 0.7015192135835567, + "grad_norm": 14.916585922241211, + "learning_rate": 1.7534256776884124e-05, + "loss": 0.7076, + "num_input_tokens_seen": 2735072, + "step": 4710 + }, + { + "epoch": 0.7022639261245159, + "grad_norm": 10.405411720275879, + "learning_rate": 1.7552874590408102e-05, + "loss": 0.8008, + "num_input_tokens_seen": 2737792, + "step": 4715 + }, + { + "epoch": 0.7030086386654751, + "grad_norm": 7.693538665771484, + "learning_rate": 1.7571492403932083e-05, + "loss": 0.9158, + "num_input_tokens_seen": 2740704, + "step": 4720 + }, + { + "epoch": 0.7037533512064343, + "grad_norm": 7.042656421661377, + "learning_rate": 1.759011021745606e-05, + "loss": 0.6513, + "num_input_tokens_seen": 2743520, + "step": 4725 + }, + { + "epoch": 0.7044980637473935, + "grad_norm": 7.306209087371826, + "learning_rate": 1.7608728030980043e-05, + "loss": 0.5285, + "num_input_tokens_seen": 2746368, + "step": 4730 + }, + { + "epoch": 0.7052427762883527, + "grad_norm": 4.411949157714844, + "learning_rate": 1.762734584450402e-05, + "loss": 0.607, + "num_input_tokens_seen": 2749184, + "step": 4735 + }, + { + "epoch": 0.7059874888293118, + "grad_norm": 8.757414817810059, + "learning_rate": 1.7645963658028003e-05, + "loss": 0.7292, + "num_input_tokens_seen": 2752000, + "step": 4740 + }, + { + "epoch": 0.706732201370271, + "grad_norm": 5.159996032714844, + "learning_rate": 1.7664581471551985e-05, + "loss": 0.7292, + "num_input_tokens_seen": 2754688, + "step": 4745 + }, + { + "epoch": 0.7074769139112302, + "grad_norm": 11.460047721862793, + "learning_rate": 1.768319928507596e-05, + "loss": 0.8051, + "num_input_tokens_seen": 2757472, + "step": 4750 + }, + { + "epoch": 0.7082216264521894, + "grad_norm": 3.09690260887146, + "learning_rate": 1.770181709859994e-05, + "loss": 0.5923, + "num_input_tokens_seen": 2760288, + "step": 4755 + }, + { + "epoch": 0.7089663389931486, + "grad_norm": 6.207311630249023, + "learning_rate": 1.772043491212392e-05, + "loss": 0.6924, + "num_input_tokens_seen": 2763040, + "step": 4760 + }, + { + "epoch": 0.7097110515341079, + "grad_norm": 9.953872680664062, + "learning_rate": 1.77390527256479e-05, + "loss": 0.8044, + "num_input_tokens_seen": 2766176, + "step": 4765 + }, + { + "epoch": 0.710455764075067, + "grad_norm": 6.2388200759887695, + "learning_rate": 1.775767053917188e-05, + "loss": 0.7327, + "num_input_tokens_seen": 2769408, + "step": 4770 + }, + { + "epoch": 0.7112004766160263, + "grad_norm": 3.4224841594696045, + "learning_rate": 1.777628835269586e-05, + "loss": 0.7246, + "num_input_tokens_seen": 2772256, + "step": 4775 + }, + { + "epoch": 0.7119451891569855, + "grad_norm": 5.62139892578125, + "learning_rate": 1.779490616621984e-05, + "loss": 0.7145, + "num_input_tokens_seen": 2774912, + "step": 4780 + }, + { + "epoch": 0.7126899016979446, + "grad_norm": 12.218836784362793, + "learning_rate": 1.781352397974382e-05, + "loss": 0.6419, + "num_input_tokens_seen": 2777600, + "step": 4785 + }, + { + "epoch": 0.7134346142389038, + "grad_norm": 6.698795318603516, + "learning_rate": 1.78321417932678e-05, + "loss": 0.5734, + "num_input_tokens_seen": 2780480, + "step": 4790 + }, + { + "epoch": 0.714179326779863, + "grad_norm": 9.402045249938965, + "learning_rate": 1.7850759606791777e-05, + "loss": 0.7599, + "num_input_tokens_seen": 2783648, + "step": 4795 + }, + { + "epoch": 0.7149240393208222, + "grad_norm": 7.108006000518799, + "learning_rate": 1.786937742031576e-05, + "loss": 0.5888, + "num_input_tokens_seen": 2786880, + "step": 4800 + }, + { + "epoch": 0.7156687518617814, + "grad_norm": 3.5693907737731934, + "learning_rate": 1.7887995233839737e-05, + "loss": 0.6619, + "num_input_tokens_seen": 2789440, + "step": 4805 + }, + { + "epoch": 0.7164134644027406, + "grad_norm": 5.145169258117676, + "learning_rate": 1.790661304736372e-05, + "loss": 0.6101, + "num_input_tokens_seen": 2792640, + "step": 4810 + }, + { + "epoch": 0.7171581769436998, + "grad_norm": 5.959521293640137, + "learning_rate": 1.7925230860887697e-05, + "loss": 0.5928, + "num_input_tokens_seen": 2795616, + "step": 4815 + }, + { + "epoch": 0.717902889484659, + "grad_norm": 5.1801228523254395, + "learning_rate": 1.794384867441168e-05, + "loss": 0.6766, + "num_input_tokens_seen": 2798752, + "step": 4820 + }, + { + "epoch": 0.7186476020256181, + "grad_norm": 11.452186584472656, + "learning_rate": 1.796246648793566e-05, + "loss": 0.7338, + "num_input_tokens_seen": 2801344, + "step": 4825 + }, + { + "epoch": 0.7193923145665773, + "grad_norm": 4.6685261726379395, + "learning_rate": 1.798108430145964e-05, + "loss": 0.7075, + "num_input_tokens_seen": 2804384, + "step": 4830 + }, + { + "epoch": 0.7201370271075365, + "grad_norm": 10.7567777633667, + "learning_rate": 1.7999702114983617e-05, + "loss": 0.7292, + "num_input_tokens_seen": 2807456, + "step": 4835 + }, + { + "epoch": 0.7208817396484957, + "grad_norm": 11.446730613708496, + "learning_rate": 1.8018319928507595e-05, + "loss": 0.7071, + "num_input_tokens_seen": 2810400, + "step": 4840 + }, + { + "epoch": 0.7216264521894549, + "grad_norm": 7.785788059234619, + "learning_rate": 1.8036937742031576e-05, + "loss": 0.6244, + "num_input_tokens_seen": 2813408, + "step": 4845 + }, + { + "epoch": 0.7223711647304141, + "grad_norm": 4.870847225189209, + "learning_rate": 1.8055555555555555e-05, + "loss": 0.686, + "num_input_tokens_seen": 2816384, + "step": 4850 + }, + { + "epoch": 0.7231158772713733, + "grad_norm": 8.418082237243652, + "learning_rate": 1.8074173369079536e-05, + "loss": 0.8599, + "num_input_tokens_seen": 2819552, + "step": 4855 + }, + { + "epoch": 0.7238605898123325, + "grad_norm": 5.087862968444824, + "learning_rate": 1.8092791182603515e-05, + "loss": 0.5375, + "num_input_tokens_seen": 2822496, + "step": 4860 + }, + { + "epoch": 0.7246053023532917, + "grad_norm": 11.859097480773926, + "learning_rate": 1.8111408996127496e-05, + "loss": 0.7388, + "num_input_tokens_seen": 2825472, + "step": 4865 + }, + { + "epoch": 0.7253500148942508, + "grad_norm": 5.975281715393066, + "learning_rate": 1.8130026809651478e-05, + "loss": 0.5519, + "num_input_tokens_seen": 2828288, + "step": 4870 + }, + { + "epoch": 0.72609472743521, + "grad_norm": 5.252568244934082, + "learning_rate": 1.8148644623175456e-05, + "loss": 0.7225, + "num_input_tokens_seen": 2831232, + "step": 4875 + }, + { + "epoch": 0.7268394399761692, + "grad_norm": 8.131300926208496, + "learning_rate": 1.8167262436699434e-05, + "loss": 0.6105, + "num_input_tokens_seen": 2834208, + "step": 4880 + }, + { + "epoch": 0.7275841525171284, + "grad_norm": 12.397829055786133, + "learning_rate": 1.8185880250223413e-05, + "loss": 0.8269, + "num_input_tokens_seen": 2836832, + "step": 4885 + }, + { + "epoch": 0.7283288650580876, + "grad_norm": 14.189763069152832, + "learning_rate": 1.8204498063747394e-05, + "loss": 0.6729, + "num_input_tokens_seen": 2839296, + "step": 4890 + }, + { + "epoch": 0.7290735775990468, + "grad_norm": 4.5426435470581055, + "learning_rate": 1.8223115877271372e-05, + "loss": 0.6464, + "num_input_tokens_seen": 2842048, + "step": 4895 + }, + { + "epoch": 0.729818290140006, + "grad_norm": 5.4965033531188965, + "learning_rate": 1.8241733690795354e-05, + "loss": 0.9101, + "num_input_tokens_seen": 2844896, + "step": 4900 + }, + { + "epoch": 0.7305630026809652, + "grad_norm": 8.93502140045166, + "learning_rate": 1.8260351504319336e-05, + "loss": 0.5779, + "num_input_tokens_seen": 2847552, + "step": 4905 + }, + { + "epoch": 0.7313077152219244, + "grad_norm": 4.635172367095947, + "learning_rate": 1.8278969317843314e-05, + "loss": 0.5577, + "num_input_tokens_seen": 2850368, + "step": 4910 + }, + { + "epoch": 0.7320524277628835, + "grad_norm": 6.8198442459106445, + "learning_rate": 1.8297587131367295e-05, + "loss": 0.6631, + "num_input_tokens_seen": 2853344, + "step": 4915 + }, + { + "epoch": 0.7327971403038427, + "grad_norm": 7.462096691131592, + "learning_rate": 1.831620494489127e-05, + "loss": 0.7043, + "num_input_tokens_seen": 2856416, + "step": 4920 + }, + { + "epoch": 0.7335418528448019, + "grad_norm": 4.373554706573486, + "learning_rate": 1.8334822758415252e-05, + "loss": 0.6535, + "num_input_tokens_seen": 2859360, + "step": 4925 + }, + { + "epoch": 0.7342865653857611, + "grad_norm": 3.7155330181121826, + "learning_rate": 1.835344057193923e-05, + "loss": 0.6756, + "num_input_tokens_seen": 2862368, + "step": 4930 + }, + { + "epoch": 0.7350312779267203, + "grad_norm": 8.917332649230957, + "learning_rate": 1.8372058385463212e-05, + "loss": 0.6647, + "num_input_tokens_seen": 2865120, + "step": 4935 + }, + { + "epoch": 0.7357759904676795, + "grad_norm": 7.3002729415893555, + "learning_rate": 1.8390676198987193e-05, + "loss": 0.7099, + "num_input_tokens_seen": 2868224, + "step": 4940 + }, + { + "epoch": 0.7365207030086387, + "grad_norm": 4.9059834480285645, + "learning_rate": 1.840929401251117e-05, + "loss": 0.6059, + "num_input_tokens_seen": 2871360, + "step": 4945 + }, + { + "epoch": 0.7372654155495979, + "grad_norm": 5.862856388092041, + "learning_rate": 1.8427911826035153e-05, + "loss": 0.5068, + "num_input_tokens_seen": 2874496, + "step": 4950 + }, + { + "epoch": 0.738010128090557, + "grad_norm": 15.770673751831055, + "learning_rate": 1.844652963955913e-05, + "loss": 0.7657, + "num_input_tokens_seen": 2877248, + "step": 4955 + }, + { + "epoch": 0.7387548406315162, + "grad_norm": 10.478066444396973, + "learning_rate": 1.8465147453083113e-05, + "loss": 0.6832, + "num_input_tokens_seen": 2880128, + "step": 4960 + }, + { + "epoch": 0.7394995531724754, + "grad_norm": 5.965473651885986, + "learning_rate": 1.8483765266607088e-05, + "loss": 0.4799, + "num_input_tokens_seen": 2882944, + "step": 4965 + }, + { + "epoch": 0.7402442657134346, + "grad_norm": 7.922229766845703, + "learning_rate": 1.850238308013107e-05, + "loss": 0.706, + "num_input_tokens_seen": 2885856, + "step": 4970 + }, + { + "epoch": 0.7409889782543938, + "grad_norm": 15.09994888305664, + "learning_rate": 1.8521000893655048e-05, + "loss": 0.852, + "num_input_tokens_seen": 2888768, + "step": 4975 + }, + { + "epoch": 0.741733690795353, + "grad_norm": 9.361420631408691, + "learning_rate": 1.853961870717903e-05, + "loss": 0.5451, + "num_input_tokens_seen": 2891648, + "step": 4980 + }, + { + "epoch": 0.7424784033363122, + "grad_norm": 7.8774333000183105, + "learning_rate": 1.855823652070301e-05, + "loss": 0.7736, + "num_input_tokens_seen": 2894464, + "step": 4985 + }, + { + "epoch": 0.7432231158772714, + "grad_norm": 5.064788341522217, + "learning_rate": 1.857685433422699e-05, + "loss": 0.5074, + "num_input_tokens_seen": 2897088, + "step": 4990 + }, + { + "epoch": 0.7439678284182306, + "grad_norm": 8.066676139831543, + "learning_rate": 1.859547214775097e-05, + "loss": 0.6493, + "num_input_tokens_seen": 2900032, + "step": 4995 + }, + { + "epoch": 0.7447125409591897, + "grad_norm": 5.546191215515137, + "learning_rate": 1.861408996127495e-05, + "loss": 0.6536, + "num_input_tokens_seen": 2902944, + "step": 5000 + }, + { + "epoch": 0.7454572535001489, + "grad_norm": 5.397414684295654, + "learning_rate": 1.863270777479893e-05, + "loss": 0.4508, + "num_input_tokens_seen": 2905856, + "step": 5005 + }, + { + "epoch": 0.7462019660411081, + "grad_norm": 9.962852478027344, + "learning_rate": 1.8651325588322906e-05, + "loss": 0.7012, + "num_input_tokens_seen": 2908544, + "step": 5010 + }, + { + "epoch": 0.7469466785820673, + "grad_norm": 5.338865280151367, + "learning_rate": 1.8669943401846887e-05, + "loss": 0.6275, + "num_input_tokens_seen": 2911584, + "step": 5015 + }, + { + "epoch": 0.7476913911230265, + "grad_norm": 6.6547770500183105, + "learning_rate": 1.868856121537087e-05, + "loss": 0.6551, + "num_input_tokens_seen": 2914752, + "step": 5020 + }, + { + "epoch": 0.7484361036639857, + "grad_norm": 10.666709899902344, + "learning_rate": 1.8707179028894847e-05, + "loss": 0.791, + "num_input_tokens_seen": 2917472, + "step": 5025 + }, + { + "epoch": 0.7491808162049449, + "grad_norm": 8.064480781555176, + "learning_rate": 1.872579684241883e-05, + "loss": 0.8478, + "num_input_tokens_seen": 2920224, + "step": 5030 + }, + { + "epoch": 0.7499255287459041, + "grad_norm": 10.482688903808594, + "learning_rate": 1.8744414655942807e-05, + "loss": 0.5543, + "num_input_tokens_seen": 2923136, + "step": 5035 + }, + { + "epoch": 0.7506702412868632, + "grad_norm": 7.163956165313721, + "learning_rate": 1.876303246946679e-05, + "loss": 0.5964, + "num_input_tokens_seen": 2926816, + "step": 5040 + }, + { + "epoch": 0.7514149538278224, + "grad_norm": 8.282546997070312, + "learning_rate": 1.8781650282990767e-05, + "loss": 0.8146, + "num_input_tokens_seen": 2929504, + "step": 5045 + }, + { + "epoch": 0.7521596663687816, + "grad_norm": 7.354517459869385, + "learning_rate": 1.8800268096514745e-05, + "loss": 0.7004, + "num_input_tokens_seen": 2932448, + "step": 5050 + }, + { + "epoch": 0.7529043789097408, + "grad_norm": 3.9694557189941406, + "learning_rate": 1.8818885910038723e-05, + "loss": 0.7807, + "num_input_tokens_seen": 2935648, + "step": 5055 + }, + { + "epoch": 0.7536490914507, + "grad_norm": 4.580183029174805, + "learning_rate": 1.8837503723562705e-05, + "loss": 0.7709, + "num_input_tokens_seen": 2938496, + "step": 5060 + }, + { + "epoch": 0.7543938039916592, + "grad_norm": 3.5712530612945557, + "learning_rate": 1.8856121537086686e-05, + "loss": 0.6947, + "num_input_tokens_seen": 2941536, + "step": 5065 + }, + { + "epoch": 0.7551385165326184, + "grad_norm": 5.183640480041504, + "learning_rate": 1.8874739350610665e-05, + "loss": 0.6683, + "num_input_tokens_seen": 2944736, + "step": 5070 + }, + { + "epoch": 0.7558832290735776, + "grad_norm": 6.186522006988525, + "learning_rate": 1.8893357164134646e-05, + "loss": 0.6387, + "num_input_tokens_seen": 2947552, + "step": 5075 + }, + { + "epoch": 0.7566279416145368, + "grad_norm": 5.424081802368164, + "learning_rate": 1.8911974977658625e-05, + "loss": 0.673, + "num_input_tokens_seen": 2950592, + "step": 5080 + }, + { + "epoch": 0.7573726541554959, + "grad_norm": 6.553135871887207, + "learning_rate": 1.8930592791182606e-05, + "loss": 0.7629, + "num_input_tokens_seen": 2953408, + "step": 5085 + }, + { + "epoch": 0.7581173666964551, + "grad_norm": 6.921181678771973, + "learning_rate": 1.8949210604706584e-05, + "loss": 0.4689, + "num_input_tokens_seen": 2956064, + "step": 5090 + }, + { + "epoch": 0.7588620792374143, + "grad_norm": 6.603172302246094, + "learning_rate": 1.8967828418230563e-05, + "loss": 0.609, + "num_input_tokens_seen": 2959392, + "step": 5095 + }, + { + "epoch": 0.7596067917783735, + "grad_norm": 5.210688591003418, + "learning_rate": 1.8986446231754544e-05, + "loss": 0.6698, + "num_input_tokens_seen": 2962240, + "step": 5100 + }, + { + "epoch": 0.7603515043193327, + "grad_norm": 5.373937129974365, + "learning_rate": 1.9005064045278523e-05, + "loss": 0.6469, + "num_input_tokens_seen": 2965152, + "step": 5105 + }, + { + "epoch": 0.7610962168602919, + "grad_norm": 6.325917720794678, + "learning_rate": 1.9023681858802504e-05, + "loss": 0.6416, + "num_input_tokens_seen": 2968064, + "step": 5110 + }, + { + "epoch": 0.7618409294012511, + "grad_norm": 4.71870231628418, + "learning_rate": 1.9042299672326482e-05, + "loss": 0.6257, + "num_input_tokens_seen": 2970752, + "step": 5115 + }, + { + "epoch": 0.7625856419422103, + "grad_norm": 7.341087818145752, + "learning_rate": 1.9060917485850464e-05, + "loss": 0.6287, + "num_input_tokens_seen": 2973664, + "step": 5120 + }, + { + "epoch": 0.7633303544831695, + "grad_norm": 6.43916654586792, + "learning_rate": 1.9079535299374442e-05, + "loss": 0.8084, + "num_input_tokens_seen": 2976704, + "step": 5125 + }, + { + "epoch": 0.7640750670241286, + "grad_norm": 10.238484382629395, + "learning_rate": 1.9098153112898424e-05, + "loss": 0.5751, + "num_input_tokens_seen": 2979680, + "step": 5130 + }, + { + "epoch": 0.7648197795650878, + "grad_norm": 4.393367290496826, + "learning_rate": 1.9116770926422402e-05, + "loss": 0.6105, + "num_input_tokens_seen": 2982464, + "step": 5135 + }, + { + "epoch": 0.765564492106047, + "grad_norm": 7.134841442108154, + "learning_rate": 1.913538873994638e-05, + "loss": 0.4923, + "num_input_tokens_seen": 2985216, + "step": 5140 + }, + { + "epoch": 0.7663092046470062, + "grad_norm": 10.631134033203125, + "learning_rate": 1.9154006553470362e-05, + "loss": 0.8103, + "num_input_tokens_seen": 2988000, + "step": 5145 + }, + { + "epoch": 0.7670539171879655, + "grad_norm": 12.81425666809082, + "learning_rate": 1.917262436699434e-05, + "loss": 0.6222, + "num_input_tokens_seen": 2990752, + "step": 5150 + }, + { + "epoch": 0.7677986297289247, + "grad_norm": 13.455652236938477, + "learning_rate": 1.9191242180518322e-05, + "loss": 0.6457, + "num_input_tokens_seen": 2993664, + "step": 5155 + }, + { + "epoch": 0.7685433422698839, + "grad_norm": 7.488417625427246, + "learning_rate": 1.92098599940423e-05, + "loss": 0.6137, + "num_input_tokens_seen": 2996480, + "step": 5160 + }, + { + "epoch": 0.7692880548108431, + "grad_norm": 4.984339714050293, + "learning_rate": 1.922847780756628e-05, + "loss": 0.8483, + "num_input_tokens_seen": 2999520, + "step": 5165 + }, + { + "epoch": 0.7700327673518021, + "grad_norm": 8.502500534057617, + "learning_rate": 1.924709562109026e-05, + "loss": 0.6878, + "num_input_tokens_seen": 3002496, + "step": 5170 + }, + { + "epoch": 0.7707774798927614, + "grad_norm": 6.352755546569824, + "learning_rate": 1.926571343461424e-05, + "loss": 0.5819, + "num_input_tokens_seen": 3005312, + "step": 5175 + }, + { + "epoch": 0.7715221924337206, + "grad_norm": 7.204667091369629, + "learning_rate": 1.928433124813822e-05, + "loss": 0.6861, + "num_input_tokens_seen": 3008128, + "step": 5180 + }, + { + "epoch": 0.7722669049746798, + "grad_norm": 8.01055908203125, + "learning_rate": 1.9302949061662198e-05, + "loss": 0.6601, + "num_input_tokens_seen": 3011072, + "step": 5185 + }, + { + "epoch": 0.773011617515639, + "grad_norm": 8.629267692565918, + "learning_rate": 1.932156687518618e-05, + "loss": 0.7444, + "num_input_tokens_seen": 3014272, + "step": 5190 + }, + { + "epoch": 0.7737563300565982, + "grad_norm": 5.2754645347595215, + "learning_rate": 1.9340184688710158e-05, + "loss": 0.5924, + "num_input_tokens_seen": 3017248, + "step": 5195 + }, + { + "epoch": 0.7745010425975574, + "grad_norm": 11.344070434570312, + "learning_rate": 1.935880250223414e-05, + "loss": 0.6774, + "num_input_tokens_seen": 3020160, + "step": 5200 + }, + { + "epoch": 0.7752457551385166, + "grad_norm": 5.5594940185546875, + "learning_rate": 1.9377420315758118e-05, + "loss": 0.7079, + "num_input_tokens_seen": 3023200, + "step": 5205 + }, + { + "epoch": 0.7759904676794758, + "grad_norm": 4.323903560638428, + "learning_rate": 1.93960381292821e-05, + "loss": 0.5338, + "num_input_tokens_seen": 3026240, + "step": 5210 + }, + { + "epoch": 0.7767351802204349, + "grad_norm": 7.679587364196777, + "learning_rate": 1.9414655942806077e-05, + "loss": 0.6385, + "num_input_tokens_seen": 3028928, + "step": 5215 + }, + { + "epoch": 0.7774798927613941, + "grad_norm": 5.2896037101745605, + "learning_rate": 1.943327375633006e-05, + "loss": 0.7483, + "num_input_tokens_seen": 3031808, + "step": 5220 + }, + { + "epoch": 0.7782246053023533, + "grad_norm": 5.048310279846191, + "learning_rate": 1.9451891569854037e-05, + "loss": 0.75, + "num_input_tokens_seen": 3034656, + "step": 5225 + }, + { + "epoch": 0.7789693178433125, + "grad_norm": 6.536549091339111, + "learning_rate": 1.9470509383378016e-05, + "loss": 0.7225, + "num_input_tokens_seen": 3037760, + "step": 5230 + }, + { + "epoch": 0.7797140303842717, + "grad_norm": 5.0216875076293945, + "learning_rate": 1.9489127196901997e-05, + "loss": 0.6584, + "num_input_tokens_seen": 3040672, + "step": 5235 + }, + { + "epoch": 0.7804587429252309, + "grad_norm": 4.20925235748291, + "learning_rate": 1.9507745010425975e-05, + "loss": 0.6752, + "num_input_tokens_seen": 3043552, + "step": 5240 + }, + { + "epoch": 0.7812034554661901, + "grad_norm": 6.86761999130249, + "learning_rate": 1.9526362823949957e-05, + "loss": 0.6934, + "num_input_tokens_seen": 3046944, + "step": 5245 + }, + { + "epoch": 0.7819481680071493, + "grad_norm": 6.824461936950684, + "learning_rate": 1.9544980637473935e-05, + "loss": 0.6397, + "num_input_tokens_seen": 3049952, + "step": 5250 + }, + { + "epoch": 0.7826928805481085, + "grad_norm": 14.228548049926758, + "learning_rate": 1.9563598450997917e-05, + "loss": 0.6743, + "num_input_tokens_seen": 3052576, + "step": 5255 + }, + { + "epoch": 0.7834375930890676, + "grad_norm": 6.214419364929199, + "learning_rate": 1.9582216264521895e-05, + "loss": 0.5849, + "num_input_tokens_seen": 3055520, + "step": 5260 + }, + { + "epoch": 0.7841823056300268, + "grad_norm": 4.537745475769043, + "learning_rate": 1.9600834078045877e-05, + "loss": 0.731, + "num_input_tokens_seen": 3058112, + "step": 5265 + }, + { + "epoch": 0.784927018170986, + "grad_norm": 2.8762753009796143, + "learning_rate": 1.9619451891569855e-05, + "loss": 0.5914, + "num_input_tokens_seen": 3061024, + "step": 5270 + }, + { + "epoch": 0.7856717307119452, + "grad_norm": 6.938438892364502, + "learning_rate": 1.9638069705093833e-05, + "loss": 0.6728, + "num_input_tokens_seen": 3064160, + "step": 5275 + }, + { + "epoch": 0.7864164432529044, + "grad_norm": 6.003373146057129, + "learning_rate": 1.9656687518617815e-05, + "loss": 0.5811, + "num_input_tokens_seen": 3066912, + "step": 5280 + }, + { + "epoch": 0.7871611557938636, + "grad_norm": 6.553404331207275, + "learning_rate": 1.9675305332141793e-05, + "loss": 0.5347, + "num_input_tokens_seen": 3069856, + "step": 5285 + }, + { + "epoch": 0.7879058683348228, + "grad_norm": 7.227819442749023, + "learning_rate": 1.9693923145665775e-05, + "loss": 0.6032, + "num_input_tokens_seen": 3072448, + "step": 5290 + }, + { + "epoch": 0.788650580875782, + "grad_norm": 9.558149337768555, + "learning_rate": 1.9712540959189753e-05, + "loss": 0.8745, + "num_input_tokens_seen": 3075392, + "step": 5295 + }, + { + "epoch": 0.7893952934167411, + "grad_norm": 8.053738594055176, + "learning_rate": 1.9731158772713735e-05, + "loss": 0.7794, + "num_input_tokens_seen": 3078176, + "step": 5300 + }, + { + "epoch": 0.7901400059577003, + "grad_norm": 4.57532262802124, + "learning_rate": 1.9749776586237713e-05, + "loss": 0.699, + "num_input_tokens_seen": 3081024, + "step": 5305 + }, + { + "epoch": 0.7908847184986595, + "grad_norm": 7.166165828704834, + "learning_rate": 1.976839439976169e-05, + "loss": 0.594, + "num_input_tokens_seen": 3083840, + "step": 5310 + }, + { + "epoch": 0.7916294310396187, + "grad_norm": 8.175718307495117, + "learning_rate": 1.9787012213285673e-05, + "loss": 0.648, + "num_input_tokens_seen": 3086560, + "step": 5315 + }, + { + "epoch": 0.7923741435805779, + "grad_norm": 7.155991077423096, + "learning_rate": 1.980563002680965e-05, + "loss": 0.6876, + "num_input_tokens_seen": 3089440, + "step": 5320 + }, + { + "epoch": 0.7931188561215371, + "grad_norm": 4.200064659118652, + "learning_rate": 1.9824247840333632e-05, + "loss": 0.5284, + "num_input_tokens_seen": 3092192, + "step": 5325 + }, + { + "epoch": 0.7938635686624963, + "grad_norm": 4.119879722595215, + "learning_rate": 1.984286565385761e-05, + "loss": 0.6215, + "num_input_tokens_seen": 3095264, + "step": 5330 + }, + { + "epoch": 0.7946082812034555, + "grad_norm": 4.216334819793701, + "learning_rate": 1.9861483467381592e-05, + "loss": 0.5973, + "num_input_tokens_seen": 3098272, + "step": 5335 + }, + { + "epoch": 0.7953529937444147, + "grad_norm": 4.202436923980713, + "learning_rate": 1.988010128090557e-05, + "loss": 0.7475, + "num_input_tokens_seen": 3101216, + "step": 5340 + }, + { + "epoch": 0.7960977062853738, + "grad_norm": 5.9474382400512695, + "learning_rate": 1.9898719094429552e-05, + "loss": 0.6288, + "num_input_tokens_seen": 3104032, + "step": 5345 + }, + { + "epoch": 0.796842418826333, + "grad_norm": 7.19460916519165, + "learning_rate": 1.9917336907953534e-05, + "loss": 0.7673, + "num_input_tokens_seen": 3106784, + "step": 5350 + }, + { + "epoch": 0.7975871313672922, + "grad_norm": 3.7512433528900146, + "learning_rate": 1.993595472147751e-05, + "loss": 0.742, + "num_input_tokens_seen": 3109440, + "step": 5355 + }, + { + "epoch": 0.7983318439082514, + "grad_norm": 13.19396686553955, + "learning_rate": 1.995457253500149e-05, + "loss": 0.6396, + "num_input_tokens_seen": 3112448, + "step": 5360 + }, + { + "epoch": 0.7990765564492106, + "grad_norm": 6.2171854972839355, + "learning_rate": 1.997319034852547e-05, + "loss": 0.5642, + "num_input_tokens_seen": 3115264, + "step": 5365 + }, + { + "epoch": 0.7998212689901698, + "grad_norm": 7.7366132736206055, + "learning_rate": 1.999180816204945e-05, + "loss": 0.6355, + "num_input_tokens_seen": 3118016, + "step": 5370 + }, + { + "epoch": 0.800565981531129, + "grad_norm": 6.072612285614014, + "learning_rate": 2.001042597557343e-05, + "loss": 0.5918, + "num_input_tokens_seen": 3121056, + "step": 5375 + }, + { + "epoch": 0.8013106940720882, + "grad_norm": 11.232367515563965, + "learning_rate": 2.002904378909741e-05, + "loss": 0.6208, + "num_input_tokens_seen": 3123936, + "step": 5380 + }, + { + "epoch": 0.8020554066130474, + "grad_norm": 11.26679801940918, + "learning_rate": 2.0047661602621388e-05, + "loss": 0.8782, + "num_input_tokens_seen": 3126816, + "step": 5385 + }, + { + "epoch": 0.8028001191540065, + "grad_norm": 4.639781475067139, + "learning_rate": 2.006627941614537e-05, + "loss": 0.5842, + "num_input_tokens_seen": 3129792, + "step": 5390 + }, + { + "epoch": 0.8035448316949657, + "grad_norm": 4.836690902709961, + "learning_rate": 2.008489722966935e-05, + "loss": 0.729, + "num_input_tokens_seen": 3132832, + "step": 5395 + }, + { + "epoch": 0.8042895442359249, + "grad_norm": 6.788226127624512, + "learning_rate": 2.0103515043193326e-05, + "loss": 0.7065, + "num_input_tokens_seen": 3135968, + "step": 5400 + }, + { + "epoch": 0.8050342567768841, + "grad_norm": 5.671861171722412, + "learning_rate": 2.0122132856717308e-05, + "loss": 0.8606, + "num_input_tokens_seen": 3138752, + "step": 5405 + }, + { + "epoch": 0.8057789693178433, + "grad_norm": 6.4647932052612305, + "learning_rate": 2.0140750670241286e-05, + "loss": 0.6785, + "num_input_tokens_seen": 3141696, + "step": 5410 + }, + { + "epoch": 0.8065236818588025, + "grad_norm": 4.249176979064941, + "learning_rate": 2.0159368483765268e-05, + "loss": 0.539, + "num_input_tokens_seen": 3144352, + "step": 5415 + }, + { + "epoch": 0.8072683943997617, + "grad_norm": 4.814325332641602, + "learning_rate": 2.0177986297289246e-05, + "loss": 0.6995, + "num_input_tokens_seen": 3147040, + "step": 5420 + }, + { + "epoch": 0.8080131069407209, + "grad_norm": 14.043322563171387, + "learning_rate": 2.0196604110813228e-05, + "loss": 0.8315, + "num_input_tokens_seen": 3150080, + "step": 5425 + }, + { + "epoch": 0.80875781948168, + "grad_norm": 6.624270915985107, + "learning_rate": 2.021522192433721e-05, + "loss": 0.5055, + "num_input_tokens_seen": 3152992, + "step": 5430 + }, + { + "epoch": 0.8095025320226392, + "grad_norm": 11.872994422912598, + "learning_rate": 2.0233839737861187e-05, + "loss": 0.8085, + "num_input_tokens_seen": 3155936, + "step": 5435 + }, + { + "epoch": 0.8102472445635984, + "grad_norm": 1.9421582221984863, + "learning_rate": 2.0252457551385166e-05, + "loss": 0.5796, + "num_input_tokens_seen": 3158912, + "step": 5440 + }, + { + "epoch": 0.8109919571045576, + "grad_norm": 7.619606971740723, + "learning_rate": 2.0271075364909144e-05, + "loss": 0.6783, + "num_input_tokens_seen": 3161920, + "step": 5445 + }, + { + "epoch": 0.8117366696455168, + "grad_norm": 4.240540027618408, + "learning_rate": 2.0289693178433126e-05, + "loss": 0.694, + "num_input_tokens_seen": 3165184, + "step": 5450 + }, + { + "epoch": 0.812481382186476, + "grad_norm": 5.1189727783203125, + "learning_rate": 2.0308310991957104e-05, + "loss": 0.6686, + "num_input_tokens_seen": 3168512, + "step": 5455 + }, + { + "epoch": 0.8132260947274352, + "grad_norm": 5.106654167175293, + "learning_rate": 2.0326928805481085e-05, + "loss": 0.5766, + "num_input_tokens_seen": 3171168, + "step": 5460 + }, + { + "epoch": 0.8139708072683944, + "grad_norm": 3.436267614364624, + "learning_rate": 2.0345546619005064e-05, + "loss": 0.6602, + "num_input_tokens_seen": 3173888, + "step": 5465 + }, + { + "epoch": 0.8147155198093536, + "grad_norm": 6.3795952796936035, + "learning_rate": 2.0364164432529045e-05, + "loss": 0.8173, + "num_input_tokens_seen": 3176800, + "step": 5470 + }, + { + "epoch": 0.8154602323503127, + "grad_norm": 7.927309989929199, + "learning_rate": 2.0382782246053027e-05, + "loss": 0.6154, + "num_input_tokens_seen": 3179552, + "step": 5475 + }, + { + "epoch": 0.8162049448912719, + "grad_norm": 5.284095287322998, + "learning_rate": 2.0401400059577005e-05, + "loss": 0.7969, + "num_input_tokens_seen": 3182400, + "step": 5480 + }, + { + "epoch": 0.8169496574322311, + "grad_norm": 8.669631004333496, + "learning_rate": 2.0420017873100983e-05, + "loss": 0.7698, + "num_input_tokens_seen": 3185280, + "step": 5485 + }, + { + "epoch": 0.8176943699731903, + "grad_norm": 3.5581107139587402, + "learning_rate": 2.043863568662496e-05, + "loss": 0.5247, + "num_input_tokens_seen": 3188128, + "step": 5490 + }, + { + "epoch": 0.8184390825141495, + "grad_norm": 4.101832866668701, + "learning_rate": 2.0457253500148943e-05, + "loss": 0.5978, + "num_input_tokens_seen": 3191072, + "step": 5495 + }, + { + "epoch": 0.8191837950551087, + "grad_norm": 14.849308013916016, + "learning_rate": 2.047587131367292e-05, + "loss": 0.6518, + "num_input_tokens_seen": 3194272, + "step": 5500 + }, + { + "epoch": 0.819928507596068, + "grad_norm": 7.558806896209717, + "learning_rate": 2.0494489127196903e-05, + "loss": 0.7665, + "num_input_tokens_seen": 3196992, + "step": 5505 + }, + { + "epoch": 0.8206732201370271, + "grad_norm": 9.08980941772461, + "learning_rate": 2.0513106940720885e-05, + "loss": 0.5986, + "num_input_tokens_seen": 3200096, + "step": 5510 + }, + { + "epoch": 0.8214179326779864, + "grad_norm": 4.833821773529053, + "learning_rate": 2.0531724754244863e-05, + "loss": 0.7196, + "num_input_tokens_seen": 3203104, + "step": 5515 + }, + { + "epoch": 0.8221626452189454, + "grad_norm": 21.98853302001953, + "learning_rate": 2.0550342567768845e-05, + "loss": 0.6772, + "num_input_tokens_seen": 3205728, + "step": 5520 + }, + { + "epoch": 0.8229073577599046, + "grad_norm": 13.469545364379883, + "learning_rate": 2.0568960381292823e-05, + "loss": 0.5544, + "num_input_tokens_seen": 3208320, + "step": 5525 + }, + { + "epoch": 0.8236520703008638, + "grad_norm": 5.244712829589844, + "learning_rate": 2.05875781948168e-05, + "loss": 0.7325, + "num_input_tokens_seen": 3211072, + "step": 5530 + }, + { + "epoch": 0.824396782841823, + "grad_norm": 5.758069038391113, + "learning_rate": 2.060619600834078e-05, + "loss": 0.6469, + "num_input_tokens_seen": 3214144, + "step": 5535 + }, + { + "epoch": 0.8251414953827823, + "grad_norm": 11.415926933288574, + "learning_rate": 2.062481382186476e-05, + "loss": 0.7574, + "num_input_tokens_seen": 3216992, + "step": 5540 + }, + { + "epoch": 0.8258862079237415, + "grad_norm": 7.378901958465576, + "learning_rate": 2.0643431635388742e-05, + "loss": 0.7546, + "num_input_tokens_seen": 3220064, + "step": 5545 + }, + { + "epoch": 0.8266309204647007, + "grad_norm": 5.098346710205078, + "learning_rate": 2.066204944891272e-05, + "loss": 0.4633, + "num_input_tokens_seen": 3222880, + "step": 5550 + }, + { + "epoch": 0.8273756330056599, + "grad_norm": 4.477623462677002, + "learning_rate": 2.0680667262436702e-05, + "loss": 0.6306, + "num_input_tokens_seen": 3225952, + "step": 5555 + }, + { + "epoch": 0.828120345546619, + "grad_norm": 6.2468767166137695, + "learning_rate": 2.069928507596068e-05, + "loss": 0.8442, + "num_input_tokens_seen": 3228672, + "step": 5560 + }, + { + "epoch": 0.8288650580875782, + "grad_norm": 4.078744888305664, + "learning_rate": 2.0717902889484662e-05, + "loss": 0.6217, + "num_input_tokens_seen": 3231392, + "step": 5565 + }, + { + "epoch": 0.8296097706285374, + "grad_norm": 15.150184631347656, + "learning_rate": 2.0736520703008637e-05, + "loss": 0.6113, + "num_input_tokens_seen": 3234272, + "step": 5570 + }, + { + "epoch": 0.8303544831694966, + "grad_norm": 6.7556562423706055, + "learning_rate": 2.075513851653262e-05, + "loss": 0.7404, + "num_input_tokens_seen": 3237056, + "step": 5575 + }, + { + "epoch": 0.8310991957104558, + "grad_norm": 5.943795680999756, + "learning_rate": 2.0773756330056597e-05, + "loss": 0.6061, + "num_input_tokens_seen": 3240064, + "step": 5580 + }, + { + "epoch": 0.831843908251415, + "grad_norm": 9.222502708435059, + "learning_rate": 2.079237414358058e-05, + "loss": 0.6918, + "num_input_tokens_seen": 3243072, + "step": 5585 + }, + { + "epoch": 0.8325886207923742, + "grad_norm": 6.652446746826172, + "learning_rate": 2.081099195710456e-05, + "loss": 0.7187, + "num_input_tokens_seen": 3245920, + "step": 5590 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 4.7361273765563965, + "learning_rate": 2.082960977062854e-05, + "loss": 0.71, + "num_input_tokens_seen": 3249280, + "step": 5595 + }, + { + "epoch": 0.8340780458742926, + "grad_norm": 4.287582874298096, + "learning_rate": 2.084822758415252e-05, + "loss": 0.6093, + "num_input_tokens_seen": 3252320, + "step": 5600 + }, + { + "epoch": 0.8348227584152517, + "grad_norm": 7.236633777618408, + "learning_rate": 2.0866845397676498e-05, + "loss": 0.6249, + "num_input_tokens_seen": 3255456, + "step": 5605 + }, + { + "epoch": 0.8355674709562109, + "grad_norm": 8.034500122070312, + "learning_rate": 2.088546321120048e-05, + "loss": 0.6749, + "num_input_tokens_seen": 3258144, + "step": 5610 + }, + { + "epoch": 0.8363121834971701, + "grad_norm": 4.814291000366211, + "learning_rate": 2.0904081024724455e-05, + "loss": 0.7227, + "num_input_tokens_seen": 3260832, + "step": 5615 + }, + { + "epoch": 0.8370568960381293, + "grad_norm": 5.348927974700928, + "learning_rate": 2.0922698838248436e-05, + "loss": 0.7272, + "num_input_tokens_seen": 3263712, + "step": 5620 + }, + { + "epoch": 0.8378016085790885, + "grad_norm": 9.487268447875977, + "learning_rate": 2.0941316651772418e-05, + "loss": 0.6762, + "num_input_tokens_seen": 3266688, + "step": 5625 + }, + { + "epoch": 0.8385463211200477, + "grad_norm": 6.108944892883301, + "learning_rate": 2.0959934465296396e-05, + "loss": 0.7984, + "num_input_tokens_seen": 3269504, + "step": 5630 + }, + { + "epoch": 0.8392910336610069, + "grad_norm": 2.8755974769592285, + "learning_rate": 2.0978552278820378e-05, + "loss": 0.5116, + "num_input_tokens_seen": 3272608, + "step": 5635 + }, + { + "epoch": 0.8400357462019661, + "grad_norm": 8.186783790588379, + "learning_rate": 2.0997170092344356e-05, + "loss": 0.5714, + "num_input_tokens_seen": 3275584, + "step": 5640 + }, + { + "epoch": 0.8407804587429253, + "grad_norm": 25.931730270385742, + "learning_rate": 2.1015787905868338e-05, + "loss": 0.5671, + "num_input_tokens_seen": 3278368, + "step": 5645 + }, + { + "epoch": 0.8415251712838844, + "grad_norm": 8.721343040466309, + "learning_rate": 2.1034405719392316e-05, + "loss": 0.7249, + "num_input_tokens_seen": 3281312, + "step": 5650 + }, + { + "epoch": 0.8422698838248436, + "grad_norm": 4.938953876495361, + "learning_rate": 2.1053023532916297e-05, + "loss": 0.5391, + "num_input_tokens_seen": 3284128, + "step": 5655 + }, + { + "epoch": 0.8430145963658028, + "grad_norm": 8.75028133392334, + "learning_rate": 2.1071641346440272e-05, + "loss": 0.6088, + "num_input_tokens_seen": 3286816, + "step": 5660 + }, + { + "epoch": 0.843759308906762, + "grad_norm": 5.072882652282715, + "learning_rate": 2.1090259159964254e-05, + "loss": 0.5777, + "num_input_tokens_seen": 3289824, + "step": 5665 + }, + { + "epoch": 0.8445040214477212, + "grad_norm": 15.472270965576172, + "learning_rate": 2.1108876973488236e-05, + "loss": 0.5642, + "num_input_tokens_seen": 3293152, + "step": 5670 + }, + { + "epoch": 0.8452487339886804, + "grad_norm": 11.075738906860352, + "learning_rate": 2.1127494787012214e-05, + "loss": 0.6739, + "num_input_tokens_seen": 3295968, + "step": 5675 + }, + { + "epoch": 0.8459934465296396, + "grad_norm": 9.647725105285645, + "learning_rate": 2.1146112600536195e-05, + "loss": 0.7355, + "num_input_tokens_seen": 3298528, + "step": 5680 + }, + { + "epoch": 0.8467381590705988, + "grad_norm": 5.180730819702148, + "learning_rate": 2.1164730414060174e-05, + "loss": 0.731, + "num_input_tokens_seen": 3301696, + "step": 5685 + }, + { + "epoch": 0.8474828716115579, + "grad_norm": 10.877184867858887, + "learning_rate": 2.1183348227584155e-05, + "loss": 0.7463, + "num_input_tokens_seen": 3304672, + "step": 5690 + }, + { + "epoch": 0.8482275841525171, + "grad_norm": 5.153077125549316, + "learning_rate": 2.1201966041108133e-05, + "loss": 0.627, + "num_input_tokens_seen": 3307168, + "step": 5695 + }, + { + "epoch": 0.8489722966934763, + "grad_norm": 7.308355808258057, + "learning_rate": 2.1220583854632112e-05, + "loss": 0.6607, + "num_input_tokens_seen": 3309952, + "step": 5700 + }, + { + "epoch": 0.8497170092344355, + "grad_norm": 7.207441806793213, + "learning_rate": 2.1239201668156093e-05, + "loss": 0.7317, + "num_input_tokens_seen": 3312800, + "step": 5705 + }, + { + "epoch": 0.8504617217753947, + "grad_norm": 9.927770614624023, + "learning_rate": 2.125781948168007e-05, + "loss": 0.6606, + "num_input_tokens_seen": 3315808, + "step": 5710 + }, + { + "epoch": 0.8512064343163539, + "grad_norm": 7.579364776611328, + "learning_rate": 2.1276437295204053e-05, + "loss": 0.7426, + "num_input_tokens_seen": 3318720, + "step": 5715 + }, + { + "epoch": 0.8519511468573131, + "grad_norm": 3.2974064350128174, + "learning_rate": 2.129505510872803e-05, + "loss": 0.6908, + "num_input_tokens_seen": 3321728, + "step": 5720 + }, + { + "epoch": 0.8526958593982723, + "grad_norm": 4.553257465362549, + "learning_rate": 2.1313672922252013e-05, + "loss": 0.7111, + "num_input_tokens_seen": 3324640, + "step": 5725 + }, + { + "epoch": 0.8534405719392315, + "grad_norm": 3.7902615070343018, + "learning_rate": 2.133229073577599e-05, + "loss": 0.7563, + "num_input_tokens_seen": 3327456, + "step": 5730 + }, + { + "epoch": 0.8541852844801906, + "grad_norm": 3.352454662322998, + "learning_rate": 2.1350908549299973e-05, + "loss": 0.6618, + "num_input_tokens_seen": 3330560, + "step": 5735 + }, + { + "epoch": 0.8549299970211498, + "grad_norm": 12.07020092010498, + "learning_rate": 2.136952636282395e-05, + "loss": 0.7711, + "num_input_tokens_seen": 3333600, + "step": 5740 + }, + { + "epoch": 0.855674709562109, + "grad_norm": 7.251728057861328, + "learning_rate": 2.138814417634793e-05, + "loss": 0.7042, + "num_input_tokens_seen": 3336800, + "step": 5745 + }, + { + "epoch": 0.8564194221030682, + "grad_norm": 7.550848484039307, + "learning_rate": 2.140676198987191e-05, + "loss": 0.5533, + "num_input_tokens_seen": 3339552, + "step": 5750 + }, + { + "epoch": 0.8571641346440274, + "grad_norm": 4.661020278930664, + "learning_rate": 2.142537980339589e-05, + "loss": 0.5803, + "num_input_tokens_seen": 3342624, + "step": 5755 + }, + { + "epoch": 0.8579088471849866, + "grad_norm": 5.362486362457275, + "learning_rate": 2.144399761691987e-05, + "loss": 0.4835, + "num_input_tokens_seen": 3345984, + "step": 5760 + }, + { + "epoch": 0.8586535597259458, + "grad_norm": 8.498034477233887, + "learning_rate": 2.146261543044385e-05, + "loss": 0.5077, + "num_input_tokens_seen": 3348864, + "step": 5765 + }, + { + "epoch": 0.859398272266905, + "grad_norm": 5.049686908721924, + "learning_rate": 2.148123324396783e-05, + "loss": 0.5176, + "num_input_tokens_seen": 3351744, + "step": 5770 + }, + { + "epoch": 0.8601429848078642, + "grad_norm": 8.768350601196289, + "learning_rate": 2.149985105749181e-05, + "loss": 0.7003, + "num_input_tokens_seen": 3354528, + "step": 5775 + }, + { + "epoch": 0.8608876973488233, + "grad_norm": 5.974066734313965, + "learning_rate": 2.151846887101579e-05, + "loss": 0.6692, + "num_input_tokens_seen": 3357376, + "step": 5780 + }, + { + "epoch": 0.8616324098897825, + "grad_norm": 8.275078773498535, + "learning_rate": 2.153708668453977e-05, + "loss": 0.8655, + "num_input_tokens_seen": 3360224, + "step": 5785 + }, + { + "epoch": 0.8623771224307417, + "grad_norm": 6.456306457519531, + "learning_rate": 2.1555704498063747e-05, + "loss": 0.5437, + "num_input_tokens_seen": 3362816, + "step": 5790 + }, + { + "epoch": 0.8631218349717009, + "grad_norm": 3.195430040359497, + "learning_rate": 2.157432231158773e-05, + "loss": 0.7384, + "num_input_tokens_seen": 3365728, + "step": 5795 + }, + { + "epoch": 0.8638665475126601, + "grad_norm": 5.296041011810303, + "learning_rate": 2.1592940125111707e-05, + "loss": 0.6481, + "num_input_tokens_seen": 3368448, + "step": 5800 + }, + { + "epoch": 0.8646112600536193, + "grad_norm": 5.982844829559326, + "learning_rate": 2.161155793863569e-05, + "loss": 0.6728, + "num_input_tokens_seen": 3371392, + "step": 5805 + }, + { + "epoch": 0.8653559725945785, + "grad_norm": 5.527019500732422, + "learning_rate": 2.1630175752159667e-05, + "loss": 0.5294, + "num_input_tokens_seen": 3373952, + "step": 5810 + }, + { + "epoch": 0.8661006851355377, + "grad_norm": 8.475357055664062, + "learning_rate": 2.164879356568365e-05, + "loss": 0.6446, + "num_input_tokens_seen": 3376672, + "step": 5815 + }, + { + "epoch": 0.8668453976764968, + "grad_norm": 6.900132656097412, + "learning_rate": 2.1667411379207627e-05, + "loss": 0.8751, + "num_input_tokens_seen": 3379840, + "step": 5820 + }, + { + "epoch": 0.867590110217456, + "grad_norm": 5.404610633850098, + "learning_rate": 2.1686029192731608e-05, + "loss": 0.5861, + "num_input_tokens_seen": 3382592, + "step": 5825 + }, + { + "epoch": 0.8683348227584152, + "grad_norm": 4.531290531158447, + "learning_rate": 2.1704647006255586e-05, + "loss": 0.7713, + "num_input_tokens_seen": 3385248, + "step": 5830 + }, + { + "epoch": 0.8690795352993744, + "grad_norm": 3.531033515930176, + "learning_rate": 2.1723264819779565e-05, + "loss": 0.7095, + "num_input_tokens_seen": 3388288, + "step": 5835 + }, + { + "epoch": 0.8698242478403336, + "grad_norm": 6.7946295738220215, + "learning_rate": 2.1741882633303546e-05, + "loss": 0.7318, + "num_input_tokens_seen": 3391008, + "step": 5840 + }, + { + "epoch": 0.8705689603812928, + "grad_norm": 2.6904098987579346, + "learning_rate": 2.1760500446827525e-05, + "loss": 0.5848, + "num_input_tokens_seen": 3393920, + "step": 5845 + }, + { + "epoch": 0.871313672922252, + "grad_norm": 3.0655410289764404, + "learning_rate": 2.1779118260351506e-05, + "loss": 0.7681, + "num_input_tokens_seen": 3396768, + "step": 5850 + }, + { + "epoch": 0.8720583854632112, + "grad_norm": 8.992547988891602, + "learning_rate": 2.1797736073875484e-05, + "loss": 0.7299, + "num_input_tokens_seen": 3399680, + "step": 5855 + }, + { + "epoch": 0.8728030980041704, + "grad_norm": 8.491602897644043, + "learning_rate": 2.1816353887399466e-05, + "loss": 0.6535, + "num_input_tokens_seen": 3402528, + "step": 5860 + }, + { + "epoch": 0.8735478105451295, + "grad_norm": 3.1488449573516846, + "learning_rate": 2.1834971700923444e-05, + "loss": 0.5424, + "num_input_tokens_seen": 3405248, + "step": 5865 + }, + { + "epoch": 0.8742925230860887, + "grad_norm": 4.123893737792969, + "learning_rate": 2.1853589514447426e-05, + "loss": 0.6838, + "num_input_tokens_seen": 3408448, + "step": 5870 + }, + { + "epoch": 0.8750372356270479, + "grad_norm": 6.091264724731445, + "learning_rate": 2.1872207327971404e-05, + "loss": 0.7105, + "num_input_tokens_seen": 3411520, + "step": 5875 + }, + { + "epoch": 0.8757819481680071, + "grad_norm": 2.9534552097320557, + "learning_rate": 2.1890825141495382e-05, + "loss": 0.4796, + "num_input_tokens_seen": 3414528, + "step": 5880 + }, + { + "epoch": 0.8765266607089663, + "grad_norm": 6.669384956359863, + "learning_rate": 2.1909442955019364e-05, + "loss": 0.774, + "num_input_tokens_seen": 3417280, + "step": 5885 + }, + { + "epoch": 0.8772713732499255, + "grad_norm": 6.328810691833496, + "learning_rate": 2.1928060768543342e-05, + "loss": 0.5396, + "num_input_tokens_seen": 3420416, + "step": 5890 + }, + { + "epoch": 0.8780160857908847, + "grad_norm": 6.637282848358154, + "learning_rate": 2.1946678582067324e-05, + "loss": 0.5112, + "num_input_tokens_seen": 3423104, + "step": 5895 + }, + { + "epoch": 0.878760798331844, + "grad_norm": 7.09002685546875, + "learning_rate": 2.1965296395591302e-05, + "loss": 0.5858, + "num_input_tokens_seen": 3426080, + "step": 5900 + }, + { + "epoch": 0.8795055108728032, + "grad_norm": 4.865998268127441, + "learning_rate": 2.1983914209115284e-05, + "loss": 0.5647, + "num_input_tokens_seen": 3429152, + "step": 5905 + }, + { + "epoch": 0.8802502234137622, + "grad_norm": 8.057366371154785, + "learning_rate": 2.2002532022639262e-05, + "loss": 0.8932, + "num_input_tokens_seen": 3431904, + "step": 5910 + }, + { + "epoch": 0.8809949359547214, + "grad_norm": 6.041391849517822, + "learning_rate": 2.202114983616324e-05, + "loss": 0.6134, + "num_input_tokens_seen": 3434880, + "step": 5915 + }, + { + "epoch": 0.8817396484956807, + "grad_norm": 4.681719779968262, + "learning_rate": 2.2039767649687222e-05, + "loss": 0.7436, + "num_input_tokens_seen": 3437728, + "step": 5920 + }, + { + "epoch": 0.8824843610366399, + "grad_norm": 7.739383697509766, + "learning_rate": 2.20583854632112e-05, + "loss": 0.5644, + "num_input_tokens_seen": 3440640, + "step": 5925 + }, + { + "epoch": 0.883229073577599, + "grad_norm": 6.27061653137207, + "learning_rate": 2.207700327673518e-05, + "loss": 0.7169, + "num_input_tokens_seen": 3443488, + "step": 5930 + }, + { + "epoch": 0.8839737861185583, + "grad_norm": 5.521172046661377, + "learning_rate": 2.209562109025916e-05, + "loss": 0.5655, + "num_input_tokens_seen": 3446176, + "step": 5935 + }, + { + "epoch": 0.8847184986595175, + "grad_norm": 3.763373851776123, + "learning_rate": 2.211423890378314e-05, + "loss": 0.6887, + "num_input_tokens_seen": 3448960, + "step": 5940 + }, + { + "epoch": 0.8854632112004767, + "grad_norm": 9.57369613647461, + "learning_rate": 2.213285671730712e-05, + "loss": 0.4369, + "num_input_tokens_seen": 3451648, + "step": 5945 + }, + { + "epoch": 0.8862079237414358, + "grad_norm": 6.020166397094727, + "learning_rate": 2.21514745308311e-05, + "loss": 0.8665, + "num_input_tokens_seen": 3454464, + "step": 5950 + }, + { + "epoch": 0.886952636282395, + "grad_norm": 5.686753749847412, + "learning_rate": 2.2170092344355083e-05, + "loss": 0.6037, + "num_input_tokens_seen": 3457248, + "step": 5955 + }, + { + "epoch": 0.8876973488233542, + "grad_norm": 4.0258307456970215, + "learning_rate": 2.2188710157879058e-05, + "loss": 0.6106, + "num_input_tokens_seen": 3459872, + "step": 5960 + }, + { + "epoch": 0.8884420613643134, + "grad_norm": 5.050349235534668, + "learning_rate": 2.220732797140304e-05, + "loss": 0.6191, + "num_input_tokens_seen": 3462880, + "step": 5965 + }, + { + "epoch": 0.8891867739052726, + "grad_norm": 9.256659507751465, + "learning_rate": 2.2225945784927018e-05, + "loss": 0.6473, + "num_input_tokens_seen": 3465792, + "step": 5970 + }, + { + "epoch": 0.8899314864462318, + "grad_norm": 4.53715705871582, + "learning_rate": 2.2244563598451e-05, + "loss": 0.7022, + "num_input_tokens_seen": 3468608, + "step": 5975 + }, + { + "epoch": 0.890676198987191, + "grad_norm": 10.52013111114502, + "learning_rate": 2.2263181411974977e-05, + "loss": 0.6365, + "num_input_tokens_seen": 3471520, + "step": 5980 + }, + { + "epoch": 0.8914209115281502, + "grad_norm": 5.076442241668701, + "learning_rate": 2.228179922549896e-05, + "loss": 0.705, + "num_input_tokens_seen": 3474432, + "step": 5985 + }, + { + "epoch": 0.8921656240691094, + "grad_norm": 4.371711254119873, + "learning_rate": 2.2300417039022937e-05, + "loss": 0.6178, + "num_input_tokens_seen": 3477216, + "step": 5990 + }, + { + "epoch": 0.8929103366100685, + "grad_norm": 4.6771345138549805, + "learning_rate": 2.231903485254692e-05, + "loss": 0.7024, + "num_input_tokens_seen": 3480160, + "step": 5995 + }, + { + "epoch": 0.8936550491510277, + "grad_norm": 8.860610008239746, + "learning_rate": 2.23376526660709e-05, + "loss": 0.5738, + "num_input_tokens_seen": 3483040, + "step": 6000 + }, + { + "epoch": 0.8943997616919869, + "grad_norm": 8.467142105102539, + "learning_rate": 2.2356270479594875e-05, + "loss": 0.6152, + "num_input_tokens_seen": 3485824, + "step": 6005 + }, + { + "epoch": 0.8951444742329461, + "grad_norm": 6.281243801116943, + "learning_rate": 2.2374888293118857e-05, + "loss": 0.6462, + "num_input_tokens_seen": 3488544, + "step": 6010 + }, + { + "epoch": 0.8958891867739053, + "grad_norm": 4.1797194480896, + "learning_rate": 2.2393506106642835e-05, + "loss": 0.6937, + "num_input_tokens_seen": 3491680, + "step": 6015 + }, + { + "epoch": 0.8966338993148645, + "grad_norm": 5.762698650360107, + "learning_rate": 2.2412123920166817e-05, + "loss": 0.7714, + "num_input_tokens_seen": 3494784, + "step": 6020 + }, + { + "epoch": 0.8973786118558237, + "grad_norm": 10.705824851989746, + "learning_rate": 2.2430741733690795e-05, + "loss": 0.6494, + "num_input_tokens_seen": 3497504, + "step": 6025 + }, + { + "epoch": 0.8981233243967829, + "grad_norm": 5.388837814331055, + "learning_rate": 2.2449359547214777e-05, + "loss": 0.5968, + "num_input_tokens_seen": 3500416, + "step": 6030 + }, + { + "epoch": 0.898868036937742, + "grad_norm": 12.578166007995605, + "learning_rate": 2.246797736073876e-05, + "loss": 0.829, + "num_input_tokens_seen": 3503168, + "step": 6035 + }, + { + "epoch": 0.8996127494787012, + "grad_norm": 5.935365200042725, + "learning_rate": 2.2486595174262737e-05, + "loss": 0.7809, + "num_input_tokens_seen": 3506080, + "step": 6040 + }, + { + "epoch": 0.9003574620196604, + "grad_norm": 6.310588359832764, + "learning_rate": 2.2505212987786715e-05, + "loss": 0.7254, + "num_input_tokens_seen": 3509216, + "step": 6045 + }, + { + "epoch": 0.9011021745606196, + "grad_norm": 8.918623924255371, + "learning_rate": 2.2523830801310693e-05, + "loss": 0.7901, + "num_input_tokens_seen": 3512064, + "step": 6050 + }, + { + "epoch": 0.9018468871015788, + "grad_norm": 5.21087646484375, + "learning_rate": 2.2542448614834675e-05, + "loss": 0.6813, + "num_input_tokens_seen": 3514784, + "step": 6055 + }, + { + "epoch": 0.902591599642538, + "grad_norm": 5.0879740715026855, + "learning_rate": 2.2561066428358653e-05, + "loss": 0.631, + "num_input_tokens_seen": 3518048, + "step": 6060 + }, + { + "epoch": 0.9033363121834972, + "grad_norm": 4.694946765899658, + "learning_rate": 2.2579684241882634e-05, + "loss": 0.6258, + "num_input_tokens_seen": 3521184, + "step": 6065 + }, + { + "epoch": 0.9040810247244564, + "grad_norm": 4.3360185623168945, + "learning_rate": 2.2598302055406613e-05, + "loss": 0.6255, + "num_input_tokens_seen": 3524224, + "step": 6070 + }, + { + "epoch": 0.9048257372654156, + "grad_norm": 4.325817108154297, + "learning_rate": 2.2616919868930594e-05, + "loss": 0.6915, + "num_input_tokens_seen": 3527104, + "step": 6075 + }, + { + "epoch": 0.9055704498063747, + "grad_norm": 4.885454177856445, + "learning_rate": 2.2635537682454576e-05, + "loss": 0.589, + "num_input_tokens_seen": 3529824, + "step": 6080 + }, + { + "epoch": 0.9063151623473339, + "grad_norm": 9.01417064666748, + "learning_rate": 2.2654155495978554e-05, + "loss": 0.7859, + "num_input_tokens_seen": 3532768, + "step": 6085 + }, + { + "epoch": 0.9070598748882931, + "grad_norm": 8.323040008544922, + "learning_rate": 2.2672773309502532e-05, + "loss": 0.6025, + "num_input_tokens_seen": 3535552, + "step": 6090 + }, + { + "epoch": 0.9078045874292523, + "grad_norm": 4.234975337982178, + "learning_rate": 2.269139112302651e-05, + "loss": 0.6901, + "num_input_tokens_seen": 3538336, + "step": 6095 + }, + { + "epoch": 0.9085492999702115, + "grad_norm": 12.392552375793457, + "learning_rate": 2.2710008936550492e-05, + "loss": 0.8423, + "num_input_tokens_seen": 3540928, + "step": 6100 + }, + { + "epoch": 0.9092940125111707, + "grad_norm": 10.22829818725586, + "learning_rate": 2.272862675007447e-05, + "loss": 0.634, + "num_input_tokens_seen": 3544000, + "step": 6105 + }, + { + "epoch": 0.9100387250521299, + "grad_norm": 5.676419258117676, + "learning_rate": 2.2747244563598452e-05, + "loss": 0.8218, + "num_input_tokens_seen": 3546784, + "step": 6110 + }, + { + "epoch": 0.9107834375930891, + "grad_norm": 4.655232906341553, + "learning_rate": 2.2765862377122434e-05, + "loss": 0.6011, + "num_input_tokens_seen": 3549696, + "step": 6115 + }, + { + "epoch": 0.9115281501340483, + "grad_norm": 3.217960834503174, + "learning_rate": 2.2784480190646412e-05, + "loss": 0.5222, + "num_input_tokens_seen": 3552480, + "step": 6120 + }, + { + "epoch": 0.9122728626750074, + "grad_norm": 7.658663272857666, + "learning_rate": 2.2803098004170394e-05, + "loss": 0.5456, + "num_input_tokens_seen": 3555360, + "step": 6125 + }, + { + "epoch": 0.9130175752159666, + "grad_norm": 7.398868560791016, + "learning_rate": 2.2821715817694372e-05, + "loss": 0.6588, + "num_input_tokens_seen": 3558528, + "step": 6130 + }, + { + "epoch": 0.9137622877569258, + "grad_norm": 5.691643714904785, + "learning_rate": 2.284033363121835e-05, + "loss": 0.6591, + "num_input_tokens_seen": 3561280, + "step": 6135 + }, + { + "epoch": 0.914507000297885, + "grad_norm": 2.9445743560791016, + "learning_rate": 2.2858951444742328e-05, + "loss": 0.4799, + "num_input_tokens_seen": 3564544, + "step": 6140 + }, + { + "epoch": 0.9152517128388442, + "grad_norm": 6.986311912536621, + "learning_rate": 2.287756925826631e-05, + "loss": 0.7415, + "num_input_tokens_seen": 3567104, + "step": 6145 + }, + { + "epoch": 0.9159964253798034, + "grad_norm": 7.380460262298584, + "learning_rate": 2.289618707179029e-05, + "loss": 0.5169, + "num_input_tokens_seen": 3570080, + "step": 6150 + }, + { + "epoch": 0.9167411379207626, + "grad_norm": 13.775399208068848, + "learning_rate": 2.291480488531427e-05, + "loss": 0.6865, + "num_input_tokens_seen": 3573056, + "step": 6155 + }, + { + "epoch": 0.9174858504617218, + "grad_norm": 4.817423343658447, + "learning_rate": 2.293342269883825e-05, + "loss": 0.7117, + "num_input_tokens_seen": 3576064, + "step": 6160 + }, + { + "epoch": 0.9182305630026809, + "grad_norm": 7.551164150238037, + "learning_rate": 2.295204051236223e-05, + "loss": 0.7159, + "num_input_tokens_seen": 3579040, + "step": 6165 + }, + { + "epoch": 0.9189752755436401, + "grad_norm": 5.1075439453125, + "learning_rate": 2.297065832588621e-05, + "loss": 0.5716, + "num_input_tokens_seen": 3581984, + "step": 6170 + }, + { + "epoch": 0.9197199880845993, + "grad_norm": 9.317129135131836, + "learning_rate": 2.2989276139410186e-05, + "loss": 0.6916, + "num_input_tokens_seen": 3584960, + "step": 6175 + }, + { + "epoch": 0.9204647006255585, + "grad_norm": 4.125770092010498, + "learning_rate": 2.3007893952934168e-05, + "loss": 0.6725, + "num_input_tokens_seen": 3587808, + "step": 6180 + }, + { + "epoch": 0.9212094131665177, + "grad_norm": 4.952744007110596, + "learning_rate": 2.3026511766458146e-05, + "loss": 0.6135, + "num_input_tokens_seen": 3590656, + "step": 6185 + }, + { + "epoch": 0.9219541257074769, + "grad_norm": 4.1541643142700195, + "learning_rate": 2.3045129579982128e-05, + "loss": 0.8301, + "num_input_tokens_seen": 3593248, + "step": 6190 + }, + { + "epoch": 0.9226988382484361, + "grad_norm": 7.430983543395996, + "learning_rate": 2.306374739350611e-05, + "loss": 0.5812, + "num_input_tokens_seen": 3596192, + "step": 6195 + }, + { + "epoch": 0.9234435507893953, + "grad_norm": 4.7521185874938965, + "learning_rate": 2.3082365207030087e-05, + "loss": 0.7691, + "num_input_tokens_seen": 3599264, + "step": 6200 + }, + { + "epoch": 0.9241882633303545, + "grad_norm": 3.002824544906616, + "learning_rate": 2.310098302055407e-05, + "loss": 0.5998, + "num_input_tokens_seen": 3602304, + "step": 6205 + }, + { + "epoch": 0.9249329758713136, + "grad_norm": 2.2385735511779785, + "learning_rate": 2.3119600834078047e-05, + "loss": 0.605, + "num_input_tokens_seen": 3605056, + "step": 6210 + }, + { + "epoch": 0.9256776884122728, + "grad_norm": 8.179418563842773, + "learning_rate": 2.313821864760203e-05, + "loss": 0.5642, + "num_input_tokens_seen": 3607904, + "step": 6215 + }, + { + "epoch": 0.926422400953232, + "grad_norm": 5.82619571685791, + "learning_rate": 2.3156836461126004e-05, + "loss": 0.7669, + "num_input_tokens_seen": 3611008, + "step": 6220 + }, + { + "epoch": 0.9271671134941912, + "grad_norm": 7.177924156188965, + "learning_rate": 2.3175454274649985e-05, + "loss": 0.8955, + "num_input_tokens_seen": 3613920, + "step": 6225 + }, + { + "epoch": 0.9279118260351504, + "grad_norm": 5.813066482543945, + "learning_rate": 2.3194072088173967e-05, + "loss": 0.5576, + "num_input_tokens_seen": 3616608, + "step": 6230 + }, + { + "epoch": 0.9286565385761096, + "grad_norm": 6.6306071281433105, + "learning_rate": 2.3212689901697945e-05, + "loss": 0.6201, + "num_input_tokens_seen": 3619776, + "step": 6235 + }, + { + "epoch": 0.9294012511170688, + "grad_norm": 5.840342044830322, + "learning_rate": 2.3231307715221927e-05, + "loss": 0.6388, + "num_input_tokens_seen": 3622624, + "step": 6240 + }, + { + "epoch": 0.930145963658028, + "grad_norm": 8.04843521118164, + "learning_rate": 2.3249925528745905e-05, + "loss": 0.477, + "num_input_tokens_seen": 3625536, + "step": 6245 + }, + { + "epoch": 0.9308906761989872, + "grad_norm": 9.863022804260254, + "learning_rate": 2.3268543342269887e-05, + "loss": 0.6527, + "num_input_tokens_seen": 3628288, + "step": 6250 + }, + { + "epoch": 0.9316353887399463, + "grad_norm": 7.092312335968018, + "learning_rate": 2.3287161155793865e-05, + "loss": 0.7162, + "num_input_tokens_seen": 3631200, + "step": 6255 + }, + { + "epoch": 0.9323801012809055, + "grad_norm": 4.177121639251709, + "learning_rate": 2.3305778969317847e-05, + "loss": 0.7502, + "num_input_tokens_seen": 3634144, + "step": 6260 + }, + { + "epoch": 0.9331248138218647, + "grad_norm": 5.617934703826904, + "learning_rate": 2.332439678284182e-05, + "loss": 0.6446, + "num_input_tokens_seen": 3637024, + "step": 6265 + }, + { + "epoch": 0.9338695263628239, + "grad_norm": 9.249531745910645, + "learning_rate": 2.3343014596365803e-05, + "loss": 0.8359, + "num_input_tokens_seen": 3640000, + "step": 6270 + }, + { + "epoch": 0.9346142389037831, + "grad_norm": 8.76955795288086, + "learning_rate": 2.3361632409889785e-05, + "loss": 0.8468, + "num_input_tokens_seen": 3643168, + "step": 6275 + }, + { + "epoch": 0.9353589514447423, + "grad_norm": 5.3314080238342285, + "learning_rate": 2.3380250223413763e-05, + "loss": 0.5435, + "num_input_tokens_seen": 3645856, + "step": 6280 + }, + { + "epoch": 0.9361036639857016, + "grad_norm": 4.1428914070129395, + "learning_rate": 2.3398868036937744e-05, + "loss": 0.7711, + "num_input_tokens_seen": 3648480, + "step": 6285 + }, + { + "epoch": 0.9368483765266608, + "grad_norm": 3.973022222518921, + "learning_rate": 2.3417485850461723e-05, + "loss": 0.6367, + "num_input_tokens_seen": 3651168, + "step": 6290 + }, + { + "epoch": 0.9375930890676198, + "grad_norm": 8.17785930633545, + "learning_rate": 2.3436103663985704e-05, + "loss": 0.6985, + "num_input_tokens_seen": 3654144, + "step": 6295 + }, + { + "epoch": 0.938337801608579, + "grad_norm": 4.532053470611572, + "learning_rate": 2.3454721477509683e-05, + "loss": 0.8477, + "num_input_tokens_seen": 3657056, + "step": 6300 + }, + { + "epoch": 0.9390825141495382, + "grad_norm": 3.7526559829711914, + "learning_rate": 2.347333929103366e-05, + "loss": 0.6986, + "num_input_tokens_seen": 3660192, + "step": 6305 + }, + { + "epoch": 0.9398272266904975, + "grad_norm": 4.995903968811035, + "learning_rate": 2.3491957104557642e-05, + "loss": 0.5991, + "num_input_tokens_seen": 3663008, + "step": 6310 + }, + { + "epoch": 0.9405719392314567, + "grad_norm": 9.052454948425293, + "learning_rate": 2.351057491808162e-05, + "loss": 0.7087, + "num_input_tokens_seen": 3665984, + "step": 6315 + }, + { + "epoch": 0.9413166517724159, + "grad_norm": 7.311150074005127, + "learning_rate": 2.3529192731605602e-05, + "loss": 0.8, + "num_input_tokens_seen": 3668896, + "step": 6320 + }, + { + "epoch": 0.9420613643133751, + "grad_norm": 4.518275737762451, + "learning_rate": 2.354781054512958e-05, + "loss": 0.6294, + "num_input_tokens_seen": 3671552, + "step": 6325 + }, + { + "epoch": 0.9428060768543343, + "grad_norm": 2.915361166000366, + "learning_rate": 2.3566428358653562e-05, + "loss": 0.7029, + "num_input_tokens_seen": 3674528, + "step": 6330 + }, + { + "epoch": 0.9435507893952935, + "grad_norm": 5.28243350982666, + "learning_rate": 2.358504617217754e-05, + "loss": 0.7643, + "num_input_tokens_seen": 3677184, + "step": 6335 + }, + { + "epoch": 0.9442955019362526, + "grad_norm": 7.640462398529053, + "learning_rate": 2.3603663985701522e-05, + "loss": 0.7008, + "num_input_tokens_seen": 3679744, + "step": 6340 + }, + { + "epoch": 0.9450402144772118, + "grad_norm": 5.809264659881592, + "learning_rate": 2.36222817992255e-05, + "loss": 0.8038, + "num_input_tokens_seen": 3682624, + "step": 6345 + }, + { + "epoch": 0.945784927018171, + "grad_norm": 2.366241216659546, + "learning_rate": 2.364089961274948e-05, + "loss": 0.6717, + "num_input_tokens_seen": 3685984, + "step": 6350 + }, + { + "epoch": 0.9465296395591302, + "grad_norm": 2.3986008167266846, + "learning_rate": 2.365951742627346e-05, + "loss": 0.6199, + "num_input_tokens_seen": 3688800, + "step": 6355 + }, + { + "epoch": 0.9472743521000894, + "grad_norm": 5.243259906768799, + "learning_rate": 2.3678135239797438e-05, + "loss": 0.7394, + "num_input_tokens_seen": 3691840, + "step": 6360 + }, + { + "epoch": 0.9480190646410486, + "grad_norm": 5.092966556549072, + "learning_rate": 2.369675305332142e-05, + "loss": 0.5234, + "num_input_tokens_seen": 3694720, + "step": 6365 + }, + { + "epoch": 0.9487637771820078, + "grad_norm": 5.701410293579102, + "learning_rate": 2.3715370866845398e-05, + "loss": 0.719, + "num_input_tokens_seen": 3697568, + "step": 6370 + }, + { + "epoch": 0.949508489722967, + "grad_norm": 9.379063606262207, + "learning_rate": 2.373398868036938e-05, + "loss": 0.6655, + "num_input_tokens_seen": 3700416, + "step": 6375 + }, + { + "epoch": 0.9502532022639262, + "grad_norm": 7.619600296020508, + "learning_rate": 2.3752606493893358e-05, + "loss": 0.6658, + "num_input_tokens_seen": 3703456, + "step": 6380 + }, + { + "epoch": 0.9509979148048853, + "grad_norm": 7.139358997344971, + "learning_rate": 2.377122430741734e-05, + "loss": 0.5945, + "num_input_tokens_seen": 3706048, + "step": 6385 + }, + { + "epoch": 0.9517426273458445, + "grad_norm": 7.517419338226318, + "learning_rate": 2.3789842120941318e-05, + "loss": 0.6587, + "num_input_tokens_seen": 3708736, + "step": 6390 + }, + { + "epoch": 0.9524873398868037, + "grad_norm": 5.452589511871338, + "learning_rate": 2.3808459934465296e-05, + "loss": 0.4081, + "num_input_tokens_seen": 3711392, + "step": 6395 + }, + { + "epoch": 0.9532320524277629, + "grad_norm": 4.585803508758545, + "learning_rate": 2.3827077747989278e-05, + "loss": 0.7961, + "num_input_tokens_seen": 3714304, + "step": 6400 + }, + { + "epoch": 0.9539767649687221, + "grad_norm": 4.072673797607422, + "learning_rate": 2.3845695561513256e-05, + "loss": 0.6045, + "num_input_tokens_seen": 3717120, + "step": 6405 + }, + { + "epoch": 0.9547214775096813, + "grad_norm": 7.758693695068359, + "learning_rate": 2.3864313375037238e-05, + "loss": 0.7087, + "num_input_tokens_seen": 3719648, + "step": 6410 + }, + { + "epoch": 0.9554661900506405, + "grad_norm": 10.83351993560791, + "learning_rate": 2.3882931188561216e-05, + "loss": 0.7117, + "num_input_tokens_seen": 3722688, + "step": 6415 + }, + { + "epoch": 0.9562109025915997, + "grad_norm": 6.668304443359375, + "learning_rate": 2.3901549002085197e-05, + "loss": 0.7494, + "num_input_tokens_seen": 3725440, + "step": 6420 + }, + { + "epoch": 0.9569556151325588, + "grad_norm": 4.765830039978027, + "learning_rate": 2.3920166815609176e-05, + "loss": 0.4408, + "num_input_tokens_seen": 3728320, + "step": 6425 + }, + { + "epoch": 0.957700327673518, + "grad_norm": 4.320715427398682, + "learning_rate": 2.3938784629133157e-05, + "loss": 0.5902, + "num_input_tokens_seen": 3731360, + "step": 6430 + }, + { + "epoch": 0.9584450402144772, + "grad_norm": 5.602354526519775, + "learning_rate": 2.3957402442657135e-05, + "loss": 0.6218, + "num_input_tokens_seen": 3733984, + "step": 6435 + }, + { + "epoch": 0.9591897527554364, + "grad_norm": 5.554492473602295, + "learning_rate": 2.3976020256181114e-05, + "loss": 0.6003, + "num_input_tokens_seen": 3736832, + "step": 6440 + }, + { + "epoch": 0.9599344652963956, + "grad_norm": 4.654677391052246, + "learning_rate": 2.3994638069705095e-05, + "loss": 0.7569, + "num_input_tokens_seen": 3739808, + "step": 6445 + }, + { + "epoch": 0.9606791778373548, + "grad_norm": 4.050685405731201, + "learning_rate": 2.4013255883229074e-05, + "loss": 0.5495, + "num_input_tokens_seen": 3742816, + "step": 6450 + }, + { + "epoch": 0.961423890378314, + "grad_norm": 7.930570602416992, + "learning_rate": 2.4031873696753055e-05, + "loss": 0.6241, + "num_input_tokens_seen": 3745760, + "step": 6455 + }, + { + "epoch": 0.9621686029192732, + "grad_norm": 6.845648765563965, + "learning_rate": 2.4050491510277033e-05, + "loss": 0.7599, + "num_input_tokens_seen": 3748512, + "step": 6460 + }, + { + "epoch": 0.9629133154602324, + "grad_norm": 6.195060729980469, + "learning_rate": 2.4069109323801015e-05, + "loss": 0.7327, + "num_input_tokens_seen": 3751520, + "step": 6465 + }, + { + "epoch": 0.9636580280011915, + "grad_norm": 4.8750505447387695, + "learning_rate": 2.4087727137324993e-05, + "loss": 0.8244, + "num_input_tokens_seen": 3754368, + "step": 6470 + }, + { + "epoch": 0.9644027405421507, + "grad_norm": 4.265342712402344, + "learning_rate": 2.4106344950848975e-05, + "loss": 0.6815, + "num_input_tokens_seen": 3757536, + "step": 6475 + }, + { + "epoch": 0.9651474530831099, + "grad_norm": 4.379352569580078, + "learning_rate": 2.4124962764372953e-05, + "loss": 0.572, + "num_input_tokens_seen": 3760768, + "step": 6480 + }, + { + "epoch": 0.9658921656240691, + "grad_norm": 5.378579616546631, + "learning_rate": 2.414358057789693e-05, + "loss": 0.4967, + "num_input_tokens_seen": 3763584, + "step": 6485 + }, + { + "epoch": 0.9666368781650283, + "grad_norm": 5.553722381591797, + "learning_rate": 2.4162198391420913e-05, + "loss": 0.6973, + "num_input_tokens_seen": 3766560, + "step": 6490 + }, + { + "epoch": 0.9673815907059875, + "grad_norm": 6.38198709487915, + "learning_rate": 2.418081620494489e-05, + "loss": 0.5085, + "num_input_tokens_seen": 3769920, + "step": 6495 + }, + { + "epoch": 0.9681263032469467, + "grad_norm": 4.510551929473877, + "learning_rate": 2.4199434018468873e-05, + "loss": 0.5819, + "num_input_tokens_seen": 3772800, + "step": 6500 + }, + { + "epoch": 0.9688710157879059, + "grad_norm": 6.452040672302246, + "learning_rate": 2.421805183199285e-05, + "loss": 0.5953, + "num_input_tokens_seen": 3775712, + "step": 6505 + }, + { + "epoch": 0.9696157283288651, + "grad_norm": 7.951143264770508, + "learning_rate": 2.4236669645516833e-05, + "loss": 0.5862, + "num_input_tokens_seen": 3778496, + "step": 6510 + }, + { + "epoch": 0.9703604408698242, + "grad_norm": 4.441718578338623, + "learning_rate": 2.425528745904081e-05, + "loss": 0.6335, + "num_input_tokens_seen": 3781664, + "step": 6515 + }, + { + "epoch": 0.9711051534107834, + "grad_norm": 10.478961944580078, + "learning_rate": 2.4273905272564793e-05, + "loss": 0.888, + "num_input_tokens_seen": 3784416, + "step": 6520 + }, + { + "epoch": 0.9718498659517426, + "grad_norm": 5.014004230499268, + "learning_rate": 2.429252308608877e-05, + "loss": 0.6984, + "num_input_tokens_seen": 3787168, + "step": 6525 + }, + { + "epoch": 0.9725945784927018, + "grad_norm": 9.628277778625488, + "learning_rate": 2.431114089961275e-05, + "loss": 0.6532, + "num_input_tokens_seen": 3790144, + "step": 6530 + }, + { + "epoch": 0.973339291033661, + "grad_norm": 5.262460231781006, + "learning_rate": 2.432975871313673e-05, + "loss": 0.6717, + "num_input_tokens_seen": 3792992, + "step": 6535 + }, + { + "epoch": 0.9740840035746202, + "grad_norm": 6.584347724914551, + "learning_rate": 2.434837652666071e-05, + "loss": 0.8229, + "num_input_tokens_seen": 3795840, + "step": 6540 + }, + { + "epoch": 0.9748287161155794, + "grad_norm": 6.476673126220703, + "learning_rate": 2.436699434018469e-05, + "loss": 0.7582, + "num_input_tokens_seen": 3798464, + "step": 6545 + }, + { + "epoch": 0.9755734286565386, + "grad_norm": 5.518436908721924, + "learning_rate": 2.438561215370867e-05, + "loss": 0.6376, + "num_input_tokens_seen": 3801472, + "step": 6550 + }, + { + "epoch": 0.9763181411974977, + "grad_norm": 3.935141086578369, + "learning_rate": 2.440422996723265e-05, + "loss": 0.5809, + "num_input_tokens_seen": 3804320, + "step": 6555 + }, + { + "epoch": 0.9770628537384569, + "grad_norm": 5.060183525085449, + "learning_rate": 2.4422847780756632e-05, + "loss": 0.6798, + "num_input_tokens_seen": 3807072, + "step": 6560 + }, + { + "epoch": 0.9778075662794161, + "grad_norm": 4.268924236297607, + "learning_rate": 2.4441465594280607e-05, + "loss": 0.7352, + "num_input_tokens_seen": 3809984, + "step": 6565 + }, + { + "epoch": 0.9785522788203753, + "grad_norm": 6.48234224319458, + "learning_rate": 2.446008340780459e-05, + "loss": 0.6725, + "num_input_tokens_seen": 3813024, + "step": 6570 + }, + { + "epoch": 0.9792969913613345, + "grad_norm": 3.3551788330078125, + "learning_rate": 2.4478701221328567e-05, + "loss": 0.6835, + "num_input_tokens_seen": 3815584, + "step": 6575 + }, + { + "epoch": 0.9800417039022937, + "grad_norm": 3.71736216545105, + "learning_rate": 2.4497319034852548e-05, + "loss": 0.5737, + "num_input_tokens_seen": 3818272, + "step": 6580 + }, + { + "epoch": 0.9807864164432529, + "grad_norm": 4.6760454177856445, + "learning_rate": 2.4515936848376527e-05, + "loss": 0.4754, + "num_input_tokens_seen": 3821088, + "step": 6585 + }, + { + "epoch": 0.9815311289842121, + "grad_norm": 4.958709716796875, + "learning_rate": 2.4534554661900508e-05, + "loss": 0.6228, + "num_input_tokens_seen": 3823872, + "step": 6590 + }, + { + "epoch": 0.9822758415251713, + "grad_norm": 5.314762592315674, + "learning_rate": 2.4553172475424486e-05, + "loss": 0.6073, + "num_input_tokens_seen": 3826752, + "step": 6595 + }, + { + "epoch": 0.9830205540661304, + "grad_norm": 8.099893569946289, + "learning_rate": 2.4571790288948468e-05, + "loss": 0.6688, + "num_input_tokens_seen": 3829504, + "step": 6600 + }, + { + "epoch": 0.9837652666070896, + "grad_norm": 4.459744930267334, + "learning_rate": 2.459040810247245e-05, + "loss": 0.5275, + "num_input_tokens_seen": 3832448, + "step": 6605 + }, + { + "epoch": 0.9845099791480488, + "grad_norm": 10.636013984680176, + "learning_rate": 2.4609025915996424e-05, + "loss": 0.6701, + "num_input_tokens_seen": 3835456, + "step": 6610 + }, + { + "epoch": 0.985254691689008, + "grad_norm": 4.791472434997559, + "learning_rate": 2.4627643729520406e-05, + "loss": 0.6198, + "num_input_tokens_seen": 3838560, + "step": 6615 + }, + { + "epoch": 0.9859994042299672, + "grad_norm": 4.9397053718566895, + "learning_rate": 2.4646261543044384e-05, + "loss": 0.7882, + "num_input_tokens_seen": 3841568, + "step": 6620 + }, + { + "epoch": 0.9867441167709264, + "grad_norm": 5.429640293121338, + "learning_rate": 2.4664879356568366e-05, + "loss": 0.7997, + "num_input_tokens_seen": 3844384, + "step": 6625 + }, + { + "epoch": 0.9874888293118856, + "grad_norm": 4.229193687438965, + "learning_rate": 2.4683497170092344e-05, + "loss": 0.6079, + "num_input_tokens_seen": 3847616, + "step": 6630 + }, + { + "epoch": 0.9882335418528448, + "grad_norm": 4.03995943069458, + "learning_rate": 2.4702114983616326e-05, + "loss": 0.7518, + "num_input_tokens_seen": 3850496, + "step": 6635 + }, + { + "epoch": 0.988978254393804, + "grad_norm": 4.372453212738037, + "learning_rate": 2.4720732797140307e-05, + "loss": 0.6165, + "num_input_tokens_seen": 3853600, + "step": 6640 + }, + { + "epoch": 0.9897229669347631, + "grad_norm": 8.920284271240234, + "learning_rate": 2.4739350610664286e-05, + "loss": 0.6925, + "num_input_tokens_seen": 3856416, + "step": 6645 + }, + { + "epoch": 0.9904676794757223, + "grad_norm": 3.9725608825683594, + "learning_rate": 2.4757968424188267e-05, + "loss": 0.6268, + "num_input_tokens_seen": 3858976, + "step": 6650 + }, + { + "epoch": 0.9912123920166815, + "grad_norm": 5.4005022048950195, + "learning_rate": 2.4776586237712242e-05, + "loss": 0.7361, + "num_input_tokens_seen": 3861984, + "step": 6655 + }, + { + "epoch": 0.9919571045576407, + "grad_norm": 7.912161827087402, + "learning_rate": 2.4795204051236224e-05, + "loss": 0.7769, + "num_input_tokens_seen": 3864768, + "step": 6660 + }, + { + "epoch": 0.9927018170986, + "grad_norm": 6.969575881958008, + "learning_rate": 2.4813821864760202e-05, + "loss": 0.5893, + "num_input_tokens_seen": 3867680, + "step": 6665 + }, + { + "epoch": 0.9934465296395592, + "grad_norm": 5.230376243591309, + "learning_rate": 2.4832439678284184e-05, + "loss": 0.6631, + "num_input_tokens_seen": 3870560, + "step": 6670 + }, + { + "epoch": 0.9941912421805184, + "grad_norm": 5.56076192855835, + "learning_rate": 2.4851057491808162e-05, + "loss": 0.5712, + "num_input_tokens_seen": 3873408, + "step": 6675 + }, + { + "epoch": 0.9949359547214776, + "grad_norm": 6.2866082191467285, + "learning_rate": 2.4869675305332143e-05, + "loss": 0.7119, + "num_input_tokens_seen": 3876320, + "step": 6680 + }, + { + "epoch": 0.9956806672624366, + "grad_norm": 2.8665618896484375, + "learning_rate": 2.4888293118856125e-05, + "loss": 0.6968, + "num_input_tokens_seen": 3879008, + "step": 6685 + }, + { + "epoch": 0.9964253798033958, + "grad_norm": 5.730622291564941, + "learning_rate": 2.4906910932380103e-05, + "loss": 0.6994, + "num_input_tokens_seen": 3881760, + "step": 6690 + }, + { + "epoch": 0.997170092344355, + "grad_norm": 4.970673561096191, + "learning_rate": 2.492552874590408e-05, + "loss": 0.7024, + "num_input_tokens_seen": 3884672, + "step": 6695 + }, + { + "epoch": 0.9979148048853143, + "grad_norm": 3.6665239334106445, + "learning_rate": 2.494414655942806e-05, + "loss": 0.613, + "num_input_tokens_seen": 3887360, + "step": 6700 + }, + { + "epoch": 0.9986595174262735, + "grad_norm": 4.773792266845703, + "learning_rate": 2.496276437295204e-05, + "loss": 0.6612, + "num_input_tokens_seen": 3889920, + "step": 6705 + }, + { + "epoch": 0.9994042299672327, + "grad_norm": 4.482601165771484, + "learning_rate": 2.498138218647602e-05, + "loss": 0.5315, + "num_input_tokens_seen": 3892704, + "step": 6710 + }, + { + "epoch": 1.0, + "eval_loss": 0.6720787286758423, + "eval_runtime": 51.2103, + "eval_samples_per_second": 58.27, + "eval_steps_per_second": 14.567, + "num_input_tokens_seen": 3894552, + "step": 6714 + }, + { + "epoch": 1.0001489425081918, + "grad_norm": 4.018333435058594, + "learning_rate": 2.5e-05, + "loss": 0.5735, + "num_input_tokens_seen": 3895096, + "step": 6715 + }, + { + "epoch": 1.000893655049151, + "grad_norm": 10.65987491607666, + "learning_rate": 2.5018617813523983e-05, + "loss": 0.565, + "num_input_tokens_seen": 3898200, + "step": 6720 + }, + { + "epoch": 1.0016383675901102, + "grad_norm": 5.051242351531982, + "learning_rate": 2.503723562704796e-05, + "loss": 0.5842, + "num_input_tokens_seen": 3900984, + "step": 6725 + }, + { + "epoch": 1.0023830801310694, + "grad_norm": 10.00973892211914, + "learning_rate": 2.5055853440571943e-05, + "loss": 0.6241, + "num_input_tokens_seen": 3903928, + "step": 6730 + }, + { + "epoch": 1.0031277926720286, + "grad_norm": 8.477496147155762, + "learning_rate": 2.507447125409592e-05, + "loss": 0.8063, + "num_input_tokens_seen": 3906840, + "step": 6735 + }, + { + "epoch": 1.0038725052129878, + "grad_norm": 5.010937690734863, + "learning_rate": 2.5093089067619903e-05, + "loss": 0.574, + "num_input_tokens_seen": 3909880, + "step": 6740 + }, + { + "epoch": 1.004617217753947, + "grad_norm": 6.3476104736328125, + "learning_rate": 2.511170688114388e-05, + "loss": 0.6717, + "num_input_tokens_seen": 3912952, + "step": 6745 + }, + { + "epoch": 1.0053619302949062, + "grad_norm": 10.657556533813477, + "learning_rate": 2.5130324694667862e-05, + "loss": 0.5851, + "num_input_tokens_seen": 3916312, + "step": 6750 + }, + { + "epoch": 1.0061066428358654, + "grad_norm": 7.08699369430542, + "learning_rate": 2.514894250819184e-05, + "loss": 0.5762, + "num_input_tokens_seen": 3919576, + "step": 6755 + }, + { + "epoch": 1.0068513553768246, + "grad_norm": 3.894279956817627, + "learning_rate": 2.5167560321715815e-05, + "loss": 0.5897, + "num_input_tokens_seen": 3922680, + "step": 6760 + }, + { + "epoch": 1.0075960679177838, + "grad_norm": 7.205716609954834, + "learning_rate": 2.5186178135239797e-05, + "loss": 0.7264, + "num_input_tokens_seen": 3925560, + "step": 6765 + }, + { + "epoch": 1.008340780458743, + "grad_norm": 3.8584744930267334, + "learning_rate": 2.5204795948763775e-05, + "loss": 0.4615, + "num_input_tokens_seen": 3928248, + "step": 6770 + }, + { + "epoch": 1.0090854929997022, + "grad_norm": 4.019835948944092, + "learning_rate": 2.5223413762287757e-05, + "loss": 0.5525, + "num_input_tokens_seen": 3931160, + "step": 6775 + }, + { + "epoch": 1.0098302055406614, + "grad_norm": 9.12194538116455, + "learning_rate": 2.5242031575811735e-05, + "loss": 0.6975, + "num_input_tokens_seen": 3933720, + "step": 6780 + }, + { + "epoch": 1.0105749180816206, + "grad_norm": 11.311967849731445, + "learning_rate": 2.5260649389335717e-05, + "loss": 0.8385, + "num_input_tokens_seen": 3936728, + "step": 6785 + }, + { + "epoch": 1.0113196306225798, + "grad_norm": 4.145586013793945, + "learning_rate": 2.5279267202859695e-05, + "loss": 0.5106, + "num_input_tokens_seen": 3939832, + "step": 6790 + }, + { + "epoch": 1.0120643431635388, + "grad_norm": 3.1437065601348877, + "learning_rate": 2.5297885016383677e-05, + "loss": 0.4646, + "num_input_tokens_seen": 3942616, + "step": 6795 + }, + { + "epoch": 1.012809055704498, + "grad_norm": 2.8730411529541016, + "learning_rate": 2.5316502829907658e-05, + "loss": 0.4266, + "num_input_tokens_seen": 3945464, + "step": 6800 + }, + { + "epoch": 1.0135537682454572, + "grad_norm": 5.4021782875061035, + "learning_rate": 2.5335120643431636e-05, + "loss": 0.6578, + "num_input_tokens_seen": 3948536, + "step": 6805 + }, + { + "epoch": 1.0142984807864164, + "grad_norm": 4.290023326873779, + "learning_rate": 2.5353738456955618e-05, + "loss": 0.7037, + "num_input_tokens_seen": 3951288, + "step": 6810 + }, + { + "epoch": 1.0150431933273756, + "grad_norm": 6.265732288360596, + "learning_rate": 2.5372356270479596e-05, + "loss": 0.7291, + "num_input_tokens_seen": 3954072, + "step": 6815 + }, + { + "epoch": 1.0157879058683348, + "grad_norm": 6.715002059936523, + "learning_rate": 2.5390974084003578e-05, + "loss": 0.7043, + "num_input_tokens_seen": 3956824, + "step": 6820 + }, + { + "epoch": 1.016532618409294, + "grad_norm": 4.555670738220215, + "learning_rate": 2.5409591897527556e-05, + "loss": 0.6596, + "num_input_tokens_seen": 3959608, + "step": 6825 + }, + { + "epoch": 1.0172773309502532, + "grad_norm": 5.2045369148254395, + "learning_rate": 2.5428209711051538e-05, + "loss": 0.694, + "num_input_tokens_seen": 3962104, + "step": 6830 + }, + { + "epoch": 1.0180220434912124, + "grad_norm": 6.889452934265137, + "learning_rate": 2.5446827524575516e-05, + "loss": 0.52, + "num_input_tokens_seen": 3964984, + "step": 6835 + }, + { + "epoch": 1.0187667560321716, + "grad_norm": 4.938897132873535, + "learning_rate": 2.5465445338099498e-05, + "loss": 0.6272, + "num_input_tokens_seen": 3967800, + "step": 6840 + }, + { + "epoch": 1.0195114685731308, + "grad_norm": 13.478050231933594, + "learning_rate": 2.5484063151623473e-05, + "loss": 0.6951, + "num_input_tokens_seen": 3970712, + "step": 6845 + }, + { + "epoch": 1.02025618111409, + "grad_norm": 4.343696594238281, + "learning_rate": 2.550268096514745e-05, + "loss": 0.521, + "num_input_tokens_seen": 3973624, + "step": 6850 + }, + { + "epoch": 1.0210008936550492, + "grad_norm": 6.648683547973633, + "learning_rate": 2.5521298778671432e-05, + "loss": 0.7769, + "num_input_tokens_seen": 3976792, + "step": 6855 + }, + { + "epoch": 1.0217456061960084, + "grad_norm": 11.885497093200684, + "learning_rate": 2.553991659219541e-05, + "loss": 0.6593, + "num_input_tokens_seen": 3979512, + "step": 6860 + }, + { + "epoch": 1.0224903187369676, + "grad_norm": 6.292271137237549, + "learning_rate": 2.5558534405719392e-05, + "loss": 0.6123, + "num_input_tokens_seen": 3982360, + "step": 6865 + }, + { + "epoch": 1.0232350312779268, + "grad_norm": 7.075900077819824, + "learning_rate": 2.557715221924337e-05, + "loss": 0.6186, + "num_input_tokens_seen": 3985240, + "step": 6870 + }, + { + "epoch": 1.023979743818886, + "grad_norm": 5.387895107269287, + "learning_rate": 2.5595770032767352e-05, + "loss": 0.5689, + "num_input_tokens_seen": 3987960, + "step": 6875 + }, + { + "epoch": 1.024724456359845, + "grad_norm": 10.589505195617676, + "learning_rate": 2.5614387846291334e-05, + "loss": 0.5877, + "num_input_tokens_seen": 3990712, + "step": 6880 + }, + { + "epoch": 1.0254691689008042, + "grad_norm": 4.094969749450684, + "learning_rate": 2.5633005659815312e-05, + "loss": 0.5955, + "num_input_tokens_seen": 3993592, + "step": 6885 + }, + { + "epoch": 1.0262138814417634, + "grad_norm": 4.595489025115967, + "learning_rate": 2.5651623473339294e-05, + "loss": 0.6718, + "num_input_tokens_seen": 3996792, + "step": 6890 + }, + { + "epoch": 1.0269585939827226, + "grad_norm": 7.303249835968018, + "learning_rate": 2.5670241286863272e-05, + "loss": 0.6858, + "num_input_tokens_seen": 3999512, + "step": 6895 + }, + { + "epoch": 1.0277033065236818, + "grad_norm": 6.637876987457275, + "learning_rate": 2.5688859100387253e-05, + "loss": 0.697, + "num_input_tokens_seen": 4002488, + "step": 6900 + }, + { + "epoch": 1.028448019064641, + "grad_norm": 5.6108269691467285, + "learning_rate": 2.570747691391123e-05, + "loss": 0.8589, + "num_input_tokens_seen": 4005336, + "step": 6905 + }, + { + "epoch": 1.0291927316056002, + "grad_norm": 9.577675819396973, + "learning_rate": 2.5726094727435213e-05, + "loss": 0.7226, + "num_input_tokens_seen": 4008184, + "step": 6910 + }, + { + "epoch": 1.0299374441465594, + "grad_norm": 5.592278957366943, + "learning_rate": 2.574471254095919e-05, + "loss": 0.6141, + "num_input_tokens_seen": 4011128, + "step": 6915 + }, + { + "epoch": 1.0306821566875186, + "grad_norm": 5.495172500610352, + "learning_rate": 2.5763330354483173e-05, + "loss": 0.6142, + "num_input_tokens_seen": 4014168, + "step": 6920 + }, + { + "epoch": 1.0314268692284778, + "grad_norm": 4.306805610656738, + "learning_rate": 2.578194816800715e-05, + "loss": 0.5494, + "num_input_tokens_seen": 4017144, + "step": 6925 + }, + { + "epoch": 1.032171581769437, + "grad_norm": 5.08223819732666, + "learning_rate": 2.5800565981531133e-05, + "loss": 0.6986, + "num_input_tokens_seen": 4020024, + "step": 6930 + }, + { + "epoch": 1.0329162943103962, + "grad_norm": 5.203743934631348, + "learning_rate": 2.5819183795055108e-05, + "loss": 0.4531, + "num_input_tokens_seen": 4022904, + "step": 6935 + }, + { + "epoch": 1.0336610068513554, + "grad_norm": 7.913916110992432, + "learning_rate": 2.5837801608579086e-05, + "loss": 0.4933, + "num_input_tokens_seen": 4025656, + "step": 6940 + }, + { + "epoch": 1.0344057193923146, + "grad_norm": 5.484188556671143, + "learning_rate": 2.5856419422103068e-05, + "loss": 0.6471, + "num_input_tokens_seen": 4028632, + "step": 6945 + }, + { + "epoch": 1.0351504319332738, + "grad_norm": 20.605751037597656, + "learning_rate": 2.587503723562705e-05, + "loss": 0.8219, + "num_input_tokens_seen": 4031480, + "step": 6950 + }, + { + "epoch": 1.035895144474233, + "grad_norm": 6.091693878173828, + "learning_rate": 2.5893655049151027e-05, + "loss": 0.8841, + "num_input_tokens_seen": 4034424, + "step": 6955 + }, + { + "epoch": 1.0366398570151922, + "grad_norm": 8.692134857177734, + "learning_rate": 2.591227286267501e-05, + "loss": 0.4847, + "num_input_tokens_seen": 4037272, + "step": 6960 + }, + { + "epoch": 1.0373845695561514, + "grad_norm": 8.95309829711914, + "learning_rate": 2.5930890676198987e-05, + "loss": 0.5919, + "num_input_tokens_seen": 4040312, + "step": 6965 + }, + { + "epoch": 1.0381292820971104, + "grad_norm": 6.009830474853516, + "learning_rate": 2.594950848972297e-05, + "loss": 0.7666, + "num_input_tokens_seen": 4043224, + "step": 6970 + }, + { + "epoch": 1.0388739946380696, + "grad_norm": 6.735565662384033, + "learning_rate": 2.5968126303246947e-05, + "loss": 0.6994, + "num_input_tokens_seen": 4046168, + "step": 6975 + }, + { + "epoch": 1.0396187071790288, + "grad_norm": 10.574033737182617, + "learning_rate": 2.598674411677093e-05, + "loss": 0.6221, + "num_input_tokens_seen": 4049080, + "step": 6980 + }, + { + "epoch": 1.040363419719988, + "grad_norm": 6.512435436248779, + "learning_rate": 2.6005361930294907e-05, + "loss": 0.6524, + "num_input_tokens_seen": 4052312, + "step": 6985 + }, + { + "epoch": 1.0411081322609472, + "grad_norm": 6.792271137237549, + "learning_rate": 2.602397974381889e-05, + "loss": 0.5089, + "num_input_tokens_seen": 4055000, + "step": 6990 + }, + { + "epoch": 1.0418528448019064, + "grad_norm": 7.768829822540283, + "learning_rate": 2.6042597557342867e-05, + "loss": 0.7244, + "num_input_tokens_seen": 4058136, + "step": 6995 + }, + { + "epoch": 1.0425975573428656, + "grad_norm": 4.831637859344482, + "learning_rate": 2.606121537086685e-05, + "loss": 0.6337, + "num_input_tokens_seen": 4061176, + "step": 7000 + }, + { + "epoch": 1.0433422698838248, + "grad_norm": 5.209082126617432, + "learning_rate": 2.6079833184390827e-05, + "loss": 0.6199, + "num_input_tokens_seen": 4063960, + "step": 7005 + }, + { + "epoch": 1.044086982424784, + "grad_norm": 4.633509635925293, + "learning_rate": 2.609845099791481e-05, + "loss": 0.4791, + "num_input_tokens_seen": 4066872, + "step": 7010 + }, + { + "epoch": 1.0448316949657432, + "grad_norm": 5.300487041473389, + "learning_rate": 2.611706881143879e-05, + "loss": 0.6511, + "num_input_tokens_seen": 4069688, + "step": 7015 + }, + { + "epoch": 1.0455764075067024, + "grad_norm": 6.346035003662109, + "learning_rate": 2.613568662496276e-05, + "loss": 0.665, + "num_input_tokens_seen": 4072504, + "step": 7020 + }, + { + "epoch": 1.0463211200476616, + "grad_norm": 3.644904613494873, + "learning_rate": 2.6154304438486743e-05, + "loss": 0.635, + "num_input_tokens_seen": 4075448, + "step": 7025 + }, + { + "epoch": 1.0470658325886208, + "grad_norm": 6.204829216003418, + "learning_rate": 2.6172922252010725e-05, + "loss": 0.7273, + "num_input_tokens_seen": 4078168, + "step": 7030 + }, + { + "epoch": 1.04781054512958, + "grad_norm": 7.130980968475342, + "learning_rate": 2.6191540065534703e-05, + "loss": 0.696, + "num_input_tokens_seen": 4081048, + "step": 7035 + }, + { + "epoch": 1.0485552576705393, + "grad_norm": 8.686850547790527, + "learning_rate": 2.6210157879058685e-05, + "loss": 0.6838, + "num_input_tokens_seen": 4083928, + "step": 7040 + }, + { + "epoch": 1.0492999702114985, + "grad_norm": 4.5708465576171875, + "learning_rate": 2.6228775692582663e-05, + "loss": 0.608, + "num_input_tokens_seen": 4086680, + "step": 7045 + }, + { + "epoch": 1.0500446827524577, + "grad_norm": 7.288931369781494, + "learning_rate": 2.6247393506106644e-05, + "loss": 0.7099, + "num_input_tokens_seen": 4089592, + "step": 7050 + }, + { + "epoch": 1.0507893952934166, + "grad_norm": 7.838111400604248, + "learning_rate": 2.6266011319630623e-05, + "loss": 0.7249, + "num_input_tokens_seen": 4092504, + "step": 7055 + }, + { + "epoch": 1.0515341078343758, + "grad_norm": 2.952751398086548, + "learning_rate": 2.6284629133154604e-05, + "loss": 0.6071, + "num_input_tokens_seen": 4095576, + "step": 7060 + }, + { + "epoch": 1.052278820375335, + "grad_norm": 5.948769569396973, + "learning_rate": 2.6303246946678582e-05, + "loss": 0.5797, + "num_input_tokens_seen": 4098744, + "step": 7065 + }, + { + "epoch": 1.0530235329162942, + "grad_norm": 4.351602077484131, + "learning_rate": 2.6321864760202564e-05, + "loss": 0.6667, + "num_input_tokens_seen": 4101624, + "step": 7070 + }, + { + "epoch": 1.0537682454572534, + "grad_norm": 5.118619441986084, + "learning_rate": 2.6340482573726542e-05, + "loss": 0.5865, + "num_input_tokens_seen": 4104856, + "step": 7075 + }, + { + "epoch": 1.0545129579982127, + "grad_norm": 3.3546996116638184, + "learning_rate": 2.6359100387250524e-05, + "loss": 0.6052, + "num_input_tokens_seen": 4107800, + "step": 7080 + }, + { + "epoch": 1.0552576705391719, + "grad_norm": 8.993636131286621, + "learning_rate": 2.6377718200774502e-05, + "loss": 0.5599, + "num_input_tokens_seen": 4110744, + "step": 7085 + }, + { + "epoch": 1.056002383080131, + "grad_norm": 7.811604022979736, + "learning_rate": 2.6396336014298484e-05, + "loss": 0.7548, + "num_input_tokens_seen": 4113592, + "step": 7090 + }, + { + "epoch": 1.0567470956210903, + "grad_norm": 4.3658366203308105, + "learning_rate": 2.6414953827822465e-05, + "loss": 0.5791, + "num_input_tokens_seen": 4116472, + "step": 7095 + }, + { + "epoch": 1.0574918081620495, + "grad_norm": 10.809727668762207, + "learning_rate": 2.6433571641346444e-05, + "loss": 0.5365, + "num_input_tokens_seen": 4119288, + "step": 7100 + }, + { + "epoch": 1.0582365207030087, + "grad_norm": 7.103723526000977, + "learning_rate": 2.645218945487042e-05, + "loss": 0.9131, + "num_input_tokens_seen": 4122488, + "step": 7105 + }, + { + "epoch": 1.0589812332439679, + "grad_norm": 8.014568328857422, + "learning_rate": 2.64708072683944e-05, + "loss": 0.616, + "num_input_tokens_seen": 4125432, + "step": 7110 + }, + { + "epoch": 1.059725945784927, + "grad_norm": 5.996285438537598, + "learning_rate": 2.648942508191838e-05, + "loss": 0.5338, + "num_input_tokens_seen": 4128248, + "step": 7115 + }, + { + "epoch": 1.0604706583258863, + "grad_norm": 6.671701908111572, + "learning_rate": 2.650804289544236e-05, + "loss": 0.7045, + "num_input_tokens_seen": 4131256, + "step": 7120 + }, + { + "epoch": 1.0612153708668455, + "grad_norm": 6.3735432624816895, + "learning_rate": 2.6526660708966338e-05, + "loss": 0.8194, + "num_input_tokens_seen": 4134296, + "step": 7125 + }, + { + "epoch": 1.0619600834078047, + "grad_norm": 6.847781181335449, + "learning_rate": 2.654527852249032e-05, + "loss": 0.7326, + "num_input_tokens_seen": 4137112, + "step": 7130 + }, + { + "epoch": 1.0627047959487639, + "grad_norm": 3.433504104614258, + "learning_rate": 2.6563896336014298e-05, + "loss": 0.6969, + "num_input_tokens_seen": 4140024, + "step": 7135 + }, + { + "epoch": 1.063449508489723, + "grad_norm": 5.39858341217041, + "learning_rate": 2.658251414953828e-05, + "loss": 0.5797, + "num_input_tokens_seen": 4142968, + "step": 7140 + }, + { + "epoch": 1.064194221030682, + "grad_norm": 4.266496181488037, + "learning_rate": 2.6601131963062258e-05, + "loss": 0.5766, + "num_input_tokens_seen": 4145784, + "step": 7145 + }, + { + "epoch": 1.0649389335716413, + "grad_norm": 9.916876792907715, + "learning_rate": 2.661974977658624e-05, + "loss": 0.6853, + "num_input_tokens_seen": 4148408, + "step": 7150 + }, + { + "epoch": 1.0656836461126005, + "grad_norm": 4.767374038696289, + "learning_rate": 2.6638367590110218e-05, + "loss": 0.6169, + "num_input_tokens_seen": 4151352, + "step": 7155 + }, + { + "epoch": 1.0664283586535597, + "grad_norm": 4.497918128967285, + "learning_rate": 2.66569854036342e-05, + "loss": 0.5985, + "num_input_tokens_seen": 4154168, + "step": 7160 + }, + { + "epoch": 1.0671730711945189, + "grad_norm": 4.465814590454102, + "learning_rate": 2.667560321715818e-05, + "loss": 0.5236, + "num_input_tokens_seen": 4157240, + "step": 7165 + }, + { + "epoch": 1.067917783735478, + "grad_norm": 6.471644401550293, + "learning_rate": 2.669422103068216e-05, + "loss": 0.6266, + "num_input_tokens_seen": 4160216, + "step": 7170 + }, + { + "epoch": 1.0686624962764373, + "grad_norm": 7.119153022766113, + "learning_rate": 2.671283884420614e-05, + "loss": 0.4459, + "num_input_tokens_seen": 4162968, + "step": 7175 + }, + { + "epoch": 1.0694072088173965, + "grad_norm": 5.626287460327148, + "learning_rate": 2.673145665773012e-05, + "loss": 0.5905, + "num_input_tokens_seen": 4166232, + "step": 7180 + }, + { + "epoch": 1.0701519213583557, + "grad_norm": 8.079995155334473, + "learning_rate": 2.67500744712541e-05, + "loss": 0.6076, + "num_input_tokens_seen": 4169368, + "step": 7185 + }, + { + "epoch": 1.0708966338993149, + "grad_norm": 9.138162612915039, + "learning_rate": 2.676869228477808e-05, + "loss": 0.6623, + "num_input_tokens_seen": 4172344, + "step": 7190 + }, + { + "epoch": 1.071641346440274, + "grad_norm": 10.897883415222168, + "learning_rate": 2.6787310098302054e-05, + "loss": 0.4375, + "num_input_tokens_seen": 4175064, + "step": 7195 + }, + { + "epoch": 1.0723860589812333, + "grad_norm": 9.682476043701172, + "learning_rate": 2.6805927911826035e-05, + "loss": 0.5157, + "num_input_tokens_seen": 4177752, + "step": 7200 + }, + { + "epoch": 1.0731307715221925, + "grad_norm": 15.910842895507812, + "learning_rate": 2.6824545725350014e-05, + "loss": 0.5168, + "num_input_tokens_seen": 4180408, + "step": 7205 + }, + { + "epoch": 1.0738754840631517, + "grad_norm": 6.741359233856201, + "learning_rate": 2.6843163538873995e-05, + "loss": 0.5772, + "num_input_tokens_seen": 4183480, + "step": 7210 + }, + { + "epoch": 1.074620196604111, + "grad_norm": 19.083663940429688, + "learning_rate": 2.6861781352397974e-05, + "loss": 0.6663, + "num_input_tokens_seen": 4186200, + "step": 7215 + }, + { + "epoch": 1.07536490914507, + "grad_norm": 12.973228454589844, + "learning_rate": 2.6880399165921955e-05, + "loss": 0.686, + "num_input_tokens_seen": 4189112, + "step": 7220 + }, + { + "epoch": 1.076109621686029, + "grad_norm": 12.25261116027832, + "learning_rate": 2.6899016979445933e-05, + "loss": 0.7083, + "num_input_tokens_seen": 4192024, + "step": 7225 + }, + { + "epoch": 1.0768543342269883, + "grad_norm": 6.763296604156494, + "learning_rate": 2.6917634792969915e-05, + "loss": 0.7805, + "num_input_tokens_seen": 4194776, + "step": 7230 + }, + { + "epoch": 1.0775990467679475, + "grad_norm": 10.33425521850586, + "learning_rate": 2.6936252606493893e-05, + "loss": 0.7177, + "num_input_tokens_seen": 4197880, + "step": 7235 + }, + { + "epoch": 1.0783437593089067, + "grad_norm": 4.877995491027832, + "learning_rate": 2.6954870420017875e-05, + "loss": 0.8045, + "num_input_tokens_seen": 4200920, + "step": 7240 + }, + { + "epoch": 1.079088471849866, + "grad_norm": 5.407392501831055, + "learning_rate": 2.6973488233541856e-05, + "loss": 0.8014, + "num_input_tokens_seen": 4203832, + "step": 7245 + }, + { + "epoch": 1.079833184390825, + "grad_norm": 3.7631242275238037, + "learning_rate": 2.6992106047065835e-05, + "loss": 0.7319, + "num_input_tokens_seen": 4206872, + "step": 7250 + }, + { + "epoch": 1.0805778969317843, + "grad_norm": 4.461559295654297, + "learning_rate": 2.7010723860589816e-05, + "loss": 0.6599, + "num_input_tokens_seen": 4209688, + "step": 7255 + }, + { + "epoch": 1.0813226094727435, + "grad_norm": 6.730692386627197, + "learning_rate": 2.7029341674113795e-05, + "loss": 0.6887, + "num_input_tokens_seen": 4212600, + "step": 7260 + }, + { + "epoch": 1.0820673220137027, + "grad_norm": 5.1640625, + "learning_rate": 2.7047959487637776e-05, + "loss": 0.6623, + "num_input_tokens_seen": 4215480, + "step": 7265 + }, + { + "epoch": 1.082812034554662, + "grad_norm": 4.550693511962891, + "learning_rate": 2.7066577301161754e-05, + "loss": 0.5322, + "num_input_tokens_seen": 4218360, + "step": 7270 + }, + { + "epoch": 1.083556747095621, + "grad_norm": 5.319338798522949, + "learning_rate": 2.7085195114685736e-05, + "loss": 0.5663, + "num_input_tokens_seen": 4221528, + "step": 7275 + }, + { + "epoch": 1.0843014596365803, + "grad_norm": 4.434370517730713, + "learning_rate": 2.710381292820971e-05, + "loss": 0.5429, + "num_input_tokens_seen": 4224120, + "step": 7280 + }, + { + "epoch": 1.0850461721775395, + "grad_norm": 5.124112129211426, + "learning_rate": 2.712243074173369e-05, + "loss": 0.6384, + "num_input_tokens_seen": 4226840, + "step": 7285 + }, + { + "epoch": 1.0857908847184987, + "grad_norm": 16.25092124938965, + "learning_rate": 2.714104855525767e-05, + "loss": 0.8639, + "num_input_tokens_seen": 4229976, + "step": 7290 + }, + { + "epoch": 1.086535597259458, + "grad_norm": 7.462133407592773, + "learning_rate": 2.715966636878165e-05, + "loss": 0.5966, + "num_input_tokens_seen": 4233080, + "step": 7295 + }, + { + "epoch": 1.0872803098004171, + "grad_norm": 11.337442398071289, + "learning_rate": 2.717828418230563e-05, + "loss": 0.6186, + "num_input_tokens_seen": 4235800, + "step": 7300 + }, + { + "epoch": 1.0880250223413763, + "grad_norm": 8.60619831085205, + "learning_rate": 2.719690199582961e-05, + "loss": 0.6766, + "num_input_tokens_seen": 4238552, + "step": 7305 + }, + { + "epoch": 1.0887697348823355, + "grad_norm": 6.623571395874023, + "learning_rate": 2.721551980935359e-05, + "loss": 0.7578, + "num_input_tokens_seen": 4241624, + "step": 7310 + }, + { + "epoch": 1.0895144474232945, + "grad_norm": 8.479644775390625, + "learning_rate": 2.723413762287757e-05, + "loss": 0.6544, + "num_input_tokens_seen": 4244664, + "step": 7315 + }, + { + "epoch": 1.0902591599642537, + "grad_norm": 7.510477066040039, + "learning_rate": 2.725275543640155e-05, + "loss": 0.6647, + "num_input_tokens_seen": 4247512, + "step": 7320 + }, + { + "epoch": 1.091003872505213, + "grad_norm": 5.5192790031433105, + "learning_rate": 2.7271373249925532e-05, + "loss": 0.6685, + "num_input_tokens_seen": 4250392, + "step": 7325 + }, + { + "epoch": 1.0917485850461721, + "grad_norm": 5.37265682220459, + "learning_rate": 2.728999106344951e-05, + "loss": 0.662, + "num_input_tokens_seen": 4253496, + "step": 7330 + }, + { + "epoch": 1.0924932975871313, + "grad_norm": 5.506422996520996, + "learning_rate": 2.7308608876973492e-05, + "loss": 0.5748, + "num_input_tokens_seen": 4256472, + "step": 7335 + }, + { + "epoch": 1.0932380101280905, + "grad_norm": 10.559425354003906, + "learning_rate": 2.732722669049747e-05, + "loss": 0.6279, + "num_input_tokens_seen": 4259256, + "step": 7340 + }, + { + "epoch": 1.0939827226690497, + "grad_norm": 5.470729827880859, + "learning_rate": 2.734584450402145e-05, + "loss": 0.5965, + "num_input_tokens_seen": 4261816, + "step": 7345 + }, + { + "epoch": 1.094727435210009, + "grad_norm": 5.486850738525391, + "learning_rate": 2.736446231754543e-05, + "loss": 0.6013, + "num_input_tokens_seen": 4265048, + "step": 7350 + }, + { + "epoch": 1.0954721477509681, + "grad_norm": 9.194994926452637, + "learning_rate": 2.738308013106941e-05, + "loss": 0.6968, + "num_input_tokens_seen": 4267864, + "step": 7355 + }, + { + "epoch": 1.0962168602919273, + "grad_norm": 6.690040588378906, + "learning_rate": 2.740169794459339e-05, + "loss": 0.4649, + "num_input_tokens_seen": 4270776, + "step": 7360 + }, + { + "epoch": 1.0969615728328865, + "grad_norm": 6.794023513793945, + "learning_rate": 2.7420315758117365e-05, + "loss": 0.6117, + "num_input_tokens_seen": 4273656, + "step": 7365 + }, + { + "epoch": 1.0977062853738457, + "grad_norm": 4.946835517883301, + "learning_rate": 2.7438933571641346e-05, + "loss": 0.4264, + "num_input_tokens_seen": 4276408, + "step": 7370 + }, + { + "epoch": 1.098450997914805, + "grad_norm": 7.775631904602051, + "learning_rate": 2.7457551385165324e-05, + "loss": 0.6116, + "num_input_tokens_seen": 4279768, + "step": 7375 + }, + { + "epoch": 1.0991957104557641, + "grad_norm": 5.294595241546631, + "learning_rate": 2.7476169198689306e-05, + "loss": 0.7168, + "num_input_tokens_seen": 4282712, + "step": 7380 + }, + { + "epoch": 1.0999404229967233, + "grad_norm": 4.971120357513428, + "learning_rate": 2.7494787012213284e-05, + "loss": 0.673, + "num_input_tokens_seen": 4285784, + "step": 7385 + }, + { + "epoch": 1.1006851355376825, + "grad_norm": 6.023274898529053, + "learning_rate": 2.7513404825737266e-05, + "loss": 0.671, + "num_input_tokens_seen": 4288824, + "step": 7390 + }, + { + "epoch": 1.1014298480786415, + "grad_norm": 4.8932671546936035, + "learning_rate": 2.7532022639261244e-05, + "loss": 0.557, + "num_input_tokens_seen": 4291384, + "step": 7395 + }, + { + "epoch": 1.1021745606196007, + "grad_norm": 8.825800895690918, + "learning_rate": 2.7550640452785226e-05, + "loss": 0.7123, + "num_input_tokens_seen": 4294200, + "step": 7400 + }, + { + "epoch": 1.10291927316056, + "grad_norm": 6.011749744415283, + "learning_rate": 2.7569258266309207e-05, + "loss": 0.5486, + "num_input_tokens_seen": 4297176, + "step": 7405 + }, + { + "epoch": 1.1036639857015191, + "grad_norm": 6.256982326507568, + "learning_rate": 2.7587876079833186e-05, + "loss": 0.8307, + "num_input_tokens_seen": 4299992, + "step": 7410 + }, + { + "epoch": 1.1044086982424783, + "grad_norm": 9.81971549987793, + "learning_rate": 2.7606493893357167e-05, + "loss": 0.4437, + "num_input_tokens_seen": 4303320, + "step": 7415 + }, + { + "epoch": 1.1051534107834375, + "grad_norm": 11.376420021057129, + "learning_rate": 2.7625111706881145e-05, + "loss": 0.4911, + "num_input_tokens_seen": 4306232, + "step": 7420 + }, + { + "epoch": 1.1058981233243967, + "grad_norm": 5.575237274169922, + "learning_rate": 2.7643729520405127e-05, + "loss": 0.5572, + "num_input_tokens_seen": 4309400, + "step": 7425 + }, + { + "epoch": 1.106642835865356, + "grad_norm": 7.723153114318848, + "learning_rate": 2.7662347333929105e-05, + "loss": 0.7367, + "num_input_tokens_seen": 4312376, + "step": 7430 + }, + { + "epoch": 1.1073875484063151, + "grad_norm": 7.192080020904541, + "learning_rate": 2.7680965147453087e-05, + "loss": 0.594, + "num_input_tokens_seen": 4316280, + "step": 7435 + }, + { + "epoch": 1.1081322609472744, + "grad_norm": 7.124946594238281, + "learning_rate": 2.7699582960977065e-05, + "loss": 0.6129, + "num_input_tokens_seen": 4319000, + "step": 7440 + }, + { + "epoch": 1.1088769734882336, + "grad_norm": 14.096083641052246, + "learning_rate": 2.7718200774501047e-05, + "loss": 0.5968, + "num_input_tokens_seen": 4321912, + "step": 7445 + }, + { + "epoch": 1.1096216860291928, + "grad_norm": 4.969721794128418, + "learning_rate": 2.7736818588025025e-05, + "loss": 0.5407, + "num_input_tokens_seen": 4325304, + "step": 7450 + }, + { + "epoch": 1.110366398570152, + "grad_norm": 6.131608486175537, + "learning_rate": 2.7755436401549e-05, + "loss": 0.6767, + "num_input_tokens_seen": 4328280, + "step": 7455 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 7.544022560119629, + "learning_rate": 2.777405421507298e-05, + "loss": 0.4878, + "num_input_tokens_seen": 4331032, + "step": 7460 + }, + { + "epoch": 1.1118558236520704, + "grad_norm": 14.608673095703125, + "learning_rate": 2.779267202859696e-05, + "loss": 0.9704, + "num_input_tokens_seen": 4334104, + "step": 7465 + }, + { + "epoch": 1.1126005361930296, + "grad_norm": 8.563528060913086, + "learning_rate": 2.781128984212094e-05, + "loss": 0.8435, + "num_input_tokens_seen": 4336952, + "step": 7470 + }, + { + "epoch": 1.1133452487339888, + "grad_norm": 9.404393196105957, + "learning_rate": 2.782990765564492e-05, + "loss": 0.6124, + "num_input_tokens_seen": 4340088, + "step": 7475 + }, + { + "epoch": 1.114089961274948, + "grad_norm": 2.2673354148864746, + "learning_rate": 2.78485254691689e-05, + "loss": 0.4214, + "num_input_tokens_seen": 4342872, + "step": 7480 + }, + { + "epoch": 1.1148346738159072, + "grad_norm": 7.5797905921936035, + "learning_rate": 2.7867143282692883e-05, + "loss": 0.6837, + "num_input_tokens_seen": 4345752, + "step": 7485 + }, + { + "epoch": 1.1155793863568662, + "grad_norm": 4.082639217376709, + "learning_rate": 2.788576109621686e-05, + "loss": 0.6202, + "num_input_tokens_seen": 4348376, + "step": 7490 + }, + { + "epoch": 1.1163240988978254, + "grad_norm": 5.711368560791016, + "learning_rate": 2.7904378909740843e-05, + "loss": 0.6775, + "num_input_tokens_seen": 4351256, + "step": 7495 + }, + { + "epoch": 1.1170688114387846, + "grad_norm": 5.393494606018066, + "learning_rate": 2.792299672326482e-05, + "loss": 0.5922, + "num_input_tokens_seen": 4354232, + "step": 7500 + }, + { + "epoch": 1.1178135239797438, + "grad_norm": 5.478439807891846, + "learning_rate": 2.7941614536788802e-05, + "loss": 0.6328, + "num_input_tokens_seen": 4356920, + "step": 7505 + }, + { + "epoch": 1.118558236520703, + "grad_norm": 6.823614597320557, + "learning_rate": 2.796023235031278e-05, + "loss": 0.637, + "num_input_tokens_seen": 4359928, + "step": 7510 + }, + { + "epoch": 1.1193029490616622, + "grad_norm": 9.226414680480957, + "learning_rate": 2.7978850163836762e-05, + "loss": 0.554, + "num_input_tokens_seen": 4362840, + "step": 7515 + }, + { + "epoch": 1.1200476616026214, + "grad_norm": 7.386566638946533, + "learning_rate": 2.799746797736074e-05, + "loss": 0.676, + "num_input_tokens_seen": 4365624, + "step": 7520 + }, + { + "epoch": 1.1207923741435806, + "grad_norm": 12.134710311889648, + "learning_rate": 2.8016085790884722e-05, + "loss": 0.5536, + "num_input_tokens_seen": 4368664, + "step": 7525 + }, + { + "epoch": 1.1215370866845398, + "grad_norm": 9.568784713745117, + "learning_rate": 2.80347036044087e-05, + "loss": 0.7402, + "num_input_tokens_seen": 4371736, + "step": 7530 + }, + { + "epoch": 1.122281799225499, + "grad_norm": 9.035012245178223, + "learning_rate": 2.8053321417932682e-05, + "loss": 0.5546, + "num_input_tokens_seen": 4374808, + "step": 7535 + }, + { + "epoch": 1.1230265117664582, + "grad_norm": 6.254203796386719, + "learning_rate": 2.8071939231456657e-05, + "loss": 0.5075, + "num_input_tokens_seen": 4377560, + "step": 7540 + }, + { + "epoch": 1.1237712243074174, + "grad_norm": 7.990253925323486, + "learning_rate": 2.8090557044980635e-05, + "loss": 0.7587, + "num_input_tokens_seen": 4380472, + "step": 7545 + }, + { + "epoch": 1.1245159368483766, + "grad_norm": 9.651931762695312, + "learning_rate": 2.8109174858504617e-05, + "loss": 0.5427, + "num_input_tokens_seen": 4383352, + "step": 7550 + }, + { + "epoch": 1.1252606493893358, + "grad_norm": 10.857378959655762, + "learning_rate": 2.81277926720286e-05, + "loss": 0.5538, + "num_input_tokens_seen": 4386072, + "step": 7555 + }, + { + "epoch": 1.126005361930295, + "grad_norm": 10.037781715393066, + "learning_rate": 2.8146410485552577e-05, + "loss": 0.7129, + "num_input_tokens_seen": 4388920, + "step": 7560 + }, + { + "epoch": 1.1267500744712542, + "grad_norm": 6.617676258087158, + "learning_rate": 2.8165028299076558e-05, + "loss": 0.6802, + "num_input_tokens_seen": 4391864, + "step": 7565 + }, + { + "epoch": 1.1274947870122132, + "grad_norm": 3.742067337036133, + "learning_rate": 2.8183646112600536e-05, + "loss": 0.5013, + "num_input_tokens_seen": 4394808, + "step": 7570 + }, + { + "epoch": 1.1282394995531724, + "grad_norm": 7.760630130767822, + "learning_rate": 2.8202263926124518e-05, + "loss": 0.6492, + "num_input_tokens_seen": 4397592, + "step": 7575 + }, + { + "epoch": 1.1289842120941316, + "grad_norm": 8.952857971191406, + "learning_rate": 2.8220881739648496e-05, + "loss": 0.8253, + "num_input_tokens_seen": 4400760, + "step": 7580 + }, + { + "epoch": 1.1297289246350908, + "grad_norm": 6.007181167602539, + "learning_rate": 2.8239499553172478e-05, + "loss": 0.4755, + "num_input_tokens_seen": 4403832, + "step": 7585 + }, + { + "epoch": 1.13047363717605, + "grad_norm": 12.239757537841797, + "learning_rate": 2.8258117366696456e-05, + "loss": 0.7971, + "num_input_tokens_seen": 4406552, + "step": 7590 + }, + { + "epoch": 1.1312183497170092, + "grad_norm": 5.294753074645996, + "learning_rate": 2.8276735180220438e-05, + "loss": 0.4782, + "num_input_tokens_seen": 4409400, + "step": 7595 + }, + { + "epoch": 1.1319630622579684, + "grad_norm": 7.414255142211914, + "learning_rate": 2.8295352993744416e-05, + "loss": 0.5635, + "num_input_tokens_seen": 4412248, + "step": 7600 + }, + { + "epoch": 1.1327077747989276, + "grad_norm": 6.744398593902588, + "learning_rate": 2.8313970807268398e-05, + "loss": 0.6245, + "num_input_tokens_seen": 4415160, + "step": 7605 + }, + { + "epoch": 1.1334524873398868, + "grad_norm": 11.81274700164795, + "learning_rate": 2.8332588620792376e-05, + "loss": 0.6909, + "num_input_tokens_seen": 4418104, + "step": 7610 + }, + { + "epoch": 1.134197199880846, + "grad_norm": 4.19581937789917, + "learning_rate": 2.8351206434316357e-05, + "loss": 0.6688, + "num_input_tokens_seen": 4420952, + "step": 7615 + }, + { + "epoch": 1.1349419124218052, + "grad_norm": 22.663501739501953, + "learning_rate": 2.836982424784034e-05, + "loss": 0.7223, + "num_input_tokens_seen": 4423896, + "step": 7620 + }, + { + "epoch": 1.1356866249627644, + "grad_norm": 8.039443969726562, + "learning_rate": 2.838844206136431e-05, + "loss": 0.761, + "num_input_tokens_seen": 4427000, + "step": 7625 + }, + { + "epoch": 1.1364313375037236, + "grad_norm": 4.30892276763916, + "learning_rate": 2.8407059874888292e-05, + "loss": 0.6732, + "num_input_tokens_seen": 4429880, + "step": 7630 + }, + { + "epoch": 1.1371760500446828, + "grad_norm": 12.54541301727295, + "learning_rate": 2.8425677688412274e-05, + "loss": 0.6173, + "num_input_tokens_seen": 4433016, + "step": 7635 + }, + { + "epoch": 1.137920762585642, + "grad_norm": 4.3180718421936035, + "learning_rate": 2.8444295501936252e-05, + "loss": 0.6372, + "num_input_tokens_seen": 4436184, + "step": 7640 + }, + { + "epoch": 1.1386654751266012, + "grad_norm": 3.810415029525757, + "learning_rate": 2.8462913315460234e-05, + "loss": 0.7259, + "num_input_tokens_seen": 4439256, + "step": 7645 + }, + { + "epoch": 1.1394101876675604, + "grad_norm": 5.48486852645874, + "learning_rate": 2.8481531128984212e-05, + "loss": 0.5381, + "num_input_tokens_seen": 4442584, + "step": 7650 + }, + { + "epoch": 1.1401549002085196, + "grad_norm": 4.452991008758545, + "learning_rate": 2.8500148942508193e-05, + "loss": 0.6186, + "num_input_tokens_seen": 4445464, + "step": 7655 + }, + { + "epoch": 1.1408996127494788, + "grad_norm": 5.643124580383301, + "learning_rate": 2.8518766756032172e-05, + "loss": 0.602, + "num_input_tokens_seen": 4448184, + "step": 7660 + }, + { + "epoch": 1.1416443252904378, + "grad_norm": 8.920783042907715, + "learning_rate": 2.8537384569556153e-05, + "loss": 0.8068, + "num_input_tokens_seen": 4451608, + "step": 7665 + }, + { + "epoch": 1.142389037831397, + "grad_norm": 9.681857109069824, + "learning_rate": 2.855600238308013e-05, + "loss": 0.5358, + "num_input_tokens_seen": 4454296, + "step": 7670 + }, + { + "epoch": 1.1431337503723562, + "grad_norm": 7.637612819671631, + "learning_rate": 2.8574620196604113e-05, + "loss": 0.6165, + "num_input_tokens_seen": 4457208, + "step": 7675 + }, + { + "epoch": 1.1438784629133154, + "grad_norm": 8.906938552856445, + "learning_rate": 2.859323801012809e-05, + "loss": 0.4988, + "num_input_tokens_seen": 4460504, + "step": 7680 + }, + { + "epoch": 1.1446231754542746, + "grad_norm": 15.207817077636719, + "learning_rate": 2.8611855823652073e-05, + "loss": 0.6734, + "num_input_tokens_seen": 4463320, + "step": 7685 + }, + { + "epoch": 1.1453678879952338, + "grad_norm": 7.276523113250732, + "learning_rate": 2.863047363717605e-05, + "loss": 0.7281, + "num_input_tokens_seen": 4466232, + "step": 7690 + }, + { + "epoch": 1.146112600536193, + "grad_norm": 11.490259170532227, + "learning_rate": 2.8649091450700033e-05, + "loss": 0.6378, + "num_input_tokens_seen": 4469240, + "step": 7695 + }, + { + "epoch": 1.1468573130771522, + "grad_norm": 6.683349132537842, + "learning_rate": 2.8667709264224015e-05, + "loss": 0.4198, + "num_input_tokens_seen": 4472024, + "step": 7700 + }, + { + "epoch": 1.1476020256181114, + "grad_norm": 11.634358406066895, + "learning_rate": 2.8686327077747993e-05, + "loss": 0.5813, + "num_input_tokens_seen": 4475128, + "step": 7705 + }, + { + "epoch": 1.1483467381590706, + "grad_norm": 9.133639335632324, + "learning_rate": 2.8704944891271968e-05, + "loss": 0.6463, + "num_input_tokens_seen": 4478168, + "step": 7710 + }, + { + "epoch": 1.1490914507000298, + "grad_norm": 6.203179836273193, + "learning_rate": 2.872356270479595e-05, + "loss": 0.5505, + "num_input_tokens_seen": 4481080, + "step": 7715 + }, + { + "epoch": 1.149836163240989, + "grad_norm": 6.04938268661499, + "learning_rate": 2.8742180518319927e-05, + "loss": 0.6769, + "num_input_tokens_seen": 4483992, + "step": 7720 + }, + { + "epoch": 1.1505808757819482, + "grad_norm": 8.8928804397583, + "learning_rate": 2.876079833184391e-05, + "loss": 0.7628, + "num_input_tokens_seen": 4487192, + "step": 7725 + }, + { + "epoch": 1.1513255883229074, + "grad_norm": 9.838767051696777, + "learning_rate": 2.8779416145367887e-05, + "loss": 0.6586, + "num_input_tokens_seen": 4490552, + "step": 7730 + }, + { + "epoch": 1.1520703008638666, + "grad_norm": 4.795233726501465, + "learning_rate": 2.879803395889187e-05, + "loss": 0.6422, + "num_input_tokens_seen": 4493464, + "step": 7735 + }, + { + "epoch": 1.1528150134048256, + "grad_norm": 4.9206647872924805, + "learning_rate": 2.8816651772415847e-05, + "loss": 0.5671, + "num_input_tokens_seen": 4496216, + "step": 7740 + }, + { + "epoch": 1.1535597259457848, + "grad_norm": 4.489410400390625, + "learning_rate": 2.883526958593983e-05, + "loss": 0.7028, + "num_input_tokens_seen": 4499160, + "step": 7745 + }, + { + "epoch": 1.154304438486744, + "grad_norm": 4.7496657371521, + "learning_rate": 2.8853887399463807e-05, + "loss": 0.5477, + "num_input_tokens_seen": 4501912, + "step": 7750 + }, + { + "epoch": 1.1550491510277032, + "grad_norm": 7.17208194732666, + "learning_rate": 2.887250521298779e-05, + "loss": 0.6039, + "num_input_tokens_seen": 4504568, + "step": 7755 + }, + { + "epoch": 1.1557938635686624, + "grad_norm": 4.39135217666626, + "learning_rate": 2.8891123026511767e-05, + "loss": 0.6728, + "num_input_tokens_seen": 4507320, + "step": 7760 + }, + { + "epoch": 1.1565385761096216, + "grad_norm": 6.2423529624938965, + "learning_rate": 2.890974084003575e-05, + "loss": 0.6207, + "num_input_tokens_seen": 4509976, + "step": 7765 + }, + { + "epoch": 1.1572832886505808, + "grad_norm": 4.203886985778809, + "learning_rate": 2.892835865355973e-05, + "loss": 0.6591, + "num_input_tokens_seen": 4512760, + "step": 7770 + }, + { + "epoch": 1.15802800119154, + "grad_norm": 5.622302532196045, + "learning_rate": 2.894697646708371e-05, + "loss": 0.7353, + "num_input_tokens_seen": 4515928, + "step": 7775 + }, + { + "epoch": 1.1587727137324992, + "grad_norm": 5.478921413421631, + "learning_rate": 2.896559428060769e-05, + "loss": 0.5965, + "num_input_tokens_seen": 4518872, + "step": 7780 + }, + { + "epoch": 1.1595174262734584, + "grad_norm": 3.884434938430786, + "learning_rate": 2.8984212094131668e-05, + "loss": 0.7514, + "num_input_tokens_seen": 4521784, + "step": 7785 + }, + { + "epoch": 1.1602621388144176, + "grad_norm": 4.498055934906006, + "learning_rate": 2.900282990765565e-05, + "loss": 0.6602, + "num_input_tokens_seen": 4524824, + "step": 7790 + }, + { + "epoch": 1.1610068513553768, + "grad_norm": 4.390188694000244, + "learning_rate": 2.9021447721179628e-05, + "loss": 0.6818, + "num_input_tokens_seen": 4527640, + "step": 7795 + }, + { + "epoch": 1.161751563896336, + "grad_norm": 4.6090192794799805, + "learning_rate": 2.9040065534703603e-05, + "loss": 0.6479, + "num_input_tokens_seen": 4530392, + "step": 7800 + }, + { + "epoch": 1.1624962764372953, + "grad_norm": 7.078320026397705, + "learning_rate": 2.9058683348227584e-05, + "loss": 0.6424, + "num_input_tokens_seen": 4533016, + "step": 7805 + }, + { + "epoch": 1.1632409889782545, + "grad_norm": 5.398780345916748, + "learning_rate": 2.9077301161751563e-05, + "loss": 0.5648, + "num_input_tokens_seen": 4536120, + "step": 7810 + }, + { + "epoch": 1.1639857015192137, + "grad_norm": 3.5169358253479004, + "learning_rate": 2.9095918975275544e-05, + "loss": 0.6198, + "num_input_tokens_seen": 4539160, + "step": 7815 + }, + { + "epoch": 1.1647304140601729, + "grad_norm": 6.637011528015137, + "learning_rate": 2.9114536788799523e-05, + "loss": 0.5399, + "num_input_tokens_seen": 4542232, + "step": 7820 + }, + { + "epoch": 1.165475126601132, + "grad_norm": 7.1675519943237305, + "learning_rate": 2.9133154602323504e-05, + "loss": 0.5496, + "num_input_tokens_seen": 4545176, + "step": 7825 + }, + { + "epoch": 1.1662198391420913, + "grad_norm": 6.510587215423584, + "learning_rate": 2.9151772415847482e-05, + "loss": 0.6523, + "num_input_tokens_seen": 4548248, + "step": 7830 + }, + { + "epoch": 1.1669645516830505, + "grad_norm": 11.904876708984375, + "learning_rate": 2.9170390229371464e-05, + "loss": 0.5643, + "num_input_tokens_seen": 4551224, + "step": 7835 + }, + { + "epoch": 1.1677092642240094, + "grad_norm": 8.858918190002441, + "learning_rate": 2.9189008042895442e-05, + "loss": 0.3931, + "num_input_tokens_seen": 4553944, + "step": 7840 + }, + { + "epoch": 1.1684539767649686, + "grad_norm": 9.682218551635742, + "learning_rate": 2.9207625856419424e-05, + "loss": 0.5797, + "num_input_tokens_seen": 4556888, + "step": 7845 + }, + { + "epoch": 1.1691986893059279, + "grad_norm": 7.102034568786621, + "learning_rate": 2.9226243669943406e-05, + "loss": 0.8186, + "num_input_tokens_seen": 4559704, + "step": 7850 + }, + { + "epoch": 1.169943401846887, + "grad_norm": 8.222249031066895, + "learning_rate": 2.9244861483467384e-05, + "loss": 0.6355, + "num_input_tokens_seen": 4562456, + "step": 7855 + }, + { + "epoch": 1.1706881143878463, + "grad_norm": 14.288698196411133, + "learning_rate": 2.9263479296991365e-05, + "loss": 0.638, + "num_input_tokens_seen": 4565240, + "step": 7860 + }, + { + "epoch": 1.1714328269288055, + "grad_norm": 13.52371597290039, + "learning_rate": 2.9282097110515344e-05, + "loss": 0.5614, + "num_input_tokens_seen": 4568440, + "step": 7865 + }, + { + "epoch": 1.1721775394697647, + "grad_norm": 6.889704704284668, + "learning_rate": 2.9300714924039325e-05, + "loss": 0.7023, + "num_input_tokens_seen": 4571512, + "step": 7870 + }, + { + "epoch": 1.1729222520107239, + "grad_norm": 8.78702163696289, + "learning_rate": 2.9319332737563303e-05, + "loss": 0.6292, + "num_input_tokens_seen": 4574392, + "step": 7875 + }, + { + "epoch": 1.173666964551683, + "grad_norm": 6.199235439300537, + "learning_rate": 2.9337950551087285e-05, + "loss": 0.6483, + "num_input_tokens_seen": 4577304, + "step": 7880 + }, + { + "epoch": 1.1744116770926423, + "grad_norm": 8.518688201904297, + "learning_rate": 2.935656836461126e-05, + "loss": 0.6469, + "num_input_tokens_seen": 4580440, + "step": 7885 + }, + { + "epoch": 1.1751563896336015, + "grad_norm": 6.634400844573975, + "learning_rate": 2.9375186178135238e-05, + "loss": 0.8698, + "num_input_tokens_seen": 4583320, + "step": 7890 + }, + { + "epoch": 1.1759011021745607, + "grad_norm": 8.03706169128418, + "learning_rate": 2.939380399165922e-05, + "loss": 0.6118, + "num_input_tokens_seen": 4586296, + "step": 7895 + }, + { + "epoch": 1.1766458147155199, + "grad_norm": 9.280374526977539, + "learning_rate": 2.9412421805183198e-05, + "loss": 0.8029, + "num_input_tokens_seen": 4589240, + "step": 7900 + }, + { + "epoch": 1.177390527256479, + "grad_norm": 6.562958240509033, + "learning_rate": 2.943103961870718e-05, + "loss": 0.6461, + "num_input_tokens_seen": 4592248, + "step": 7905 + }, + { + "epoch": 1.1781352397974383, + "grad_norm": 5.605023384094238, + "learning_rate": 2.9449657432231158e-05, + "loss": 0.7138, + "num_input_tokens_seen": 4595192, + "step": 7910 + }, + { + "epoch": 1.1788799523383973, + "grad_norm": 3.218932628631592, + "learning_rate": 2.946827524575514e-05, + "loss": 0.7155, + "num_input_tokens_seen": 4597912, + "step": 7915 + }, + { + "epoch": 1.1796246648793565, + "grad_norm": 7.768893718719482, + "learning_rate": 2.9486893059279118e-05, + "loss": 0.5898, + "num_input_tokens_seen": 4600696, + "step": 7920 + }, + { + "epoch": 1.1803693774203157, + "grad_norm": 2.9819021224975586, + "learning_rate": 2.95055108728031e-05, + "loss": 0.6965, + "num_input_tokens_seen": 4603704, + "step": 7925 + }, + { + "epoch": 1.1811140899612749, + "grad_norm": 4.9914398193359375, + "learning_rate": 2.952412868632708e-05, + "loss": 0.5408, + "num_input_tokens_seen": 4606424, + "step": 7930 + }, + { + "epoch": 1.181858802502234, + "grad_norm": 6.8143110275268555, + "learning_rate": 2.954274649985106e-05, + "loss": 0.6847, + "num_input_tokens_seen": 4609240, + "step": 7935 + }, + { + "epoch": 1.1826035150431933, + "grad_norm": 8.377547264099121, + "learning_rate": 2.956136431337504e-05, + "loss": 0.739, + "num_input_tokens_seen": 4612088, + "step": 7940 + }, + { + "epoch": 1.1833482275841525, + "grad_norm": 3.722224712371826, + "learning_rate": 2.957998212689902e-05, + "loss": 0.5757, + "num_input_tokens_seen": 4614744, + "step": 7945 + }, + { + "epoch": 1.1840929401251117, + "grad_norm": 6.736315727233887, + "learning_rate": 2.9598599940423e-05, + "loss": 0.4826, + "num_input_tokens_seen": 4617400, + "step": 7950 + }, + { + "epoch": 1.1848376526660709, + "grad_norm": 7.93727970123291, + "learning_rate": 2.961721775394698e-05, + "loss": 0.3462, + "num_input_tokens_seen": 4620344, + "step": 7955 + }, + { + "epoch": 1.18558236520703, + "grad_norm": 9.939553260803223, + "learning_rate": 2.963583556747096e-05, + "loss": 0.7422, + "num_input_tokens_seen": 4623384, + "step": 7960 + }, + { + "epoch": 1.1863270777479893, + "grad_norm": 4.871148586273193, + "learning_rate": 2.965445338099494e-05, + "loss": 0.6275, + "num_input_tokens_seen": 4626424, + "step": 7965 + }, + { + "epoch": 1.1870717902889485, + "grad_norm": 7.22058629989624, + "learning_rate": 2.9673071194518914e-05, + "loss": 0.6817, + "num_input_tokens_seen": 4628984, + "step": 7970 + }, + { + "epoch": 1.1878165028299077, + "grad_norm": 4.34166955947876, + "learning_rate": 2.9691689008042895e-05, + "loss": 0.4428, + "num_input_tokens_seen": 4631864, + "step": 7975 + }, + { + "epoch": 1.188561215370867, + "grad_norm": 8.148954391479492, + "learning_rate": 2.9710306821566873e-05, + "loss": 0.5784, + "num_input_tokens_seen": 4634808, + "step": 7980 + }, + { + "epoch": 1.189305927911826, + "grad_norm": 4.527592658996582, + "learning_rate": 2.9728924635090855e-05, + "loss": 0.691, + "num_input_tokens_seen": 4637560, + "step": 7985 + }, + { + "epoch": 1.1900506404527853, + "grad_norm": 7.094540119171143, + "learning_rate": 2.9747542448614833e-05, + "loss": 0.5465, + "num_input_tokens_seen": 4640728, + "step": 7990 + }, + { + "epoch": 1.1907953529937445, + "grad_norm": 9.484989166259766, + "learning_rate": 2.9766160262138815e-05, + "loss": 0.7446, + "num_input_tokens_seen": 4643384, + "step": 7995 + }, + { + "epoch": 1.1915400655347037, + "grad_norm": 15.307987213134766, + "learning_rate": 2.9784778075662793e-05, + "loss": 0.5749, + "num_input_tokens_seen": 4646040, + "step": 8000 + }, + { + "epoch": 1.192284778075663, + "grad_norm": 6.854035377502441, + "learning_rate": 2.9803395889186775e-05, + "loss": 0.7853, + "num_input_tokens_seen": 4648728, + "step": 8005 + }, + { + "epoch": 1.193029490616622, + "grad_norm": 4.83715295791626, + "learning_rate": 2.9822013702710756e-05, + "loss": 0.613, + "num_input_tokens_seen": 4651672, + "step": 8010 + }, + { + "epoch": 1.193774203157581, + "grad_norm": 4.5560383796691895, + "learning_rate": 2.9840631516234735e-05, + "loss": 0.561, + "num_input_tokens_seen": 4654520, + "step": 8015 + }, + { + "epoch": 1.1945189156985403, + "grad_norm": 3.45894455909729, + "learning_rate": 2.9859249329758716e-05, + "loss": 0.6138, + "num_input_tokens_seen": 4657432, + "step": 8020 + }, + { + "epoch": 1.1952636282394995, + "grad_norm": 5.902529716491699, + "learning_rate": 2.9877867143282694e-05, + "loss": 0.5007, + "num_input_tokens_seen": 4660280, + "step": 8025 + }, + { + "epoch": 1.1960083407804587, + "grad_norm": 9.074432373046875, + "learning_rate": 2.9896484956806676e-05, + "loss": 0.4507, + "num_input_tokens_seen": 4663032, + "step": 8030 + }, + { + "epoch": 1.196753053321418, + "grad_norm": 15.138032913208008, + "learning_rate": 2.9915102770330654e-05, + "loss": 0.5789, + "num_input_tokens_seen": 4666136, + "step": 8035 + }, + { + "epoch": 1.197497765862377, + "grad_norm": 7.44838285446167, + "learning_rate": 2.9933720583854636e-05, + "loss": 0.4184, + "num_input_tokens_seen": 4668952, + "step": 8040 + }, + { + "epoch": 1.1982424784033363, + "grad_norm": 6.885765552520752, + "learning_rate": 2.9952338397378614e-05, + "loss": 0.6217, + "num_input_tokens_seen": 4671928, + "step": 8045 + }, + { + "epoch": 1.1989871909442955, + "grad_norm": 6.384573936462402, + "learning_rate": 2.9970956210902596e-05, + "loss": 0.6525, + "num_input_tokens_seen": 4674616, + "step": 8050 + }, + { + "epoch": 1.1997319034852547, + "grad_norm": 6.361767292022705, + "learning_rate": 2.9989574024426574e-05, + "loss": 0.6945, + "num_input_tokens_seen": 4677368, + "step": 8055 + }, + { + "epoch": 1.200476616026214, + "grad_norm": 4.040438175201416, + "learning_rate": 3.000819183795055e-05, + "loss": 0.4971, + "num_input_tokens_seen": 4679992, + "step": 8060 + }, + { + "epoch": 1.2012213285671731, + "grad_norm": 11.875975608825684, + "learning_rate": 3.002680965147453e-05, + "loss": 0.7794, + "num_input_tokens_seen": 4683000, + "step": 8065 + }, + { + "epoch": 1.2019660411081323, + "grad_norm": 16.295242309570312, + "learning_rate": 3.004542746499851e-05, + "loss": 0.6157, + "num_input_tokens_seen": 4685848, + "step": 8070 + }, + { + "epoch": 1.2027107536490915, + "grad_norm": 11.202716827392578, + "learning_rate": 3.006404527852249e-05, + "loss": 0.7721, + "num_input_tokens_seen": 4688664, + "step": 8075 + }, + { + "epoch": 1.2034554661900507, + "grad_norm": 7.776278972625732, + "learning_rate": 3.008266309204647e-05, + "loss": 0.6226, + "num_input_tokens_seen": 4691640, + "step": 8080 + }, + { + "epoch": 1.2042001787310097, + "grad_norm": 6.941986560821533, + "learning_rate": 3.010128090557045e-05, + "loss": 0.6136, + "num_input_tokens_seen": 4694520, + "step": 8085 + }, + { + "epoch": 1.204944891271969, + "grad_norm": 5.523571968078613, + "learning_rate": 3.0119898719094432e-05, + "loss": 0.6663, + "num_input_tokens_seen": 4697208, + "step": 8090 + }, + { + "epoch": 1.2056896038129281, + "grad_norm": 9.864372253417969, + "learning_rate": 3.013851653261841e-05, + "loss": 0.6763, + "num_input_tokens_seen": 4700184, + "step": 8095 + }, + { + "epoch": 1.2064343163538873, + "grad_norm": 4.837357044219971, + "learning_rate": 3.015713434614239e-05, + "loss": 0.601, + "num_input_tokens_seen": 4703224, + "step": 8100 + }, + { + "epoch": 1.2071790288948465, + "grad_norm": 4.529480934143066, + "learning_rate": 3.017575215966637e-05, + "loss": 0.629, + "num_input_tokens_seen": 4705816, + "step": 8105 + }, + { + "epoch": 1.2079237414358057, + "grad_norm": 11.352717399597168, + "learning_rate": 3.019436997319035e-05, + "loss": 0.6376, + "num_input_tokens_seen": 4708600, + "step": 8110 + }, + { + "epoch": 1.208668453976765, + "grad_norm": 14.297675132751465, + "learning_rate": 3.021298778671433e-05, + "loss": 0.7665, + "num_input_tokens_seen": 4711416, + "step": 8115 + }, + { + "epoch": 1.2094131665177241, + "grad_norm": 8.77822494506836, + "learning_rate": 3.023160560023831e-05, + "loss": 0.7391, + "num_input_tokens_seen": 4714392, + "step": 8120 + }, + { + "epoch": 1.2101578790586833, + "grad_norm": 5.9178996086120605, + "learning_rate": 3.025022341376229e-05, + "loss": 0.5561, + "num_input_tokens_seen": 4717432, + "step": 8125 + }, + { + "epoch": 1.2109025915996425, + "grad_norm": 4.087647438049316, + "learning_rate": 3.026884122728627e-05, + "loss": 0.6527, + "num_input_tokens_seen": 4720632, + "step": 8130 + }, + { + "epoch": 1.2116473041406017, + "grad_norm": 3.9439480304718018, + "learning_rate": 3.028745904081025e-05, + "loss": 0.8015, + "num_input_tokens_seen": 4723384, + "step": 8135 + }, + { + "epoch": 1.212392016681561, + "grad_norm": 6.75589656829834, + "learning_rate": 3.030607685433423e-05, + "loss": 0.5765, + "num_input_tokens_seen": 4726232, + "step": 8140 + }, + { + "epoch": 1.2131367292225201, + "grad_norm": 5.981895923614502, + "learning_rate": 3.0324694667858206e-05, + "loss": 0.5921, + "num_input_tokens_seen": 4728952, + "step": 8145 + }, + { + "epoch": 1.2138814417634793, + "grad_norm": 6.306111812591553, + "learning_rate": 3.0343312481382184e-05, + "loss": 0.7367, + "num_input_tokens_seen": 4731672, + "step": 8150 + }, + { + "epoch": 1.2146261543044385, + "grad_norm": 6.768533706665039, + "learning_rate": 3.0361930294906166e-05, + "loss": 0.7297, + "num_input_tokens_seen": 4734776, + "step": 8155 + }, + { + "epoch": 1.2153708668453977, + "grad_norm": 4.289531707763672, + "learning_rate": 3.0380548108430147e-05, + "loss": 0.5696, + "num_input_tokens_seen": 4737592, + "step": 8160 + }, + { + "epoch": 1.216115579386357, + "grad_norm": 5.190781593322754, + "learning_rate": 3.0399165921954126e-05, + "loss": 0.629, + "num_input_tokens_seen": 4740568, + "step": 8165 + }, + { + "epoch": 1.2168602919273162, + "grad_norm": 6.389497756958008, + "learning_rate": 3.0417783735478107e-05, + "loss": 0.8595, + "num_input_tokens_seen": 4743736, + "step": 8170 + }, + { + "epoch": 1.2176050044682754, + "grad_norm": 4.650540351867676, + "learning_rate": 3.0436401549002085e-05, + "loss": 0.6186, + "num_input_tokens_seen": 4746616, + "step": 8175 + }, + { + "epoch": 1.2183497170092346, + "grad_norm": 8.00468635559082, + "learning_rate": 3.0455019362526067e-05, + "loss": 0.8408, + "num_input_tokens_seen": 4749432, + "step": 8180 + }, + { + "epoch": 1.2190944295501935, + "grad_norm": 3.9901809692382812, + "learning_rate": 3.0473637176050045e-05, + "loss": 0.602, + "num_input_tokens_seen": 4752376, + "step": 8185 + }, + { + "epoch": 1.2198391420911527, + "grad_norm": 5.136066436767578, + "learning_rate": 3.0492254989574027e-05, + "loss": 0.6258, + "num_input_tokens_seen": 4755192, + "step": 8190 + }, + { + "epoch": 1.220583854632112, + "grad_norm": 5.31689453125, + "learning_rate": 3.0510872803098005e-05, + "loss": 0.6429, + "num_input_tokens_seen": 4757944, + "step": 8195 + }, + { + "epoch": 1.2213285671730711, + "grad_norm": 4.353511810302734, + "learning_rate": 3.052949061662199e-05, + "loss": 0.655, + "num_input_tokens_seen": 4760792, + "step": 8200 + }, + { + "epoch": 1.2220732797140303, + "grad_norm": 3.28456711769104, + "learning_rate": 3.0548108430145965e-05, + "loss": 0.6481, + "num_input_tokens_seen": 4763768, + "step": 8205 + }, + { + "epoch": 1.2228179922549895, + "grad_norm": 4.409858226776123, + "learning_rate": 3.056672624366994e-05, + "loss": 0.7854, + "num_input_tokens_seen": 4766520, + "step": 8210 + }, + { + "epoch": 1.2235627047959488, + "grad_norm": 7.77255392074585, + "learning_rate": 3.058534405719393e-05, + "loss": 0.4247, + "num_input_tokens_seen": 4769464, + "step": 8215 + }, + { + "epoch": 1.224307417336908, + "grad_norm": 4.877632141113281, + "learning_rate": 3.0603961870717907e-05, + "loss": 0.563, + "num_input_tokens_seen": 4772184, + "step": 8220 + }, + { + "epoch": 1.2250521298778672, + "grad_norm": 6.844290256500244, + "learning_rate": 3.0622579684241885e-05, + "loss": 0.6547, + "num_input_tokens_seen": 4775096, + "step": 8225 + }, + { + "epoch": 1.2257968424188264, + "grad_norm": 3.0806050300598145, + "learning_rate": 3.064119749776586e-05, + "loss": 0.6788, + "num_input_tokens_seen": 4777976, + "step": 8230 + }, + { + "epoch": 1.2265415549597856, + "grad_norm": 7.251819610595703, + "learning_rate": 3.065981531128984e-05, + "loss": 0.6864, + "num_input_tokens_seen": 4781208, + "step": 8235 + }, + { + "epoch": 1.2272862675007448, + "grad_norm": 6.846287250518799, + "learning_rate": 3.067843312481382e-05, + "loss": 0.5822, + "num_input_tokens_seen": 4783992, + "step": 8240 + }, + { + "epoch": 1.228030980041704, + "grad_norm": 5.354629993438721, + "learning_rate": 3.0697050938337804e-05, + "loss": 0.5815, + "num_input_tokens_seen": 4787032, + "step": 8245 + }, + { + "epoch": 1.2287756925826632, + "grad_norm": 3.750638961791992, + "learning_rate": 3.071566875186178e-05, + "loss": 0.745, + "num_input_tokens_seen": 4789976, + "step": 8250 + }, + { + "epoch": 1.2295204051236224, + "grad_norm": 4.223310947418213, + "learning_rate": 3.073428656538576e-05, + "loss": 0.6868, + "num_input_tokens_seen": 4792696, + "step": 8255 + }, + { + "epoch": 1.2302651176645814, + "grad_norm": 5.696191310882568, + "learning_rate": 3.075290437890974e-05, + "loss": 0.5743, + "num_input_tokens_seen": 4795608, + "step": 8260 + }, + { + "epoch": 1.2310098302055406, + "grad_norm": 3.735936164855957, + "learning_rate": 3.0771522192433724e-05, + "loss": 0.6595, + "num_input_tokens_seen": 4798520, + "step": 8265 + }, + { + "epoch": 1.2317545427464998, + "grad_norm": 5.059011936187744, + "learning_rate": 3.07901400059577e-05, + "loss": 0.7562, + "num_input_tokens_seen": 4801624, + "step": 8270 + }, + { + "epoch": 1.232499255287459, + "grad_norm": 7.965489864349365, + "learning_rate": 3.080875781948168e-05, + "loss": 0.4979, + "num_input_tokens_seen": 4804952, + "step": 8275 + }, + { + "epoch": 1.2332439678284182, + "grad_norm": 5.211277961730957, + "learning_rate": 3.082737563300566e-05, + "loss": 0.558, + "num_input_tokens_seen": 4807736, + "step": 8280 + }, + { + "epoch": 1.2339886803693774, + "grad_norm": 9.867961883544922, + "learning_rate": 3.0845993446529644e-05, + "loss": 0.8879, + "num_input_tokens_seen": 4810424, + "step": 8285 + }, + { + "epoch": 1.2347333929103366, + "grad_norm": 5.282741069793701, + "learning_rate": 3.086461126005362e-05, + "loss": 0.7358, + "num_input_tokens_seen": 4813720, + "step": 8290 + }, + { + "epoch": 1.2354781054512958, + "grad_norm": 5.868327617645264, + "learning_rate": 3.08832290735776e-05, + "loss": 0.7334, + "num_input_tokens_seen": 4816664, + "step": 8295 + }, + { + "epoch": 1.236222817992255, + "grad_norm": 4.840679168701172, + "learning_rate": 3.0901846887101585e-05, + "loss": 0.6834, + "num_input_tokens_seen": 4819544, + "step": 8300 + }, + { + "epoch": 1.2369675305332142, + "grad_norm": 5.558811664581299, + "learning_rate": 3.0920464700625564e-05, + "loss": 0.6967, + "num_input_tokens_seen": 4822616, + "step": 8305 + }, + { + "epoch": 1.2377122430741734, + "grad_norm": 6.489785194396973, + "learning_rate": 3.093908251414954e-05, + "loss": 0.717, + "num_input_tokens_seen": 4825752, + "step": 8310 + }, + { + "epoch": 1.2384569556151326, + "grad_norm": 4.92611837387085, + "learning_rate": 3.095770032767352e-05, + "loss": 0.5002, + "num_input_tokens_seen": 4828920, + "step": 8315 + }, + { + "epoch": 1.2392016681560918, + "grad_norm": 8.623414993286133, + "learning_rate": 3.09763181411975e-05, + "loss": 0.5345, + "num_input_tokens_seen": 4831672, + "step": 8320 + }, + { + "epoch": 1.239946380697051, + "grad_norm": 4.571309566497803, + "learning_rate": 3.0994935954721477e-05, + "loss": 0.5521, + "num_input_tokens_seen": 4834424, + "step": 8325 + }, + { + "epoch": 1.2406910932380102, + "grad_norm": 9.502178192138672, + "learning_rate": 3.1013553768245455e-05, + "loss": 0.7032, + "num_input_tokens_seen": 4837496, + "step": 8330 + }, + { + "epoch": 1.2414358057789694, + "grad_norm": 4.254115104675293, + "learning_rate": 3.103217158176944e-05, + "loss": 0.8421, + "num_input_tokens_seen": 4840440, + "step": 8335 + }, + { + "epoch": 1.2421805183199286, + "grad_norm": 6.6150641441345215, + "learning_rate": 3.105078939529342e-05, + "loss": 0.6408, + "num_input_tokens_seen": 4843032, + "step": 8340 + }, + { + "epoch": 1.2429252308608878, + "grad_norm": 5.57606315612793, + "learning_rate": 3.1069407208817396e-05, + "loss": 0.5482, + "num_input_tokens_seen": 4845848, + "step": 8345 + }, + { + "epoch": 1.243669943401847, + "grad_norm": 4.92020845413208, + "learning_rate": 3.1088025022341374e-05, + "loss": 0.5008, + "num_input_tokens_seen": 4848824, + "step": 8350 + }, + { + "epoch": 1.244414655942806, + "grad_norm": 6.542077541351318, + "learning_rate": 3.110664283586536e-05, + "loss": 0.554, + "num_input_tokens_seen": 4851608, + "step": 8355 + }, + { + "epoch": 1.2451593684837652, + "grad_norm": 7.615109443664551, + "learning_rate": 3.112526064938934e-05, + "loss": 0.4669, + "num_input_tokens_seen": 4854840, + "step": 8360 + }, + { + "epoch": 1.2459040810247244, + "grad_norm": 8.267584800720215, + "learning_rate": 3.1143878462913316e-05, + "loss": 0.8312, + "num_input_tokens_seen": 4857912, + "step": 8365 + }, + { + "epoch": 1.2466487935656836, + "grad_norm": 8.904058456420898, + "learning_rate": 3.1162496276437294e-05, + "loss": 0.7603, + "num_input_tokens_seen": 4860696, + "step": 8370 + }, + { + "epoch": 1.2473935061066428, + "grad_norm": 10.164727210998535, + "learning_rate": 3.118111408996128e-05, + "loss": 0.7063, + "num_input_tokens_seen": 4863736, + "step": 8375 + }, + { + "epoch": 1.248138218647602, + "grad_norm": 6.11226224899292, + "learning_rate": 3.119973190348526e-05, + "loss": 0.5893, + "num_input_tokens_seen": 4866488, + "step": 8380 + }, + { + "epoch": 1.2488829311885612, + "grad_norm": 6.740679740905762, + "learning_rate": 3.1218349717009236e-05, + "loss": 0.7269, + "num_input_tokens_seen": 4869720, + "step": 8385 + }, + { + "epoch": 1.2496276437295204, + "grad_norm": 19.302410125732422, + "learning_rate": 3.123696753053322e-05, + "loss": 0.736, + "num_input_tokens_seen": 4872792, + "step": 8390 + }, + { + "epoch": 1.2503723562704796, + "grad_norm": 6.417990684509277, + "learning_rate": 3.12555853440572e-05, + "loss": 0.6867, + "num_input_tokens_seen": 4876120, + "step": 8395 + }, + { + "epoch": 1.2511170688114388, + "grad_norm": 5.0884690284729, + "learning_rate": 3.127420315758118e-05, + "loss": 0.4528, + "num_input_tokens_seen": 4879000, + "step": 8400 + }, + { + "epoch": 1.251861781352398, + "grad_norm": 7.373092174530029, + "learning_rate": 3.1292820971105155e-05, + "loss": 0.5264, + "num_input_tokens_seen": 4881560, + "step": 8405 + }, + { + "epoch": 1.2526064938933572, + "grad_norm": 5.006511688232422, + "learning_rate": 3.1311438784629134e-05, + "loss": 0.6748, + "num_input_tokens_seen": 4884440, + "step": 8410 + }, + { + "epoch": 1.2533512064343164, + "grad_norm": 6.338024616241455, + "learning_rate": 3.133005659815311e-05, + "loss": 0.6813, + "num_input_tokens_seen": 4887160, + "step": 8415 + }, + { + "epoch": 1.2540959189752756, + "grad_norm": 3.330122709274292, + "learning_rate": 3.134867441167709e-05, + "loss": 0.5245, + "num_input_tokens_seen": 4890424, + "step": 8420 + }, + { + "epoch": 1.2548406315162346, + "grad_norm": 4.83281946182251, + "learning_rate": 3.1367292225201075e-05, + "loss": 0.5317, + "num_input_tokens_seen": 4893144, + "step": 8425 + }, + { + "epoch": 1.2555853440571938, + "grad_norm": 7.763613224029541, + "learning_rate": 3.138591003872505e-05, + "loss": 0.6152, + "num_input_tokens_seen": 4896120, + "step": 8430 + }, + { + "epoch": 1.256330056598153, + "grad_norm": 10.965322494506836, + "learning_rate": 3.140452785224903e-05, + "loss": 0.7751, + "num_input_tokens_seen": 4898968, + "step": 8435 + }, + { + "epoch": 1.2570747691391122, + "grad_norm": 7.4095845222473145, + "learning_rate": 3.142314566577301e-05, + "loss": 0.7605, + "num_input_tokens_seen": 4901880, + "step": 8440 + }, + { + "epoch": 1.2578194816800714, + "grad_norm": 9.847780227661133, + "learning_rate": 3.1441763479296995e-05, + "loss": 0.4743, + "num_input_tokens_seen": 4904568, + "step": 8445 + }, + { + "epoch": 1.2585641942210306, + "grad_norm": 2.4295127391815186, + "learning_rate": 3.146038129282097e-05, + "loss": 0.6592, + "num_input_tokens_seen": 4907480, + "step": 8450 + }, + { + "epoch": 1.2593089067619898, + "grad_norm": 3.9654786586761475, + "learning_rate": 3.147899910634495e-05, + "loss": 0.4228, + "num_input_tokens_seen": 4910744, + "step": 8455 + }, + { + "epoch": 1.260053619302949, + "grad_norm": 4.901012420654297, + "learning_rate": 3.1497616919868936e-05, + "loss": 0.5678, + "num_input_tokens_seen": 4913720, + "step": 8460 + }, + { + "epoch": 1.2607983318439082, + "grad_norm": 11.036452293395996, + "learning_rate": 3.1516234733392914e-05, + "loss": 0.9006, + "num_input_tokens_seen": 4916568, + "step": 8465 + }, + { + "epoch": 1.2615430443848674, + "grad_norm": 4.272316932678223, + "learning_rate": 3.153485254691689e-05, + "loss": 0.6713, + "num_input_tokens_seen": 4919224, + "step": 8470 + }, + { + "epoch": 1.2622877569258266, + "grad_norm": 7.553067684173584, + "learning_rate": 3.155347036044087e-05, + "loss": 0.595, + "num_input_tokens_seen": 4922168, + "step": 8475 + }, + { + "epoch": 1.2630324694667858, + "grad_norm": 5.456788539886475, + "learning_rate": 3.1572088173964856e-05, + "loss": 0.5491, + "num_input_tokens_seen": 4924824, + "step": 8480 + }, + { + "epoch": 1.263777182007745, + "grad_norm": 7.48284912109375, + "learning_rate": 3.1590705987488834e-05, + "loss": 0.5546, + "num_input_tokens_seen": 4927864, + "step": 8485 + }, + { + "epoch": 1.2645218945487042, + "grad_norm": 4.53607177734375, + "learning_rate": 3.1609323801012806e-05, + "loss": 0.7393, + "num_input_tokens_seen": 4930648, + "step": 8490 + }, + { + "epoch": 1.2652666070896634, + "grad_norm": 10.107963562011719, + "learning_rate": 3.162794161453679e-05, + "loss": 0.7051, + "num_input_tokens_seen": 4933944, + "step": 8495 + }, + { + "epoch": 1.2660113196306226, + "grad_norm": 4.994384288787842, + "learning_rate": 3.164655942806077e-05, + "loss": 0.9102, + "num_input_tokens_seen": 4936600, + "step": 8500 + }, + { + "epoch": 1.2667560321715818, + "grad_norm": 8.609509468078613, + "learning_rate": 3.166517724158475e-05, + "loss": 0.5935, + "num_input_tokens_seen": 4939192, + "step": 8505 + }, + { + "epoch": 1.267500744712541, + "grad_norm": 7.4548492431640625, + "learning_rate": 3.1683795055108725e-05, + "loss": 0.7634, + "num_input_tokens_seen": 4942232, + "step": 8510 + }, + { + "epoch": 1.2682454572535002, + "grad_norm": 4.666564464569092, + "learning_rate": 3.170241286863271e-05, + "loss": 0.7462, + "num_input_tokens_seen": 4945240, + "step": 8515 + }, + { + "epoch": 1.2689901697944594, + "grad_norm": 5.840669631958008, + "learning_rate": 3.172103068215669e-05, + "loss": 0.633, + "num_input_tokens_seen": 4948248, + "step": 8520 + }, + { + "epoch": 1.2697348823354186, + "grad_norm": 3.597478151321411, + "learning_rate": 3.173964849568067e-05, + "loss": 0.5582, + "num_input_tokens_seen": 4951224, + "step": 8525 + }, + { + "epoch": 1.2704795948763778, + "grad_norm": 6.667992115020752, + "learning_rate": 3.1758266309204645e-05, + "loss": 0.6052, + "num_input_tokens_seen": 4954168, + "step": 8530 + }, + { + "epoch": 1.2712243074173368, + "grad_norm": 8.469963073730469, + "learning_rate": 3.177688412272863e-05, + "loss": 0.6285, + "num_input_tokens_seen": 4956952, + "step": 8535 + }, + { + "epoch": 1.271969019958296, + "grad_norm": 5.33427095413208, + "learning_rate": 3.179550193625261e-05, + "loss": 0.4849, + "num_input_tokens_seen": 4959512, + "step": 8540 + }, + { + "epoch": 1.2727137324992552, + "grad_norm": 8.08147144317627, + "learning_rate": 3.1814119749776586e-05, + "loss": 0.7526, + "num_input_tokens_seen": 4962296, + "step": 8545 + }, + { + "epoch": 1.2734584450402144, + "grad_norm": 10.502798080444336, + "learning_rate": 3.183273756330057e-05, + "loss": 0.6283, + "num_input_tokens_seen": 4965048, + "step": 8550 + }, + { + "epoch": 1.2742031575811736, + "grad_norm": 10.02891731262207, + "learning_rate": 3.185135537682455e-05, + "loss": 0.7203, + "num_input_tokens_seen": 4967672, + "step": 8555 + }, + { + "epoch": 1.2749478701221328, + "grad_norm": 11.883593559265137, + "learning_rate": 3.186997319034853e-05, + "loss": 0.77, + "num_input_tokens_seen": 4970744, + "step": 8560 + }, + { + "epoch": 1.275692582663092, + "grad_norm": 5.081808090209961, + "learning_rate": 3.1888591003872506e-05, + "loss": 0.5983, + "num_input_tokens_seen": 4973432, + "step": 8565 + }, + { + "epoch": 1.2764372952040512, + "grad_norm": 9.297033309936523, + "learning_rate": 3.190720881739649e-05, + "loss": 0.5669, + "num_input_tokens_seen": 4976568, + "step": 8570 + }, + { + "epoch": 1.2771820077450105, + "grad_norm": 14.690470695495605, + "learning_rate": 3.192582663092047e-05, + "loss": 0.6824, + "num_input_tokens_seen": 4979320, + "step": 8575 + }, + { + "epoch": 1.2779267202859697, + "grad_norm": 4.0280680656433105, + "learning_rate": 3.194444444444444e-05, + "loss": 0.7025, + "num_input_tokens_seen": 4981976, + "step": 8580 + }, + { + "epoch": 1.2786714328269289, + "grad_norm": 5.919393062591553, + "learning_rate": 3.1963062257968426e-05, + "loss": 0.6595, + "num_input_tokens_seen": 4984856, + "step": 8585 + }, + { + "epoch": 1.279416145367888, + "grad_norm": 10.02163028717041, + "learning_rate": 3.1981680071492404e-05, + "loss": 0.9022, + "num_input_tokens_seen": 4987928, + "step": 8590 + }, + { + "epoch": 1.2801608579088473, + "grad_norm": 6.79878568649292, + "learning_rate": 3.200029788501638e-05, + "loss": 0.5199, + "num_input_tokens_seen": 4990936, + "step": 8595 + }, + { + "epoch": 1.2809055704498062, + "grad_norm": 4.859716892242432, + "learning_rate": 3.201891569854036e-05, + "loss": 0.5732, + "num_input_tokens_seen": 4993464, + "step": 8600 + }, + { + "epoch": 1.2816502829907654, + "grad_norm": 5.2606706619262695, + "learning_rate": 3.2037533512064346e-05, + "loss": 0.6727, + "num_input_tokens_seen": 4996248, + "step": 8605 + }, + { + "epoch": 1.2823949955317246, + "grad_norm": 7.251401901245117, + "learning_rate": 3.2056151325588324e-05, + "loss": 0.6575, + "num_input_tokens_seen": 4999128, + "step": 8610 + }, + { + "epoch": 1.2831397080726838, + "grad_norm": 6.304658889770508, + "learning_rate": 3.20747691391123e-05, + "loss": 0.5806, + "num_input_tokens_seen": 5001624, + "step": 8615 + }, + { + "epoch": 1.283884420613643, + "grad_norm": 10.110404014587402, + "learning_rate": 3.209338695263629e-05, + "loss": 0.6572, + "num_input_tokens_seen": 5004248, + "step": 8620 + }, + { + "epoch": 1.2846291331546023, + "grad_norm": 5.370707988739014, + "learning_rate": 3.2112004766160265e-05, + "loss": 0.6405, + "num_input_tokens_seen": 5007032, + "step": 8625 + }, + { + "epoch": 1.2853738456955615, + "grad_norm": 4.458540439605713, + "learning_rate": 3.2130622579684244e-05, + "loss": 0.5995, + "num_input_tokens_seen": 5009912, + "step": 8630 + }, + { + "epoch": 1.2861185582365207, + "grad_norm": 4.565883159637451, + "learning_rate": 3.214924039320822e-05, + "loss": 0.6787, + "num_input_tokens_seen": 5012728, + "step": 8635 + }, + { + "epoch": 1.2868632707774799, + "grad_norm": 3.8276491165161133, + "learning_rate": 3.216785820673221e-05, + "loss": 0.5922, + "num_input_tokens_seen": 5015576, + "step": 8640 + }, + { + "epoch": 1.287607983318439, + "grad_norm": 4.33355188369751, + "learning_rate": 3.2186476020256185e-05, + "loss": 0.5736, + "num_input_tokens_seen": 5018296, + "step": 8645 + }, + { + "epoch": 1.2883526958593983, + "grad_norm": 10.115696907043457, + "learning_rate": 3.220509383378016e-05, + "loss": 0.6162, + "num_input_tokens_seen": 5020824, + "step": 8650 + }, + { + "epoch": 1.2890974084003575, + "grad_norm": 4.432344436645508, + "learning_rate": 3.222371164730414e-05, + "loss": 0.4659, + "num_input_tokens_seen": 5023512, + "step": 8655 + }, + { + "epoch": 1.2898421209413167, + "grad_norm": 12.365450859069824, + "learning_rate": 3.2242329460828126e-05, + "loss": 0.8036, + "num_input_tokens_seen": 5026232, + "step": 8660 + }, + { + "epoch": 1.2905868334822759, + "grad_norm": 3.294951915740967, + "learning_rate": 3.22609472743521e-05, + "loss": 0.5385, + "num_input_tokens_seen": 5029304, + "step": 8665 + }, + { + "epoch": 1.291331546023235, + "grad_norm": 2.825441598892212, + "learning_rate": 3.2279565087876076e-05, + "loss": 0.5317, + "num_input_tokens_seen": 5032056, + "step": 8670 + }, + { + "epoch": 1.2920762585641943, + "grad_norm": 5.370297908782959, + "learning_rate": 3.229818290140006e-05, + "loss": 0.7826, + "num_input_tokens_seen": 5034936, + "step": 8675 + }, + { + "epoch": 1.2928209711051535, + "grad_norm": 10.926907539367676, + "learning_rate": 3.231680071492404e-05, + "loss": 0.6423, + "num_input_tokens_seen": 5037944, + "step": 8680 + }, + { + "epoch": 1.2935656836461127, + "grad_norm": 4.19814395904541, + "learning_rate": 3.233541852844802e-05, + "loss": 0.5697, + "num_input_tokens_seen": 5041080, + "step": 8685 + }, + { + "epoch": 1.2943103961870719, + "grad_norm": 5.627339839935303, + "learning_rate": 3.2354036341972e-05, + "loss": 0.6367, + "num_input_tokens_seen": 5043896, + "step": 8690 + }, + { + "epoch": 1.295055108728031, + "grad_norm": 9.239487648010254, + "learning_rate": 3.237265415549598e-05, + "loss": 0.7042, + "num_input_tokens_seen": 5046680, + "step": 8695 + }, + { + "epoch": 1.2957998212689903, + "grad_norm": 6.610611915588379, + "learning_rate": 3.239127196901996e-05, + "loss": 0.6581, + "num_input_tokens_seen": 5049464, + "step": 8700 + }, + { + "epoch": 1.2965445338099495, + "grad_norm": 7.502178192138672, + "learning_rate": 3.240988978254394e-05, + "loss": 0.7759, + "num_input_tokens_seen": 5052568, + "step": 8705 + }, + { + "epoch": 1.2972892463509085, + "grad_norm": 7.327748775482178, + "learning_rate": 3.242850759606792e-05, + "loss": 0.7708, + "num_input_tokens_seen": 5055256, + "step": 8710 + }, + { + "epoch": 1.2980339588918677, + "grad_norm": 4.321539402008057, + "learning_rate": 3.24471254095919e-05, + "loss": 0.4808, + "num_input_tokens_seen": 5058264, + "step": 8715 + }, + { + "epoch": 1.2987786714328269, + "grad_norm": 10.552501678466797, + "learning_rate": 3.246574322311588e-05, + "loss": 0.5655, + "num_input_tokens_seen": 5061048, + "step": 8720 + }, + { + "epoch": 1.299523383973786, + "grad_norm": 4.335540294647217, + "learning_rate": 3.248436103663986e-05, + "loss": 0.5962, + "num_input_tokens_seen": 5063864, + "step": 8725 + }, + { + "epoch": 1.3002680965147453, + "grad_norm": 5.812646389007568, + "learning_rate": 3.250297885016384e-05, + "loss": 0.5916, + "num_input_tokens_seen": 5066552, + "step": 8730 + }, + { + "epoch": 1.3010128090557045, + "grad_norm": 7.561095714569092, + "learning_rate": 3.252159666368782e-05, + "loss": 0.5871, + "num_input_tokens_seen": 5069368, + "step": 8735 + }, + { + "epoch": 1.3017575215966637, + "grad_norm": 5.457233905792236, + "learning_rate": 3.25402144772118e-05, + "loss": 0.6464, + "num_input_tokens_seen": 5072248, + "step": 8740 + }, + { + "epoch": 1.302502234137623, + "grad_norm": 9.168204307556152, + "learning_rate": 3.255883229073578e-05, + "loss": 0.7594, + "num_input_tokens_seen": 5075192, + "step": 8745 + }, + { + "epoch": 1.303246946678582, + "grad_norm": 5.189684867858887, + "learning_rate": 3.2577450104259755e-05, + "loss": 0.6382, + "num_input_tokens_seen": 5078008, + "step": 8750 + }, + { + "epoch": 1.3039916592195413, + "grad_norm": 5.81838321685791, + "learning_rate": 3.259606791778373e-05, + "loss": 0.7676, + "num_input_tokens_seen": 5081112, + "step": 8755 + }, + { + "epoch": 1.3047363717605005, + "grad_norm": 9.792241096496582, + "learning_rate": 3.261468573130771e-05, + "loss": 0.7142, + "num_input_tokens_seen": 5083992, + "step": 8760 + }, + { + "epoch": 1.3054810843014597, + "grad_norm": 3.8244028091430664, + "learning_rate": 3.2633303544831696e-05, + "loss": 0.5659, + "num_input_tokens_seen": 5086936, + "step": 8765 + }, + { + "epoch": 1.306225796842419, + "grad_norm": 6.191316604614258, + "learning_rate": 3.2651921358355675e-05, + "loss": 0.7263, + "num_input_tokens_seen": 5090008, + "step": 8770 + }, + { + "epoch": 1.3069705093833779, + "grad_norm": 6.258456230163574, + "learning_rate": 3.267053917187965e-05, + "loss": 0.7116, + "num_input_tokens_seen": 5093144, + "step": 8775 + }, + { + "epoch": 1.307715221924337, + "grad_norm": 3.8961551189422607, + "learning_rate": 3.268915698540364e-05, + "loss": 0.7176, + "num_input_tokens_seen": 5096312, + "step": 8780 + }, + { + "epoch": 1.3084599344652963, + "grad_norm": 3.4610698223114014, + "learning_rate": 3.2707774798927616e-05, + "loss": 0.549, + "num_input_tokens_seen": 5099192, + "step": 8785 + }, + { + "epoch": 1.3092046470062555, + "grad_norm": 4.169615745544434, + "learning_rate": 3.2726392612451594e-05, + "loss": 0.6282, + "num_input_tokens_seen": 5101848, + "step": 8790 + }, + { + "epoch": 1.3099493595472147, + "grad_norm": 3.223128080368042, + "learning_rate": 3.274501042597557e-05, + "loss": 0.6204, + "num_input_tokens_seen": 5104760, + "step": 8795 + }, + { + "epoch": 1.310694072088174, + "grad_norm": 5.538936138153076, + "learning_rate": 3.276362823949956e-05, + "loss": 0.6852, + "num_input_tokens_seen": 5107544, + "step": 8800 + }, + { + "epoch": 1.311438784629133, + "grad_norm": 4.854589939117432, + "learning_rate": 3.2782246053023536e-05, + "loss": 0.6553, + "num_input_tokens_seen": 5110488, + "step": 8805 + }, + { + "epoch": 1.3121834971700923, + "grad_norm": 3.016522169113159, + "learning_rate": 3.2800863866547514e-05, + "loss": 0.6587, + "num_input_tokens_seen": 5113400, + "step": 8810 + }, + { + "epoch": 1.3129282097110515, + "grad_norm": 11.87530517578125, + "learning_rate": 3.281948168007149e-05, + "loss": 0.784, + "num_input_tokens_seen": 5116600, + "step": 8815 + }, + { + "epoch": 1.3136729222520107, + "grad_norm": 5.638213634490967, + "learning_rate": 3.283809949359548e-05, + "loss": 0.6179, + "num_input_tokens_seen": 5119512, + "step": 8820 + }, + { + "epoch": 1.31441763479297, + "grad_norm": 6.001789569854736, + "learning_rate": 3.2856717307119456e-05, + "loss": 0.6891, + "num_input_tokens_seen": 5122360, + "step": 8825 + }, + { + "epoch": 1.3151623473339291, + "grad_norm": 2.8867452144622803, + "learning_rate": 3.2875335120643434e-05, + "loss": 0.6031, + "num_input_tokens_seen": 5125176, + "step": 8830 + }, + { + "epoch": 1.3159070598748883, + "grad_norm": 10.900169372558594, + "learning_rate": 3.289395293416741e-05, + "loss": 0.5969, + "num_input_tokens_seen": 5127960, + "step": 8835 + }, + { + "epoch": 1.3166517724158475, + "grad_norm": 4.284978866577148, + "learning_rate": 3.291257074769139e-05, + "loss": 0.4886, + "num_input_tokens_seen": 5131000, + "step": 8840 + }, + { + "epoch": 1.3173964849568067, + "grad_norm": 7.2446136474609375, + "learning_rate": 3.293118856121537e-05, + "loss": 0.7398, + "num_input_tokens_seen": 5133976, + "step": 8845 + }, + { + "epoch": 1.318141197497766, + "grad_norm": 8.15465259552002, + "learning_rate": 3.2949806374739354e-05, + "loss": 0.594, + "num_input_tokens_seen": 5137112, + "step": 8850 + }, + { + "epoch": 1.3188859100387251, + "grad_norm": 6.473474025726318, + "learning_rate": 3.296842418826333e-05, + "loss": 0.7282, + "num_input_tokens_seen": 5140152, + "step": 8855 + }, + { + "epoch": 1.3196306225796843, + "grad_norm": 4.075338363647461, + "learning_rate": 3.298704200178731e-05, + "loss": 0.7217, + "num_input_tokens_seen": 5143288, + "step": 8860 + }, + { + "epoch": 1.3203753351206435, + "grad_norm": 4.779201984405518, + "learning_rate": 3.300565981531129e-05, + "loss": 0.5049, + "num_input_tokens_seen": 5145976, + "step": 8865 + }, + { + "epoch": 1.3211200476616027, + "grad_norm": 3.4598865509033203, + "learning_rate": 3.302427762883527e-05, + "loss": 0.5984, + "num_input_tokens_seen": 5148792, + "step": 8870 + }, + { + "epoch": 1.321864760202562, + "grad_norm": 3.123059034347534, + "learning_rate": 3.304289544235925e-05, + "loss": 0.5378, + "num_input_tokens_seen": 5151384, + "step": 8875 + }, + { + "epoch": 1.322609472743521, + "grad_norm": 3.9166600704193115, + "learning_rate": 3.306151325588323e-05, + "loss": 0.7018, + "num_input_tokens_seen": 5154168, + "step": 8880 + }, + { + "epoch": 1.3233541852844801, + "grad_norm": 4.899038314819336, + "learning_rate": 3.308013106940721e-05, + "loss": 0.5276, + "num_input_tokens_seen": 5157432, + "step": 8885 + }, + { + "epoch": 1.3240988978254393, + "grad_norm": 4.201977729797363, + "learning_rate": 3.309874888293119e-05, + "loss": 0.6578, + "num_input_tokens_seen": 5160216, + "step": 8890 + }, + { + "epoch": 1.3248436103663985, + "grad_norm": 3.4922709465026855, + "learning_rate": 3.311736669645517e-05, + "loss": 0.6229, + "num_input_tokens_seen": 5162936, + "step": 8895 + }, + { + "epoch": 1.3255883229073577, + "grad_norm": 5.857165336608887, + "learning_rate": 3.313598450997915e-05, + "loss": 0.614, + "num_input_tokens_seen": 5165848, + "step": 8900 + }, + { + "epoch": 1.326333035448317, + "grad_norm": 6.228298187255859, + "learning_rate": 3.3154602323503134e-05, + "loss": 0.7465, + "num_input_tokens_seen": 5168760, + "step": 8905 + }, + { + "epoch": 1.3270777479892761, + "grad_norm": 11.980177879333496, + "learning_rate": 3.317322013702711e-05, + "loss": 0.802, + "num_input_tokens_seen": 5171576, + "step": 8910 + }, + { + "epoch": 1.3278224605302353, + "grad_norm": 7.098200798034668, + "learning_rate": 3.319183795055109e-05, + "loss": 0.6414, + "num_input_tokens_seen": 5174456, + "step": 8915 + }, + { + "epoch": 1.3285671730711945, + "grad_norm": 6.1431193351745605, + "learning_rate": 3.321045576407507e-05, + "loss": 0.4336, + "num_input_tokens_seen": 5177240, + "step": 8920 + }, + { + "epoch": 1.3293118856121537, + "grad_norm": 4.956012725830078, + "learning_rate": 3.322907357759905e-05, + "loss": 0.6168, + "num_input_tokens_seen": 5180216, + "step": 8925 + }, + { + "epoch": 1.330056598153113, + "grad_norm": 6.147140979766846, + "learning_rate": 3.3247691391123026e-05, + "loss": 0.7483, + "num_input_tokens_seen": 5183224, + "step": 8930 + }, + { + "epoch": 1.3308013106940721, + "grad_norm": 9.267364501953125, + "learning_rate": 3.3266309204647004e-05, + "loss": 0.788, + "num_input_tokens_seen": 5186072, + "step": 8935 + }, + { + "epoch": 1.3315460232350314, + "grad_norm": 4.780656337738037, + "learning_rate": 3.328492701817099e-05, + "loss": 0.5587, + "num_input_tokens_seen": 5189464, + "step": 8940 + }, + { + "epoch": 1.3322907357759903, + "grad_norm": 6.019697189331055, + "learning_rate": 3.330354483169497e-05, + "loss": 0.5413, + "num_input_tokens_seen": 5192440, + "step": 8945 + }, + { + "epoch": 1.3330354483169495, + "grad_norm": 9.80269718170166, + "learning_rate": 3.3322162645218945e-05, + "loss": 0.6909, + "num_input_tokens_seen": 5195352, + "step": 8950 + }, + { + "epoch": 1.3337801608579087, + "grad_norm": 7.1519551277160645, + "learning_rate": 3.3340780458742924e-05, + "loss": 0.593, + "num_input_tokens_seen": 5198328, + "step": 8955 + }, + { + "epoch": 1.334524873398868, + "grad_norm": 7.948943138122559, + "learning_rate": 3.335939827226691e-05, + "loss": 0.6847, + "num_input_tokens_seen": 5201432, + "step": 8960 + }, + { + "epoch": 1.3352695859398271, + "grad_norm": 6.345822811126709, + "learning_rate": 3.337801608579089e-05, + "loss": 0.8291, + "num_input_tokens_seen": 5204344, + "step": 8965 + }, + { + "epoch": 1.3360142984807863, + "grad_norm": 3.6285905838012695, + "learning_rate": 3.3396633899314865e-05, + "loss": 0.6715, + "num_input_tokens_seen": 5207672, + "step": 8970 + }, + { + "epoch": 1.3367590110217455, + "grad_norm": 3.708798885345459, + "learning_rate": 3.341525171283884e-05, + "loss": 0.5862, + "num_input_tokens_seen": 5210552, + "step": 8975 + }, + { + "epoch": 1.3375037235627047, + "grad_norm": 5.715310096740723, + "learning_rate": 3.343386952636283e-05, + "loss": 0.7381, + "num_input_tokens_seen": 5213496, + "step": 8980 + }, + { + "epoch": 1.338248436103664, + "grad_norm": 8.33835220336914, + "learning_rate": 3.3452487339886806e-05, + "loss": 0.7857, + "num_input_tokens_seen": 5215992, + "step": 8985 + }, + { + "epoch": 1.3389931486446232, + "grad_norm": 1.8928403854370117, + "learning_rate": 3.3471105153410785e-05, + "loss": 0.6359, + "num_input_tokens_seen": 5218584, + "step": 8990 + }, + { + "epoch": 1.3397378611855824, + "grad_norm": 3.8127903938293457, + "learning_rate": 3.348972296693477e-05, + "loss": 0.6559, + "num_input_tokens_seen": 5221592, + "step": 8995 + }, + { + "epoch": 1.3404825737265416, + "grad_norm": 3.081556797027588, + "learning_rate": 3.350834078045875e-05, + "loss": 0.8415, + "num_input_tokens_seen": 5224504, + "step": 9000 + }, + { + "epoch": 1.3412272862675008, + "grad_norm": 7.398233413696289, + "learning_rate": 3.3526958593982726e-05, + "loss": 0.6899, + "num_input_tokens_seen": 5227512, + "step": 9005 + }, + { + "epoch": 1.34197199880846, + "grad_norm": 4.056516170501709, + "learning_rate": 3.3545576407506704e-05, + "loss": 0.5446, + "num_input_tokens_seen": 5230328, + "step": 9010 + }, + { + "epoch": 1.3427167113494192, + "grad_norm": 11.431142807006836, + "learning_rate": 3.356419422103068e-05, + "loss": 0.5799, + "num_input_tokens_seen": 5233240, + "step": 9015 + }, + { + "epoch": 1.3434614238903784, + "grad_norm": 3.977386951446533, + "learning_rate": 3.358281203455466e-05, + "loss": 0.647, + "num_input_tokens_seen": 5236312, + "step": 9020 + }, + { + "epoch": 1.3442061364313376, + "grad_norm": 9.827707290649414, + "learning_rate": 3.360142984807864e-05, + "loss": 0.6255, + "num_input_tokens_seen": 5239544, + "step": 9025 + }, + { + "epoch": 1.3449508489722968, + "grad_norm": 4.5707783699035645, + "learning_rate": 3.3620047661602624e-05, + "loss": 0.6998, + "num_input_tokens_seen": 5242808, + "step": 9030 + }, + { + "epoch": 1.345695561513256, + "grad_norm": 9.453851699829102, + "learning_rate": 3.36386654751266e-05, + "loss": 0.6651, + "num_input_tokens_seen": 5245560, + "step": 9035 + }, + { + "epoch": 1.3464402740542152, + "grad_norm": 5.165776252746582, + "learning_rate": 3.365728328865058e-05, + "loss": 0.6045, + "num_input_tokens_seen": 5248408, + "step": 9040 + }, + { + "epoch": 1.3471849865951744, + "grad_norm": 4.051413536071777, + "learning_rate": 3.367590110217456e-05, + "loss": 0.7654, + "num_input_tokens_seen": 5251352, + "step": 9045 + }, + { + "epoch": 1.3479296991361336, + "grad_norm": 3.5211234092712402, + "learning_rate": 3.3694518915698544e-05, + "loss": 0.5819, + "num_input_tokens_seen": 5254200, + "step": 9050 + }, + { + "epoch": 1.3486744116770926, + "grad_norm": 4.494103908538818, + "learning_rate": 3.371313672922252e-05, + "loss": 0.7342, + "num_input_tokens_seen": 5257144, + "step": 9055 + }, + { + "epoch": 1.3494191242180518, + "grad_norm": 5.628911018371582, + "learning_rate": 3.37317545427465e-05, + "loss": 0.6175, + "num_input_tokens_seen": 5260120, + "step": 9060 + }, + { + "epoch": 1.350163836759011, + "grad_norm": 9.805933952331543, + "learning_rate": 3.3750372356270485e-05, + "loss": 0.754, + "num_input_tokens_seen": 5262872, + "step": 9065 + }, + { + "epoch": 1.3509085492999702, + "grad_norm": 8.644980430603027, + "learning_rate": 3.3768990169794464e-05, + "loss": 0.6907, + "num_input_tokens_seen": 5265944, + "step": 9070 + }, + { + "epoch": 1.3516532618409294, + "grad_norm": 3.671882390975952, + "learning_rate": 3.378760798331844e-05, + "loss": 0.6731, + "num_input_tokens_seen": 5268920, + "step": 9075 + }, + { + "epoch": 1.3523979743818886, + "grad_norm": 4.507523536682129, + "learning_rate": 3.380622579684242e-05, + "loss": 0.5908, + "num_input_tokens_seen": 5272024, + "step": 9080 + }, + { + "epoch": 1.3531426869228478, + "grad_norm": 5.841545104980469, + "learning_rate": 3.3824843610366405e-05, + "loss": 0.7034, + "num_input_tokens_seen": 5274968, + "step": 9085 + }, + { + "epoch": 1.353887399463807, + "grad_norm": 7.225685119628906, + "learning_rate": 3.384346142389038e-05, + "loss": 0.6441, + "num_input_tokens_seen": 5277944, + "step": 9090 + }, + { + "epoch": 1.3546321120047662, + "grad_norm": 4.217147350311279, + "learning_rate": 3.3862079237414355e-05, + "loss": 0.5413, + "num_input_tokens_seen": 5280984, + "step": 9095 + }, + { + "epoch": 1.3553768245457254, + "grad_norm": 6.724404811859131, + "learning_rate": 3.388069705093834e-05, + "loss": 0.812, + "num_input_tokens_seen": 5283800, + "step": 9100 + }, + { + "epoch": 1.3561215370866846, + "grad_norm": 5.005448818206787, + "learning_rate": 3.389931486446232e-05, + "loss": 0.6539, + "num_input_tokens_seen": 5286744, + "step": 9105 + }, + { + "epoch": 1.3568662496276438, + "grad_norm": 13.276549339294434, + "learning_rate": 3.3917932677986296e-05, + "loss": 0.5885, + "num_input_tokens_seen": 5289304, + "step": 9110 + }, + { + "epoch": 1.357610962168603, + "grad_norm": 5.919689655303955, + "learning_rate": 3.3936550491510274e-05, + "loss": 0.6287, + "num_input_tokens_seen": 5292088, + "step": 9115 + }, + { + "epoch": 1.358355674709562, + "grad_norm": 5.428791522979736, + "learning_rate": 3.395516830503426e-05, + "loss": 0.6552, + "num_input_tokens_seen": 5295096, + "step": 9120 + }, + { + "epoch": 1.3591003872505212, + "grad_norm": 5.739907264709473, + "learning_rate": 3.397378611855824e-05, + "loss": 0.6786, + "num_input_tokens_seen": 5298104, + "step": 9125 + }, + { + "epoch": 1.3598450997914804, + "grad_norm": 14.490005493164062, + "learning_rate": 3.3992403932082216e-05, + "loss": 0.7617, + "num_input_tokens_seen": 5301016, + "step": 9130 + }, + { + "epoch": 1.3605898123324396, + "grad_norm": 6.336284160614014, + "learning_rate": 3.4011021745606194e-05, + "loss": 0.5622, + "num_input_tokens_seen": 5303960, + "step": 9135 + }, + { + "epoch": 1.3613345248733988, + "grad_norm": 7.657265663146973, + "learning_rate": 3.402963955913018e-05, + "loss": 0.5884, + "num_input_tokens_seen": 5306680, + "step": 9140 + }, + { + "epoch": 1.362079237414358, + "grad_norm": 3.320389747619629, + "learning_rate": 3.404825737265416e-05, + "loss": 0.688, + "num_input_tokens_seen": 5309336, + "step": 9145 + }, + { + "epoch": 1.3628239499553172, + "grad_norm": 7.474294185638428, + "learning_rate": 3.4066875186178136e-05, + "loss": 0.5644, + "num_input_tokens_seen": 5312216, + "step": 9150 + }, + { + "epoch": 1.3635686624962764, + "grad_norm": 4.530416488647461, + "learning_rate": 3.408549299970212e-05, + "loss": 0.5562, + "num_input_tokens_seen": 5315480, + "step": 9155 + }, + { + "epoch": 1.3643133750372356, + "grad_norm": 7.058436870574951, + "learning_rate": 3.41041108132261e-05, + "loss": 0.6545, + "num_input_tokens_seen": 5318392, + "step": 9160 + }, + { + "epoch": 1.3650580875781948, + "grad_norm": 6.2525739669799805, + "learning_rate": 3.412272862675008e-05, + "loss": 0.6097, + "num_input_tokens_seen": 5321336, + "step": 9165 + }, + { + "epoch": 1.365802800119154, + "grad_norm": 10.024947166442871, + "learning_rate": 3.4141346440274055e-05, + "loss": 0.6226, + "num_input_tokens_seen": 5324024, + "step": 9170 + }, + { + "epoch": 1.3665475126601132, + "grad_norm": 8.200554847717285, + "learning_rate": 3.415996425379804e-05, + "loss": 0.5727, + "num_input_tokens_seen": 5327000, + "step": 9175 + }, + { + "epoch": 1.3672922252010724, + "grad_norm": 6.505425930023193, + "learning_rate": 3.417858206732202e-05, + "loss": 0.553, + "num_input_tokens_seen": 5329944, + "step": 9180 + }, + { + "epoch": 1.3680369377420316, + "grad_norm": 6.252173900604248, + "learning_rate": 3.419719988084599e-05, + "loss": 0.7105, + "num_input_tokens_seen": 5332760, + "step": 9185 + }, + { + "epoch": 1.3687816502829908, + "grad_norm": 4.681523323059082, + "learning_rate": 3.4215817694369975e-05, + "loss": 0.5815, + "num_input_tokens_seen": 5335928, + "step": 9190 + }, + { + "epoch": 1.36952636282395, + "grad_norm": 8.93090534210205, + "learning_rate": 3.423443550789395e-05, + "loss": 0.7282, + "num_input_tokens_seen": 5338904, + "step": 9195 + }, + { + "epoch": 1.3702710753649092, + "grad_norm": 7.68813943862915, + "learning_rate": 3.425305332141793e-05, + "loss": 0.7235, + "num_input_tokens_seen": 5341816, + "step": 9200 + }, + { + "epoch": 1.3710157879058684, + "grad_norm": 3.610050678253174, + "learning_rate": 3.427167113494191e-05, + "loss": 0.5498, + "num_input_tokens_seen": 5344696, + "step": 9205 + }, + { + "epoch": 1.3717605004468276, + "grad_norm": 5.483499526977539, + "learning_rate": 3.4290288948465895e-05, + "loss": 0.6088, + "num_input_tokens_seen": 5347832, + "step": 9210 + }, + { + "epoch": 1.3725052129877868, + "grad_norm": 8.681614875793457, + "learning_rate": 3.430890676198987e-05, + "loss": 0.5792, + "num_input_tokens_seen": 5350584, + "step": 9215 + }, + { + "epoch": 1.373249925528746, + "grad_norm": 7.865274429321289, + "learning_rate": 3.432752457551385e-05, + "loss": 0.6027, + "num_input_tokens_seen": 5353656, + "step": 9220 + }, + { + "epoch": 1.3739946380697052, + "grad_norm": 5.0926833152771, + "learning_rate": 3.4346142389037836e-05, + "loss": 0.6797, + "num_input_tokens_seen": 5356728, + "step": 9225 + }, + { + "epoch": 1.3747393506106642, + "grad_norm": 4.738766670227051, + "learning_rate": 3.4364760202561814e-05, + "loss": 0.7163, + "num_input_tokens_seen": 5359768, + "step": 9230 + }, + { + "epoch": 1.3754840631516234, + "grad_norm": 9.07264518737793, + "learning_rate": 3.438337801608579e-05, + "loss": 0.6136, + "num_input_tokens_seen": 5362904, + "step": 9235 + }, + { + "epoch": 1.3762287756925826, + "grad_norm": 5.094147682189941, + "learning_rate": 3.440199582960977e-05, + "loss": 0.6354, + "num_input_tokens_seen": 5366008, + "step": 9240 + }, + { + "epoch": 1.3769734882335418, + "grad_norm": 4.983355522155762, + "learning_rate": 3.4420613643133756e-05, + "loss": 0.6338, + "num_input_tokens_seen": 5369016, + "step": 9245 + }, + { + "epoch": 1.377718200774501, + "grad_norm": 4.038907051086426, + "learning_rate": 3.4439231456657734e-05, + "loss": 0.6343, + "num_input_tokens_seen": 5371928, + "step": 9250 + }, + { + "epoch": 1.3784629133154602, + "grad_norm": 2.6978633403778076, + "learning_rate": 3.445784927018171e-05, + "loss": 0.5007, + "num_input_tokens_seen": 5374520, + "step": 9255 + }, + { + "epoch": 1.3792076258564194, + "grad_norm": 3.8077597618103027, + "learning_rate": 3.447646708370569e-05, + "loss": 0.5403, + "num_input_tokens_seen": 5377336, + "step": 9260 + }, + { + "epoch": 1.3799523383973786, + "grad_norm": 9.366077423095703, + "learning_rate": 3.4495084897229676e-05, + "loss": 0.7086, + "num_input_tokens_seen": 5380504, + "step": 9265 + }, + { + "epoch": 1.3806970509383378, + "grad_norm": 8.933939933776855, + "learning_rate": 3.451370271075365e-05, + "loss": 0.6229, + "num_input_tokens_seen": 5383256, + "step": 9270 + }, + { + "epoch": 1.381441763479297, + "grad_norm": 4.6686015129089355, + "learning_rate": 3.4532320524277625e-05, + "loss": 0.6431, + "num_input_tokens_seen": 5386168, + "step": 9275 + }, + { + "epoch": 1.3821864760202562, + "grad_norm": 6.507336139678955, + "learning_rate": 3.455093833780161e-05, + "loss": 0.5358, + "num_input_tokens_seen": 5388984, + "step": 9280 + }, + { + "epoch": 1.3829311885612154, + "grad_norm": 9.957535743713379, + "learning_rate": 3.456955615132559e-05, + "loss": 0.5827, + "num_input_tokens_seen": 5391608, + "step": 9285 + }, + { + "epoch": 1.3836759011021746, + "grad_norm": 11.846352577209473, + "learning_rate": 3.458817396484957e-05, + "loss": 0.7686, + "num_input_tokens_seen": 5394360, + "step": 9290 + }, + { + "epoch": 1.3844206136431336, + "grad_norm": 6.201194763183594, + "learning_rate": 3.460679177837355e-05, + "loss": 0.8266, + "num_input_tokens_seen": 5397272, + "step": 9295 + }, + { + "epoch": 1.3851653261840928, + "grad_norm": 6.087865829467773, + "learning_rate": 3.462540959189753e-05, + "loss": 0.5416, + "num_input_tokens_seen": 5400024, + "step": 9300 + }, + { + "epoch": 1.385910038725052, + "grad_norm": 6.327996730804443, + "learning_rate": 3.464402740542151e-05, + "loss": 0.5952, + "num_input_tokens_seen": 5402872, + "step": 9305 + }, + { + "epoch": 1.3866547512660112, + "grad_norm": 8.36737060546875, + "learning_rate": 3.4662645218945486e-05, + "loss": 0.6829, + "num_input_tokens_seen": 5405656, + "step": 9310 + }, + { + "epoch": 1.3873994638069704, + "grad_norm": 10.502570152282715, + "learning_rate": 3.468126303246947e-05, + "loss": 0.6455, + "num_input_tokens_seen": 5408696, + "step": 9315 + }, + { + "epoch": 1.3881441763479296, + "grad_norm": 6.865245819091797, + "learning_rate": 3.469988084599345e-05, + "loss": 0.509, + "num_input_tokens_seen": 5411608, + "step": 9320 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 5.8574538230896, + "learning_rate": 3.471849865951743e-05, + "loss": 0.7265, + "num_input_tokens_seen": 5414936, + "step": 9325 + }, + { + "epoch": 1.389633601429848, + "grad_norm": 9.583087921142578, + "learning_rate": 3.4737116473041406e-05, + "loss": 0.7421, + "num_input_tokens_seen": 5417592, + "step": 9330 + }, + { + "epoch": 1.3903783139708072, + "grad_norm": 8.534554481506348, + "learning_rate": 3.475573428656539e-05, + "loss": 0.7388, + "num_input_tokens_seen": 5420344, + "step": 9335 + }, + { + "epoch": 1.3911230265117664, + "grad_norm": 6.843444347381592, + "learning_rate": 3.477435210008937e-05, + "loss": 0.4723, + "num_input_tokens_seen": 5423416, + "step": 9340 + }, + { + "epoch": 1.3918677390527256, + "grad_norm": 5.663025379180908, + "learning_rate": 3.479296991361335e-05, + "loss": 0.7678, + "num_input_tokens_seen": 5426264, + "step": 9345 + }, + { + "epoch": 1.3926124515936849, + "grad_norm": 11.255615234375, + "learning_rate": 3.4811587727137326e-05, + "loss": 0.5338, + "num_input_tokens_seen": 5429496, + "step": 9350 + }, + { + "epoch": 1.393357164134644, + "grad_norm": 5.803074836730957, + "learning_rate": 3.4830205540661304e-05, + "loss": 0.6133, + "num_input_tokens_seen": 5432376, + "step": 9355 + }, + { + "epoch": 1.3941018766756033, + "grad_norm": 7.824141979217529, + "learning_rate": 3.484882335418528e-05, + "loss": 0.7119, + "num_input_tokens_seen": 5435288, + "step": 9360 + }, + { + "epoch": 1.3948465892165625, + "grad_norm": 7.968044281005859, + "learning_rate": 3.486744116770926e-05, + "loss": 0.5429, + "num_input_tokens_seen": 5438264, + "step": 9365 + }, + { + "epoch": 1.3955913017575217, + "grad_norm": 7.516453742980957, + "learning_rate": 3.4886058981233246e-05, + "loss": 0.67, + "num_input_tokens_seen": 5441208, + "step": 9370 + }, + { + "epoch": 1.3963360142984809, + "grad_norm": 4.694659233093262, + "learning_rate": 3.4904676794757224e-05, + "loss": 0.4738, + "num_input_tokens_seen": 5444120, + "step": 9375 + }, + { + "epoch": 1.39708072683944, + "grad_norm": 4.894054889678955, + "learning_rate": 3.49232946082812e-05, + "loss": 0.6005, + "num_input_tokens_seen": 5446904, + "step": 9380 + }, + { + "epoch": 1.3978254393803993, + "grad_norm": 10.027790069580078, + "learning_rate": 3.494191242180519e-05, + "loss": 0.8917, + "num_input_tokens_seen": 5449720, + "step": 9385 + }, + { + "epoch": 1.3985701519213585, + "grad_norm": 4.262686252593994, + "learning_rate": 3.4960530235329165e-05, + "loss": 0.6318, + "num_input_tokens_seen": 5452536, + "step": 9390 + }, + { + "epoch": 1.3993148644623177, + "grad_norm": 8.719966888427734, + "learning_rate": 3.4979148048853143e-05, + "loss": 0.6592, + "num_input_tokens_seen": 5455224, + "step": 9395 + }, + { + "epoch": 1.4000595770032767, + "grad_norm": 5.456294536590576, + "learning_rate": 3.499776586237712e-05, + "loss": 0.6446, + "num_input_tokens_seen": 5458264, + "step": 9400 + }, + { + "epoch": 1.4008042895442359, + "grad_norm": 4.7967143058776855, + "learning_rate": 3.501638367590111e-05, + "loss": 0.8807, + "num_input_tokens_seen": 5460952, + "step": 9405 + }, + { + "epoch": 1.401549002085195, + "grad_norm": 7.854728698730469, + "learning_rate": 3.5035001489425085e-05, + "loss": 0.6391, + "num_input_tokens_seen": 5463576, + "step": 9410 + }, + { + "epoch": 1.4022937146261543, + "grad_norm": 6.9106903076171875, + "learning_rate": 3.505361930294906e-05, + "loss": 0.6898, + "num_input_tokens_seen": 5466296, + "step": 9415 + }, + { + "epoch": 1.4030384271671135, + "grad_norm": 3.5839080810546875, + "learning_rate": 3.507223711647304e-05, + "loss": 0.7233, + "num_input_tokens_seen": 5469112, + "step": 9420 + }, + { + "epoch": 1.4037831397080727, + "grad_norm": 4.7123284339904785, + "learning_rate": 3.5090854929997026e-05, + "loss": 0.5515, + "num_input_tokens_seen": 5471864, + "step": 9425 + }, + { + "epoch": 1.4045278522490319, + "grad_norm": 7.353683948516846, + "learning_rate": 3.5109472743521005e-05, + "loss": 0.6923, + "num_input_tokens_seen": 5474712, + "step": 9430 + }, + { + "epoch": 1.405272564789991, + "grad_norm": 4.084010601043701, + "learning_rate": 3.512809055704498e-05, + "loss": 0.5543, + "num_input_tokens_seen": 5477336, + "step": 9435 + }, + { + "epoch": 1.4060172773309503, + "grad_norm": 5.679144382476807, + "learning_rate": 3.514670837056897e-05, + "loss": 0.5723, + "num_input_tokens_seen": 5480440, + "step": 9440 + }, + { + "epoch": 1.4067619898719095, + "grad_norm": 4.665407180786133, + "learning_rate": 3.516532618409294e-05, + "loss": 0.6089, + "num_input_tokens_seen": 5483640, + "step": 9445 + }, + { + "epoch": 1.4075067024128687, + "grad_norm": 7.860484600067139, + "learning_rate": 3.518394399761692e-05, + "loss": 0.7441, + "num_input_tokens_seen": 5486520, + "step": 9450 + }, + { + "epoch": 1.4082514149538279, + "grad_norm": 11.472172737121582, + "learning_rate": 3.52025618111409e-05, + "loss": 0.4954, + "num_input_tokens_seen": 5489272, + "step": 9455 + }, + { + "epoch": 1.408996127494787, + "grad_norm": 6.341667652130127, + "learning_rate": 3.522117962466488e-05, + "loss": 0.9482, + "num_input_tokens_seen": 5492632, + "step": 9460 + }, + { + "epoch": 1.409740840035746, + "grad_norm": 5.530130863189697, + "learning_rate": 3.523979743818886e-05, + "loss": 0.4728, + "num_input_tokens_seen": 5495480, + "step": 9465 + }, + { + "epoch": 1.4104855525767053, + "grad_norm": 9.150032997131348, + "learning_rate": 3.525841525171284e-05, + "loss": 0.6162, + "num_input_tokens_seen": 5498264, + "step": 9470 + }, + { + "epoch": 1.4112302651176645, + "grad_norm": 7.021644592285156, + "learning_rate": 3.527703306523682e-05, + "loss": 0.5696, + "num_input_tokens_seen": 5501080, + "step": 9475 + }, + { + "epoch": 1.4119749776586237, + "grad_norm": 10.137161254882812, + "learning_rate": 3.52956508787608e-05, + "loss": 0.6879, + "num_input_tokens_seen": 5503896, + "step": 9480 + }, + { + "epoch": 1.4127196901995829, + "grad_norm": 5.724578857421875, + "learning_rate": 3.531426869228478e-05, + "loss": 0.5774, + "num_input_tokens_seen": 5507032, + "step": 9485 + }, + { + "epoch": 1.413464402740542, + "grad_norm": 8.206648826599121, + "learning_rate": 3.533288650580876e-05, + "loss": 0.4453, + "num_input_tokens_seen": 5509944, + "step": 9490 + }, + { + "epoch": 1.4142091152815013, + "grad_norm": 6.296883583068848, + "learning_rate": 3.535150431933274e-05, + "loss": 0.5175, + "num_input_tokens_seen": 5512920, + "step": 9495 + }, + { + "epoch": 1.4149538278224605, + "grad_norm": 7.953074932098389, + "learning_rate": 3.537012213285672e-05, + "loss": 0.6598, + "num_input_tokens_seen": 5515960, + "step": 9500 + }, + { + "epoch": 1.4156985403634197, + "grad_norm": 4.999081134796143, + "learning_rate": 3.53887399463807e-05, + "loss": 0.5837, + "num_input_tokens_seen": 5518968, + "step": 9505 + }, + { + "epoch": 1.416443252904379, + "grad_norm": 7.259735107421875, + "learning_rate": 3.5407357759904683e-05, + "loss": 0.423, + "num_input_tokens_seen": 5521816, + "step": 9510 + }, + { + "epoch": 1.417187965445338, + "grad_norm": 7.926835060119629, + "learning_rate": 3.542597557342866e-05, + "loss": 0.6074, + "num_input_tokens_seen": 5524504, + "step": 9515 + }, + { + "epoch": 1.4179326779862973, + "grad_norm": 11.656402587890625, + "learning_rate": 3.544459338695264e-05, + "loss": 0.7439, + "num_input_tokens_seen": 5527320, + "step": 9520 + }, + { + "epoch": 1.4186773905272565, + "grad_norm": 7.398606777191162, + "learning_rate": 3.546321120047662e-05, + "loss": 0.7093, + "num_input_tokens_seen": 5530040, + "step": 9525 + }, + { + "epoch": 1.4194221030682157, + "grad_norm": 4.174770832061768, + "learning_rate": 3.5481829014000596e-05, + "loss": 0.694, + "num_input_tokens_seen": 5532888, + "step": 9530 + }, + { + "epoch": 1.420166815609175, + "grad_norm": 4.615589141845703, + "learning_rate": 3.5500446827524575e-05, + "loss": 0.6742, + "num_input_tokens_seen": 5535992, + "step": 9535 + }, + { + "epoch": 1.420911528150134, + "grad_norm": 5.827035903930664, + "learning_rate": 3.551906464104855e-05, + "loss": 0.6789, + "num_input_tokens_seen": 5538872, + "step": 9540 + }, + { + "epoch": 1.4216562406910933, + "grad_norm": 10.587703704833984, + "learning_rate": 3.553768245457254e-05, + "loss": 0.8041, + "num_input_tokens_seen": 5541720, + "step": 9545 + }, + { + "epoch": 1.4224009532320525, + "grad_norm": 4.136136054992676, + "learning_rate": 3.5556300268096516e-05, + "loss": 0.6777, + "num_input_tokens_seen": 5544920, + "step": 9550 + }, + { + "epoch": 1.4231456657730117, + "grad_norm": 4.618213176727295, + "learning_rate": 3.5574918081620494e-05, + "loss": 0.5772, + "num_input_tokens_seen": 5547864, + "step": 9555 + }, + { + "epoch": 1.423890378313971, + "grad_norm": 6.211966037750244, + "learning_rate": 3.559353589514447e-05, + "loss": 0.7015, + "num_input_tokens_seen": 5550744, + "step": 9560 + }, + { + "epoch": 1.4246350908549301, + "grad_norm": 4.985563278198242, + "learning_rate": 3.561215370866846e-05, + "loss": 0.5935, + "num_input_tokens_seen": 5553560, + "step": 9565 + }, + { + "epoch": 1.4253798033958893, + "grad_norm": 8.832660675048828, + "learning_rate": 3.5630771522192436e-05, + "loss": 0.5861, + "num_input_tokens_seen": 5556408, + "step": 9570 + }, + { + "epoch": 1.4261245159368483, + "grad_norm": 8.382359504699707, + "learning_rate": 3.5649389335716414e-05, + "loss": 0.7065, + "num_input_tokens_seen": 5559256, + "step": 9575 + }, + { + "epoch": 1.4268692284778075, + "grad_norm": 3.6421685218811035, + "learning_rate": 3.566800714924039e-05, + "loss": 0.5321, + "num_input_tokens_seen": 5562040, + "step": 9580 + }, + { + "epoch": 1.4276139410187667, + "grad_norm": 8.820159912109375, + "learning_rate": 3.568662496276438e-05, + "loss": 0.4713, + "num_input_tokens_seen": 5564984, + "step": 9585 + }, + { + "epoch": 1.428358653559726, + "grad_norm": 6.644659042358398, + "learning_rate": 3.5705242776288356e-05, + "loss": 0.5069, + "num_input_tokens_seen": 5567640, + "step": 9590 + }, + { + "epoch": 1.4291033661006851, + "grad_norm": 4.3762102127075195, + "learning_rate": 3.5723860589812334e-05, + "loss": 0.7849, + "num_input_tokens_seen": 5570456, + "step": 9595 + }, + { + "epoch": 1.4298480786416443, + "grad_norm": 7.809296131134033, + "learning_rate": 3.574247840333632e-05, + "loss": 0.4315, + "num_input_tokens_seen": 5573048, + "step": 9600 + }, + { + "epoch": 1.4305927911826035, + "grad_norm": 6.086071491241455, + "learning_rate": 3.57610962168603e-05, + "loss": 0.5993, + "num_input_tokens_seen": 5575832, + "step": 9605 + }, + { + "epoch": 1.4313375037235627, + "grad_norm": 10.555582046508789, + "learning_rate": 3.5779714030384275e-05, + "loss": 0.7537, + "num_input_tokens_seen": 5578456, + "step": 9610 + }, + { + "epoch": 1.432082216264522, + "grad_norm": 4.853938102722168, + "learning_rate": 3.5798331843908253e-05, + "loss": 0.6034, + "num_input_tokens_seen": 5581560, + "step": 9615 + }, + { + "epoch": 1.4328269288054811, + "grad_norm": 7.450983047485352, + "learning_rate": 3.581694965743223e-05, + "loss": 0.5134, + "num_input_tokens_seen": 5584280, + "step": 9620 + }, + { + "epoch": 1.4335716413464403, + "grad_norm": 10.53225326538086, + "learning_rate": 3.583556747095621e-05, + "loss": 0.5379, + "num_input_tokens_seen": 5586968, + "step": 9625 + }, + { + "epoch": 1.4343163538873995, + "grad_norm": 6.536792278289795, + "learning_rate": 3.585418528448019e-05, + "loss": 0.5782, + "num_input_tokens_seen": 5589688, + "step": 9630 + }, + { + "epoch": 1.4350610664283587, + "grad_norm": 6.975749969482422, + "learning_rate": 3.587280309800417e-05, + "loss": 0.6532, + "num_input_tokens_seen": 5592504, + "step": 9635 + }, + { + "epoch": 1.4358057789693177, + "grad_norm": 5.623785495758057, + "learning_rate": 3.589142091152815e-05, + "loss": 0.4705, + "num_input_tokens_seen": 5595320, + "step": 9640 + }, + { + "epoch": 1.436550491510277, + "grad_norm": 3.669797897338867, + "learning_rate": 3.591003872505213e-05, + "loss": 0.4713, + "num_input_tokens_seen": 5598296, + "step": 9645 + }, + { + "epoch": 1.4372952040512361, + "grad_norm": 13.703093528747559, + "learning_rate": 3.592865653857611e-05, + "loss": 0.5446, + "num_input_tokens_seen": 5601240, + "step": 9650 + }, + { + "epoch": 1.4380399165921953, + "grad_norm": 4.999536514282227, + "learning_rate": 3.594727435210009e-05, + "loss": 0.5645, + "num_input_tokens_seen": 5604152, + "step": 9655 + }, + { + "epoch": 1.4387846291331545, + "grad_norm": 10.92524242401123, + "learning_rate": 3.596589216562407e-05, + "loss": 0.6005, + "num_input_tokens_seen": 5607000, + "step": 9660 + }, + { + "epoch": 1.4395293416741137, + "grad_norm": 12.866111755371094, + "learning_rate": 3.598450997914805e-05, + "loss": 0.6586, + "num_input_tokens_seen": 5609720, + "step": 9665 + }, + { + "epoch": 1.440274054215073, + "grad_norm": 7.090954303741455, + "learning_rate": 3.6003127792672034e-05, + "loss": 0.5386, + "num_input_tokens_seen": 5612632, + "step": 9670 + }, + { + "epoch": 1.4410187667560321, + "grad_norm": 10.232772827148438, + "learning_rate": 3.602174560619601e-05, + "loss": 0.8089, + "num_input_tokens_seen": 5615480, + "step": 9675 + }, + { + "epoch": 1.4417634792969913, + "grad_norm": 6.065643787384033, + "learning_rate": 3.604036341971999e-05, + "loss": 0.4477, + "num_input_tokens_seen": 5618200, + "step": 9680 + }, + { + "epoch": 1.4425081918379505, + "grad_norm": 9.24611759185791, + "learning_rate": 3.605898123324397e-05, + "loss": 0.4855, + "num_input_tokens_seen": 5621048, + "step": 9685 + }, + { + "epoch": 1.4432529043789097, + "grad_norm": 16.21629524230957, + "learning_rate": 3.6077599046767954e-05, + "loss": 0.7969, + "num_input_tokens_seen": 5624088, + "step": 9690 + }, + { + "epoch": 1.443997616919869, + "grad_norm": 4.91889762878418, + "learning_rate": 3.609621686029193e-05, + "loss": 0.6947, + "num_input_tokens_seen": 5627064, + "step": 9695 + }, + { + "epoch": 1.4447423294608281, + "grad_norm": 5.976571559906006, + "learning_rate": 3.6114834673815904e-05, + "loss": 0.6034, + "num_input_tokens_seen": 5630456, + "step": 9700 + }, + { + "epoch": 1.4454870420017873, + "grad_norm": 8.14208698272705, + "learning_rate": 3.613345248733989e-05, + "loss": 0.5959, + "num_input_tokens_seen": 5633464, + "step": 9705 + }, + { + "epoch": 1.4462317545427466, + "grad_norm": 4.4320855140686035, + "learning_rate": 3.615207030086387e-05, + "loss": 0.5011, + "num_input_tokens_seen": 5636376, + "step": 9710 + }, + { + "epoch": 1.4469764670837058, + "grad_norm": 6.469446182250977, + "learning_rate": 3.6170688114387845e-05, + "loss": 0.6572, + "num_input_tokens_seen": 5639256, + "step": 9715 + }, + { + "epoch": 1.447721179624665, + "grad_norm": 6.758541584014893, + "learning_rate": 3.6189305927911823e-05, + "loss": 0.6701, + "num_input_tokens_seen": 5641944, + "step": 9720 + }, + { + "epoch": 1.4484658921656242, + "grad_norm": 7.322132110595703, + "learning_rate": 3.620792374143581e-05, + "loss": 0.7374, + "num_input_tokens_seen": 5644696, + "step": 9725 + }, + { + "epoch": 1.4492106047065834, + "grad_norm": 4.23908805847168, + "learning_rate": 3.622654155495979e-05, + "loss": 0.7811, + "num_input_tokens_seen": 5647800, + "step": 9730 + }, + { + "epoch": 1.4499553172475426, + "grad_norm": 5.37538480758667, + "learning_rate": 3.6245159368483765e-05, + "loss": 0.5327, + "num_input_tokens_seen": 5650776, + "step": 9735 + }, + { + "epoch": 1.4507000297885018, + "grad_norm": 3.1671979427337646, + "learning_rate": 3.626377718200774e-05, + "loss": 0.5267, + "num_input_tokens_seen": 5653592, + "step": 9740 + }, + { + "epoch": 1.4514447423294607, + "grad_norm": 5.174196243286133, + "learning_rate": 3.628239499553173e-05, + "loss": 0.6986, + "num_input_tokens_seen": 5656568, + "step": 9745 + }, + { + "epoch": 1.45218945487042, + "grad_norm": 6.570372104644775, + "learning_rate": 3.6301012809055706e-05, + "loss": 0.4957, + "num_input_tokens_seen": 5659256, + "step": 9750 + }, + { + "epoch": 1.4529341674113792, + "grad_norm": 5.5297017097473145, + "learning_rate": 3.6319630622579685e-05, + "loss": 0.5651, + "num_input_tokens_seen": 5662392, + "step": 9755 + }, + { + "epoch": 1.4536788799523384, + "grad_norm": 4.121198654174805, + "learning_rate": 3.633824843610367e-05, + "loss": 0.5012, + "num_input_tokens_seen": 5665112, + "step": 9760 + }, + { + "epoch": 1.4544235924932976, + "grad_norm": 13.839810371398926, + "learning_rate": 3.635686624962765e-05, + "loss": 0.6724, + "num_input_tokens_seen": 5667928, + "step": 9765 + }, + { + "epoch": 1.4551683050342568, + "grad_norm": 10.454610824584961, + "learning_rate": 3.6375484063151626e-05, + "loss": 0.6747, + "num_input_tokens_seen": 5670968, + "step": 9770 + }, + { + "epoch": 1.455913017575216, + "grad_norm": 7.0895538330078125, + "learning_rate": 3.6394101876675604e-05, + "loss": 0.7152, + "num_input_tokens_seen": 5674008, + "step": 9775 + }, + { + "epoch": 1.4566577301161752, + "grad_norm": 4.306983947753906, + "learning_rate": 3.641271969019959e-05, + "loss": 0.5409, + "num_input_tokens_seen": 5677016, + "step": 9780 + }, + { + "epoch": 1.4574024426571344, + "grad_norm": 11.06571102142334, + "learning_rate": 3.643133750372357e-05, + "loss": 0.8994, + "num_input_tokens_seen": 5680216, + "step": 9785 + }, + { + "epoch": 1.4581471551980936, + "grad_norm": 4.363958835601807, + "learning_rate": 3.644995531724754e-05, + "loss": 0.7083, + "num_input_tokens_seen": 5683320, + "step": 9790 + }, + { + "epoch": 1.4588918677390528, + "grad_norm": 7.831691741943359, + "learning_rate": 3.6468573130771524e-05, + "loss": 0.6311, + "num_input_tokens_seen": 5685880, + "step": 9795 + }, + { + "epoch": 1.459636580280012, + "grad_norm": 7.883718490600586, + "learning_rate": 3.64871909442955e-05, + "loss": 0.6787, + "num_input_tokens_seen": 5688792, + "step": 9800 + }, + { + "epoch": 1.4603812928209712, + "grad_norm": 4.010368824005127, + "learning_rate": 3.650580875781948e-05, + "loss": 0.7001, + "num_input_tokens_seen": 5691608, + "step": 9805 + }, + { + "epoch": 1.4611260053619302, + "grad_norm": 4.555394649505615, + "learning_rate": 3.652442657134346e-05, + "loss": 0.6843, + "num_input_tokens_seen": 5694456, + "step": 9810 + }, + { + "epoch": 1.4618707179028894, + "grad_norm": 7.975205898284912, + "learning_rate": 3.6543044384867444e-05, + "loss": 0.6818, + "num_input_tokens_seen": 5697496, + "step": 9815 + }, + { + "epoch": 1.4626154304438486, + "grad_norm": 15.90749454498291, + "learning_rate": 3.656166219839142e-05, + "loss": 0.6308, + "num_input_tokens_seen": 5700440, + "step": 9820 + }, + { + "epoch": 1.4633601429848078, + "grad_norm": 6.5506086349487305, + "learning_rate": 3.65802800119154e-05, + "loss": 0.7702, + "num_input_tokens_seen": 5703032, + "step": 9825 + }, + { + "epoch": 1.464104855525767, + "grad_norm": 3.190469980239868, + "learning_rate": 3.6598897825439385e-05, + "loss": 0.6211, + "num_input_tokens_seen": 5705784, + "step": 9830 + }, + { + "epoch": 1.4648495680667262, + "grad_norm": 8.47060775756836, + "learning_rate": 3.6617515638963363e-05, + "loss": 0.6134, + "num_input_tokens_seen": 5708504, + "step": 9835 + }, + { + "epoch": 1.4655942806076854, + "grad_norm": 6.143134117126465, + "learning_rate": 3.663613345248734e-05, + "loss": 0.7623, + "num_input_tokens_seen": 5711352, + "step": 9840 + }, + { + "epoch": 1.4663389931486446, + "grad_norm": 8.661123275756836, + "learning_rate": 3.665475126601132e-05, + "loss": 0.6705, + "num_input_tokens_seen": 5714072, + "step": 9845 + }, + { + "epoch": 1.4670837056896038, + "grad_norm": 5.433502674102783, + "learning_rate": 3.6673369079535305e-05, + "loss": 0.6369, + "num_input_tokens_seen": 5717432, + "step": 9850 + }, + { + "epoch": 1.467828418230563, + "grad_norm": 5.955674648284912, + "learning_rate": 3.669198689305928e-05, + "loss": 0.6636, + "num_input_tokens_seen": 5720120, + "step": 9855 + }, + { + "epoch": 1.4685731307715222, + "grad_norm": 2.0660593509674072, + "learning_rate": 3.671060470658326e-05, + "loss": 0.6998, + "num_input_tokens_seen": 5723096, + "step": 9860 + }, + { + "epoch": 1.4693178433124814, + "grad_norm": 3.0670080184936523, + "learning_rate": 3.672922252010724e-05, + "loss": 0.5776, + "num_input_tokens_seen": 5726040, + "step": 9865 + }, + { + "epoch": 1.4700625558534406, + "grad_norm": 4.324955463409424, + "learning_rate": 3.6747840333631225e-05, + "loss": 0.733, + "num_input_tokens_seen": 5729240, + "step": 9870 + }, + { + "epoch": 1.4708072683943998, + "grad_norm": 2.8858256340026855, + "learning_rate": 3.6766458147155196e-05, + "loss": 0.7582, + "num_input_tokens_seen": 5732248, + "step": 9875 + }, + { + "epoch": 1.471551980935359, + "grad_norm": 7.401158332824707, + "learning_rate": 3.6785075960679174e-05, + "loss": 0.5972, + "num_input_tokens_seen": 5735288, + "step": 9880 + }, + { + "epoch": 1.4722966934763182, + "grad_norm": 4.514736652374268, + "learning_rate": 3.680369377420316e-05, + "loss": 0.6237, + "num_input_tokens_seen": 5738264, + "step": 9885 + }, + { + "epoch": 1.4730414060172774, + "grad_norm": 5.879607677459717, + "learning_rate": 3.682231158772714e-05, + "loss": 0.8581, + "num_input_tokens_seen": 5741432, + "step": 9890 + }, + { + "epoch": 1.4737861185582366, + "grad_norm": 7.635617256164551, + "learning_rate": 3.6840929401251116e-05, + "loss": 0.7331, + "num_input_tokens_seen": 5744344, + "step": 9895 + }, + { + "epoch": 1.4745308310991958, + "grad_norm": 3.94044828414917, + "learning_rate": 3.68595472147751e-05, + "loss": 0.7513, + "num_input_tokens_seen": 5747416, + "step": 9900 + }, + { + "epoch": 1.475275543640155, + "grad_norm": 5.470749855041504, + "learning_rate": 3.687816502829908e-05, + "loss": 0.4751, + "num_input_tokens_seen": 5750040, + "step": 9905 + }, + { + "epoch": 1.4760202561811142, + "grad_norm": 10.703453063964844, + "learning_rate": 3.689678284182306e-05, + "loss": 0.6413, + "num_input_tokens_seen": 5752856, + "step": 9910 + }, + { + "epoch": 1.4767649687220734, + "grad_norm": 4.947269916534424, + "learning_rate": 3.6915400655347035e-05, + "loss": 0.6879, + "num_input_tokens_seen": 5755864, + "step": 9915 + }, + { + "epoch": 1.4775096812630324, + "grad_norm": 8.231094360351562, + "learning_rate": 3.693401846887102e-05, + "loss": 0.7878, + "num_input_tokens_seen": 5758584, + "step": 9920 + }, + { + "epoch": 1.4782543938039916, + "grad_norm": 2.996584415435791, + "learning_rate": 3.6952636282395e-05, + "loss": 0.5539, + "num_input_tokens_seen": 5761464, + "step": 9925 + }, + { + "epoch": 1.4789991063449508, + "grad_norm": 5.4329609870910645, + "learning_rate": 3.697125409591898e-05, + "loss": 0.6338, + "num_input_tokens_seen": 5764120, + "step": 9930 + }, + { + "epoch": 1.47974381888591, + "grad_norm": 4.698413848876953, + "learning_rate": 3.6989871909442955e-05, + "loss": 0.6711, + "num_input_tokens_seen": 5766840, + "step": 9935 + }, + { + "epoch": 1.4804885314268692, + "grad_norm": 3.7080070972442627, + "learning_rate": 3.700848972296694e-05, + "loss": 0.564, + "num_input_tokens_seen": 5770296, + "step": 9940 + }, + { + "epoch": 1.4812332439678284, + "grad_norm": 4.606230735778809, + "learning_rate": 3.702710753649092e-05, + "loss": 0.4995, + "num_input_tokens_seen": 5772984, + "step": 9945 + }, + { + "epoch": 1.4819779565087876, + "grad_norm": 4.251567363739014, + "learning_rate": 3.70457253500149e-05, + "loss": 0.5814, + "num_input_tokens_seen": 5775800, + "step": 9950 + }, + { + "epoch": 1.4827226690497468, + "grad_norm": 4.451828956604004, + "learning_rate": 3.7064343163538875e-05, + "loss": 0.5624, + "num_input_tokens_seen": 5778712, + "step": 9955 + }, + { + "epoch": 1.483467381590706, + "grad_norm": 10.672261238098145, + "learning_rate": 3.708296097706285e-05, + "loss": 0.6454, + "num_input_tokens_seen": 5781528, + "step": 9960 + }, + { + "epoch": 1.4842120941316652, + "grad_norm": 3.713961124420166, + "learning_rate": 3.710157879058683e-05, + "loss": 0.6844, + "num_input_tokens_seen": 5784728, + "step": 9965 + }, + { + "epoch": 1.4849568066726244, + "grad_norm": 4.223936080932617, + "learning_rate": 3.712019660411081e-05, + "loss": 0.5318, + "num_input_tokens_seen": 5787800, + "step": 9970 + }, + { + "epoch": 1.4857015192135836, + "grad_norm": 6.5641865730285645, + "learning_rate": 3.7138814417634795e-05, + "loss": 0.6386, + "num_input_tokens_seen": 5791064, + "step": 9975 + }, + { + "epoch": 1.4864462317545428, + "grad_norm": 11.196253776550293, + "learning_rate": 3.715743223115877e-05, + "loss": 0.4866, + "num_input_tokens_seen": 5793880, + "step": 9980 + }, + { + "epoch": 1.4871909442955018, + "grad_norm": 7.7198381423950195, + "learning_rate": 3.717605004468275e-05, + "loss": 0.7081, + "num_input_tokens_seen": 5796696, + "step": 9985 + }, + { + "epoch": 1.487935656836461, + "grad_norm": 14.515464782714844, + "learning_rate": 3.7194667858206736e-05, + "loss": 0.7215, + "num_input_tokens_seen": 5799768, + "step": 9990 + }, + { + "epoch": 1.4886803693774202, + "grad_norm": 5.20766019821167, + "learning_rate": 3.7213285671730714e-05, + "loss": 0.6412, + "num_input_tokens_seen": 5802488, + "step": 9995 + }, + { + "epoch": 1.4894250819183794, + "grad_norm": 4.459023952484131, + "learning_rate": 3.723190348525469e-05, + "loss": 0.6415, + "num_input_tokens_seen": 5805560, + "step": 10000 + }, + { + "epoch": 1.4901697944593386, + "grad_norm": 10.102836608886719, + "learning_rate": 3.725052129877867e-05, + "loss": 0.622, + "num_input_tokens_seen": 5808504, + "step": 10005 + }, + { + "epoch": 1.4909145070002978, + "grad_norm": 5.319465637207031, + "learning_rate": 3.7269139112302656e-05, + "loss": 0.6119, + "num_input_tokens_seen": 5811000, + "step": 10010 + }, + { + "epoch": 1.491659219541257, + "grad_norm": 7.194521903991699, + "learning_rate": 3.7287756925826634e-05, + "loss": 0.6474, + "num_input_tokens_seen": 5813720, + "step": 10015 + }, + { + "epoch": 1.4924039320822162, + "grad_norm": 7.457472801208496, + "learning_rate": 3.730637473935061e-05, + "loss": 0.5511, + "num_input_tokens_seen": 5816792, + "step": 10020 + }, + { + "epoch": 1.4931486446231754, + "grad_norm": 3.5889322757720947, + "learning_rate": 3.732499255287459e-05, + "loss": 0.6652, + "num_input_tokens_seen": 5819544, + "step": 10025 + }, + { + "epoch": 1.4938933571641346, + "grad_norm": 5.350255966186523, + "learning_rate": 3.7343610366398575e-05, + "loss": 0.7201, + "num_input_tokens_seen": 5822936, + "step": 10030 + }, + { + "epoch": 1.4946380697050938, + "grad_norm": 4.383955478668213, + "learning_rate": 3.7362228179922554e-05, + "loss": 0.6263, + "num_input_tokens_seen": 5825720, + "step": 10035 + }, + { + "epoch": 1.495382782246053, + "grad_norm": 3.082062244415283, + "learning_rate": 3.738084599344653e-05, + "loss": 0.7326, + "num_input_tokens_seen": 5829240, + "step": 10040 + }, + { + "epoch": 1.4961274947870122, + "grad_norm": 8.164531707763672, + "learning_rate": 3.739946380697052e-05, + "loss": 0.5896, + "num_input_tokens_seen": 5832280, + "step": 10045 + }, + { + "epoch": 1.4968722073279714, + "grad_norm": 4.256753921508789, + "learning_rate": 3.741808162049449e-05, + "loss": 0.7157, + "num_input_tokens_seen": 5835160, + "step": 10050 + }, + { + "epoch": 1.4976169198689306, + "grad_norm": 5.200985908508301, + "learning_rate": 3.743669943401847e-05, + "loss": 0.5612, + "num_input_tokens_seen": 5838072, + "step": 10055 + }, + { + "epoch": 1.4983616324098898, + "grad_norm": 6.497515678405762, + "learning_rate": 3.745531724754245e-05, + "loss": 0.5188, + "num_input_tokens_seen": 5840888, + "step": 10060 + }, + { + "epoch": 1.499106344950849, + "grad_norm": 8.198525428771973, + "learning_rate": 3.747393506106643e-05, + "loss": 0.6826, + "num_input_tokens_seen": 5843864, + "step": 10065 + }, + { + "epoch": 1.4998510574918082, + "grad_norm": 11.55627727508545, + "learning_rate": 3.749255287459041e-05, + "loss": 0.8615, + "num_input_tokens_seen": 5846360, + "step": 10070 + }, + { + "epoch": 1.5005957700327675, + "grad_norm": 5.220102787017822, + "learning_rate": 3.7511170688114386e-05, + "loss": 0.5649, + "num_input_tokens_seen": 5849304, + "step": 10075 + }, + { + "epoch": 1.5013404825737267, + "grad_norm": 4.248427867889404, + "learning_rate": 3.752978850163837e-05, + "loss": 0.5981, + "num_input_tokens_seen": 5852408, + "step": 10080 + }, + { + "epoch": 1.5020851951146859, + "grad_norm": 4.711069583892822, + "learning_rate": 3.754840631516235e-05, + "loss": 0.756, + "num_input_tokens_seen": 5855576, + "step": 10085 + }, + { + "epoch": 1.502829907655645, + "grad_norm": 4.5435004234313965, + "learning_rate": 3.756702412868633e-05, + "loss": 0.6447, + "num_input_tokens_seen": 5858520, + "step": 10090 + }, + { + "epoch": 1.5035746201966043, + "grad_norm": 4.492320537567139, + "learning_rate": 3.7585641942210306e-05, + "loss": 0.5907, + "num_input_tokens_seen": 5861304, + "step": 10095 + }, + { + "epoch": 1.5043193327375635, + "grad_norm": 5.249291896820068, + "learning_rate": 3.760425975573429e-05, + "loss": 0.5759, + "num_input_tokens_seen": 5864184, + "step": 10100 + }, + { + "epoch": 1.5050640452785224, + "grad_norm": 4.38117790222168, + "learning_rate": 3.762287756925827e-05, + "loss": 0.4906, + "num_input_tokens_seen": 5866968, + "step": 10105 + }, + { + "epoch": 1.5058087578194816, + "grad_norm": 6.380138874053955, + "learning_rate": 3.764149538278225e-05, + "loss": 0.5825, + "num_input_tokens_seen": 5869912, + "step": 10110 + }, + { + "epoch": 1.5065534703604408, + "grad_norm": 4.324146747589111, + "learning_rate": 3.766011319630623e-05, + "loss": 0.8616, + "num_input_tokens_seen": 5872984, + "step": 10115 + }, + { + "epoch": 1.5072981829014, + "grad_norm": 4.8174848556518555, + "learning_rate": 3.767873100983021e-05, + "loss": 0.6051, + "num_input_tokens_seen": 5875960, + "step": 10120 + }, + { + "epoch": 1.5080428954423593, + "grad_norm": 9.267452239990234, + "learning_rate": 3.769734882335419e-05, + "loss": 0.6939, + "num_input_tokens_seen": 5879128, + "step": 10125 + }, + { + "epoch": 1.5087876079833185, + "grad_norm": 10.220256805419922, + "learning_rate": 3.771596663687817e-05, + "loss": 0.5654, + "num_input_tokens_seen": 5882040, + "step": 10130 + }, + { + "epoch": 1.5095323205242777, + "grad_norm": 6.515230178833008, + "learning_rate": 3.7734584450402145e-05, + "loss": 0.6928, + "num_input_tokens_seen": 5884536, + "step": 10135 + }, + { + "epoch": 1.5102770330652369, + "grad_norm": 6.306908130645752, + "learning_rate": 3.7753202263926124e-05, + "loss": 0.6417, + "num_input_tokens_seen": 5887608, + "step": 10140 + }, + { + "epoch": 1.5110217456061958, + "grad_norm": 3.2685985565185547, + "learning_rate": 3.77718200774501e-05, + "loss": 0.726, + "num_input_tokens_seen": 5890392, + "step": 10145 + }, + { + "epoch": 1.511766458147155, + "grad_norm": 2.921295404434204, + "learning_rate": 3.779043789097409e-05, + "loss": 0.5601, + "num_input_tokens_seen": 5893400, + "step": 10150 + }, + { + "epoch": 1.5125111706881142, + "grad_norm": 6.644345760345459, + "learning_rate": 3.7809055704498065e-05, + "loss": 0.63, + "num_input_tokens_seen": 5896280, + "step": 10155 + }, + { + "epoch": 1.5132558832290735, + "grad_norm": 7.081961631774902, + "learning_rate": 3.7827673518022043e-05, + "loss": 0.6872, + "num_input_tokens_seen": 5899000, + "step": 10160 + }, + { + "epoch": 1.5140005957700327, + "grad_norm": 3.9184908866882324, + "learning_rate": 3.784629133154602e-05, + "loss": 0.5878, + "num_input_tokens_seen": 5901720, + "step": 10165 + }, + { + "epoch": 1.5147453083109919, + "grad_norm": 5.195962905883789, + "learning_rate": 3.786490914507001e-05, + "loss": 0.4602, + "num_input_tokens_seen": 5904568, + "step": 10170 + }, + { + "epoch": 1.515490020851951, + "grad_norm": 4.316817760467529, + "learning_rate": 3.7883526958593985e-05, + "loss": 0.7311, + "num_input_tokens_seen": 5907416, + "step": 10175 + }, + { + "epoch": 1.5162347333929103, + "grad_norm": 7.932365417480469, + "learning_rate": 3.790214477211796e-05, + "loss": 0.6622, + "num_input_tokens_seen": 5910360, + "step": 10180 + }, + { + "epoch": 1.5169794459338695, + "grad_norm": 4.354771137237549, + "learning_rate": 3.792076258564194e-05, + "loss": 0.6469, + "num_input_tokens_seen": 5913240, + "step": 10185 + }, + { + "epoch": 1.5177241584748287, + "grad_norm": 1.95461106300354, + "learning_rate": 3.7939380399165926e-05, + "loss": 0.6296, + "num_input_tokens_seen": 5915800, + "step": 10190 + }, + { + "epoch": 1.5184688710157879, + "grad_norm": 2.5716750621795654, + "learning_rate": 3.7957998212689905e-05, + "loss": 0.5764, + "num_input_tokens_seen": 5918552, + "step": 10195 + }, + { + "epoch": 1.519213583556747, + "grad_norm": 4.108082294464111, + "learning_rate": 3.797661602621388e-05, + "loss": 0.6473, + "num_input_tokens_seen": 5921720, + "step": 10200 + }, + { + "epoch": 1.5199582960977063, + "grad_norm": 6.183298110961914, + "learning_rate": 3.799523383973787e-05, + "loss": 0.5049, + "num_input_tokens_seen": 5924504, + "step": 10205 + }, + { + "epoch": 1.5207030086386655, + "grad_norm": 4.260629177093506, + "learning_rate": 3.8013851653261846e-05, + "loss": 0.6881, + "num_input_tokens_seen": 5927384, + "step": 10210 + }, + { + "epoch": 1.5214477211796247, + "grad_norm": 10.125222206115723, + "learning_rate": 3.8032469466785824e-05, + "loss": 0.6179, + "num_input_tokens_seen": 5930008, + "step": 10215 + }, + { + "epoch": 1.5221924337205839, + "grad_norm": 5.2586188316345215, + "learning_rate": 3.80510872803098e-05, + "loss": 0.6495, + "num_input_tokens_seen": 5933016, + "step": 10220 + }, + { + "epoch": 1.522937146261543, + "grad_norm": 3.328000068664551, + "learning_rate": 3.806970509383378e-05, + "loss": 0.7281, + "num_input_tokens_seen": 5936088, + "step": 10225 + }, + { + "epoch": 1.5236818588025023, + "grad_norm": 7.912591934204102, + "learning_rate": 3.808832290735776e-05, + "loss": 0.5415, + "num_input_tokens_seen": 5939032, + "step": 10230 + }, + { + "epoch": 1.5244265713434615, + "grad_norm": 3.667642116546631, + "learning_rate": 3.810694072088174e-05, + "loss": 0.6192, + "num_input_tokens_seen": 5941912, + "step": 10235 + }, + { + "epoch": 1.5251712838844207, + "grad_norm": 5.169541835784912, + "learning_rate": 3.812555853440572e-05, + "loss": 0.7248, + "num_input_tokens_seen": 5945080, + "step": 10240 + }, + { + "epoch": 1.52591599642538, + "grad_norm": 5.5146002769470215, + "learning_rate": 3.81441763479297e-05, + "loss": 0.5515, + "num_input_tokens_seen": 5947832, + "step": 10245 + }, + { + "epoch": 1.526660708966339, + "grad_norm": 4.130377769470215, + "learning_rate": 3.816279416145368e-05, + "loss": 0.5902, + "num_input_tokens_seen": 5950808, + "step": 10250 + }, + { + "epoch": 1.5274054215072983, + "grad_norm": 6.23555326461792, + "learning_rate": 3.818141197497766e-05, + "loss": 0.6497, + "num_input_tokens_seen": 5953528, + "step": 10255 + }, + { + "epoch": 1.5281501340482575, + "grad_norm": 5.751399040222168, + "learning_rate": 3.820002978850164e-05, + "loss": 0.597, + "num_input_tokens_seen": 5956216, + "step": 10260 + }, + { + "epoch": 1.5288948465892167, + "grad_norm": 6.113067626953125, + "learning_rate": 3.821864760202562e-05, + "loss": 0.6218, + "num_input_tokens_seen": 5958840, + "step": 10265 + }, + { + "epoch": 1.529639559130176, + "grad_norm": 4.1609883308410645, + "learning_rate": 3.82372654155496e-05, + "loss": 0.4741, + "num_input_tokens_seen": 5961912, + "step": 10270 + }, + { + "epoch": 1.5303842716711349, + "grad_norm": 6.200786113739014, + "learning_rate": 3.8255883229073583e-05, + "loss": 0.8359, + "num_input_tokens_seen": 5964696, + "step": 10275 + }, + { + "epoch": 1.531128984212094, + "grad_norm": 7.868643283843994, + "learning_rate": 3.827450104259756e-05, + "loss": 0.6222, + "num_input_tokens_seen": 5967576, + "step": 10280 + }, + { + "epoch": 1.5318736967530533, + "grad_norm": 7.838682174682617, + "learning_rate": 3.829311885612154e-05, + "loss": 0.682, + "num_input_tokens_seen": 5970360, + "step": 10285 + }, + { + "epoch": 1.5326184092940125, + "grad_norm": 11.470207214355469, + "learning_rate": 3.831173666964552e-05, + "loss": 0.5577, + "num_input_tokens_seen": 5973016, + "step": 10290 + }, + { + "epoch": 1.5333631218349717, + "grad_norm": 12.377765655517578, + "learning_rate": 3.83303544831695e-05, + "loss": 0.6365, + "num_input_tokens_seen": 5976216, + "step": 10295 + }, + { + "epoch": 1.534107834375931, + "grad_norm": 6.279031276702881, + "learning_rate": 3.834897229669348e-05, + "loss": 0.7266, + "num_input_tokens_seen": 5979160, + "step": 10300 + }, + { + "epoch": 1.53485254691689, + "grad_norm": 7.66422176361084, + "learning_rate": 3.836759011021746e-05, + "loss": 0.5955, + "num_input_tokens_seen": 5982008, + "step": 10305 + }, + { + "epoch": 1.5355972594578493, + "grad_norm": 9.189286231994629, + "learning_rate": 3.838620792374144e-05, + "loss": 0.7614, + "num_input_tokens_seen": 5984920, + "step": 10310 + }, + { + "epoch": 1.5363419719988085, + "grad_norm": 3.527312994003296, + "learning_rate": 3.8404825737265416e-05, + "loss": 0.704, + "num_input_tokens_seen": 5987864, + "step": 10315 + }, + { + "epoch": 1.5370866845397675, + "grad_norm": 7.288948059082031, + "learning_rate": 3.8423443550789394e-05, + "loss": 0.5986, + "num_input_tokens_seen": 5990552, + "step": 10320 + }, + { + "epoch": 1.5378313970807267, + "grad_norm": 2.696115732192993, + "learning_rate": 3.844206136431337e-05, + "loss": 0.7003, + "num_input_tokens_seen": 5993592, + "step": 10325 + }, + { + "epoch": 1.538576109621686, + "grad_norm": 4.046533584594727, + "learning_rate": 3.846067917783736e-05, + "loss": 0.7556, + "num_input_tokens_seen": 5996536, + "step": 10330 + }, + { + "epoch": 1.539320822162645, + "grad_norm": 3.114466428756714, + "learning_rate": 3.8479296991361336e-05, + "loss": 0.6816, + "num_input_tokens_seen": 5999256, + "step": 10335 + }, + { + "epoch": 1.5400655347036043, + "grad_norm": 3.278179168701172, + "learning_rate": 3.8497914804885314e-05, + "loss": 0.5751, + "num_input_tokens_seen": 6002008, + "step": 10340 + }, + { + "epoch": 1.5408102472445635, + "grad_norm": 3.353858232498169, + "learning_rate": 3.851653261840929e-05, + "loss": 0.5278, + "num_input_tokens_seen": 6004696, + "step": 10345 + }, + { + "epoch": 1.5415549597855227, + "grad_norm": 10.93830394744873, + "learning_rate": 3.853515043193328e-05, + "loss": 0.6044, + "num_input_tokens_seen": 6007736, + "step": 10350 + }, + { + "epoch": 1.542299672326482, + "grad_norm": 8.589523315429688, + "learning_rate": 3.8553768245457255e-05, + "loss": 0.7318, + "num_input_tokens_seen": 6010712, + "step": 10355 + }, + { + "epoch": 1.543044384867441, + "grad_norm": 2.88433575630188, + "learning_rate": 3.8572386058981234e-05, + "loss": 0.7102, + "num_input_tokens_seen": 6013560, + "step": 10360 + }, + { + "epoch": 1.5437890974084003, + "grad_norm": 6.182340621948242, + "learning_rate": 3.859100387250522e-05, + "loss": 0.6169, + "num_input_tokens_seen": 6016312, + "step": 10365 + }, + { + "epoch": 1.5445338099493595, + "grad_norm": 6.877575397491455, + "learning_rate": 3.86096216860292e-05, + "loss": 0.5439, + "num_input_tokens_seen": 6019480, + "step": 10370 + }, + { + "epoch": 1.5452785224903187, + "grad_norm": 4.017301082611084, + "learning_rate": 3.8628239499553175e-05, + "loss": 0.5297, + "num_input_tokens_seen": 6022296, + "step": 10375 + }, + { + "epoch": 1.546023235031278, + "grad_norm": 5.725419998168945, + "learning_rate": 3.8646857313077153e-05, + "loss": 0.7242, + "num_input_tokens_seen": 6025208, + "step": 10380 + }, + { + "epoch": 1.5467679475722371, + "grad_norm": 4.616950988769531, + "learning_rate": 3.866547512660114e-05, + "loss": 0.555, + "num_input_tokens_seen": 6028248, + "step": 10385 + }, + { + "epoch": 1.5475126601131963, + "grad_norm": 6.620887279510498, + "learning_rate": 3.868409294012512e-05, + "loss": 0.5317, + "num_input_tokens_seen": 6031256, + "step": 10390 + }, + { + "epoch": 1.5482573726541555, + "grad_norm": 10.443967819213867, + "learning_rate": 3.870271075364909e-05, + "loss": 0.6625, + "num_input_tokens_seen": 6033944, + "step": 10395 + }, + { + "epoch": 1.5490020851951147, + "grad_norm": 11.745599746704102, + "learning_rate": 3.872132856717307e-05, + "loss": 0.5602, + "num_input_tokens_seen": 6036888, + "step": 10400 + }, + { + "epoch": 1.549746797736074, + "grad_norm": 10.70862865447998, + "learning_rate": 3.873994638069705e-05, + "loss": 0.721, + "num_input_tokens_seen": 6039832, + "step": 10405 + }, + { + "epoch": 1.5504915102770331, + "grad_norm": 3.4446897506713867, + "learning_rate": 3.875856419422103e-05, + "loss": 0.6255, + "num_input_tokens_seen": 6042776, + "step": 10410 + }, + { + "epoch": 1.5512362228179923, + "grad_norm": 5.571345806121826, + "learning_rate": 3.877718200774501e-05, + "loss": 0.7451, + "num_input_tokens_seen": 6045880, + "step": 10415 + }, + { + "epoch": 1.5519809353589515, + "grad_norm": 12.406134605407715, + "learning_rate": 3.879579982126899e-05, + "loss": 0.7439, + "num_input_tokens_seen": 6048920, + "step": 10420 + }, + { + "epoch": 1.5527256478999107, + "grad_norm": 3.8969240188598633, + "learning_rate": 3.881441763479297e-05, + "loss": 0.7052, + "num_input_tokens_seen": 6051832, + "step": 10425 + }, + { + "epoch": 1.55347036044087, + "grad_norm": 5.495765686035156, + "learning_rate": 3.883303544831695e-05, + "loss": 0.8645, + "num_input_tokens_seen": 6054744, + "step": 10430 + }, + { + "epoch": 1.5542150729818291, + "grad_norm": 6.057366371154785, + "learning_rate": 3.8851653261840934e-05, + "loss": 0.552, + "num_input_tokens_seen": 6058072, + "step": 10435 + }, + { + "epoch": 1.5549597855227884, + "grad_norm": 5.368941307067871, + "learning_rate": 3.887027107536491e-05, + "loss": 0.5832, + "num_input_tokens_seen": 6060984, + "step": 10440 + }, + { + "epoch": 1.5557044980637476, + "grad_norm": 14.177512168884277, + "learning_rate": 3.888888888888889e-05, + "loss": 0.6934, + "num_input_tokens_seen": 6063768, + "step": 10445 + }, + { + "epoch": 1.5564492106047065, + "grad_norm": 6.493015289306641, + "learning_rate": 3.890750670241287e-05, + "loss": 0.6577, + "num_input_tokens_seen": 6066584, + "step": 10450 + }, + { + "epoch": 1.5571939231456657, + "grad_norm": 4.240787029266357, + "learning_rate": 3.8926124515936854e-05, + "loss": 0.5478, + "num_input_tokens_seen": 6069592, + "step": 10455 + }, + { + "epoch": 1.557938635686625, + "grad_norm": 6.85515832901001, + "learning_rate": 3.894474232946083e-05, + "loss": 0.6634, + "num_input_tokens_seen": 6072344, + "step": 10460 + }, + { + "epoch": 1.5586833482275841, + "grad_norm": 5.265322685241699, + "learning_rate": 3.896336014298481e-05, + "loss": 0.557, + "num_input_tokens_seen": 6075256, + "step": 10465 + }, + { + "epoch": 1.5594280607685433, + "grad_norm": 13.24741268157959, + "learning_rate": 3.898197795650879e-05, + "loss": 0.7302, + "num_input_tokens_seen": 6078168, + "step": 10470 + }, + { + "epoch": 1.5601727733095025, + "grad_norm": 2.5506629943847656, + "learning_rate": 3.9000595770032774e-05, + "loss": 0.4815, + "num_input_tokens_seen": 6080824, + "step": 10475 + }, + { + "epoch": 1.5609174858504618, + "grad_norm": 5.040524959564209, + "learning_rate": 3.9019213583556745e-05, + "loss": 0.6964, + "num_input_tokens_seen": 6083480, + "step": 10480 + }, + { + "epoch": 1.561662198391421, + "grad_norm": 3.5798137187957764, + "learning_rate": 3.903783139708072e-05, + "loss": 0.6654, + "num_input_tokens_seen": 6086360, + "step": 10485 + }, + { + "epoch": 1.5624069109323802, + "grad_norm": 6.320927619934082, + "learning_rate": 3.905644921060471e-05, + "loss": 0.6707, + "num_input_tokens_seen": 6089240, + "step": 10490 + }, + { + "epoch": 1.5631516234733391, + "grad_norm": 5.070636749267578, + "learning_rate": 3.907506702412869e-05, + "loss": 0.6013, + "num_input_tokens_seen": 6092280, + "step": 10495 + }, + { + "epoch": 1.5638963360142983, + "grad_norm": 6.057427883148193, + "learning_rate": 3.9093684837652665e-05, + "loss": 0.5174, + "num_input_tokens_seen": 6095064, + "step": 10500 + }, + { + "epoch": 1.5646410485552575, + "grad_norm": 5.553429126739502, + "learning_rate": 3.911230265117665e-05, + "loss": 0.5016, + "num_input_tokens_seen": 6097880, + "step": 10505 + }, + { + "epoch": 1.5653857610962167, + "grad_norm": 8.807517051696777, + "learning_rate": 3.913092046470063e-05, + "loss": 0.8298, + "num_input_tokens_seen": 6100984, + "step": 10510 + }, + { + "epoch": 1.566130473637176, + "grad_norm": 17.879384994506836, + "learning_rate": 3.9149538278224606e-05, + "loss": 0.7743, + "num_input_tokens_seen": 6103928, + "step": 10515 + }, + { + "epoch": 1.5668751861781351, + "grad_norm": 3.4658451080322266, + "learning_rate": 3.9168156091748585e-05, + "loss": 0.8042, + "num_input_tokens_seen": 6107000, + "step": 10520 + }, + { + "epoch": 1.5676198987190944, + "grad_norm": 6.948915004730225, + "learning_rate": 3.918677390527257e-05, + "loss": 0.6477, + "num_input_tokens_seen": 6110744, + "step": 10525 + }, + { + "epoch": 1.5683646112600536, + "grad_norm": 3.8712363243103027, + "learning_rate": 3.920539171879655e-05, + "loss": 0.7199, + "num_input_tokens_seen": 6113848, + "step": 10530 + }, + { + "epoch": 1.5691093238010128, + "grad_norm": 2.7821059226989746, + "learning_rate": 3.9224009532320526e-05, + "loss": 0.5353, + "num_input_tokens_seen": 6116856, + "step": 10535 + }, + { + "epoch": 1.569854036341972, + "grad_norm": 4.309003829956055, + "learning_rate": 3.9242627345844504e-05, + "loss": 0.5533, + "num_input_tokens_seen": 6119416, + "step": 10540 + }, + { + "epoch": 1.5705987488829312, + "grad_norm": 3.5349323749542236, + "learning_rate": 3.926124515936849e-05, + "loss": 0.5626, + "num_input_tokens_seen": 6122168, + "step": 10545 + }, + { + "epoch": 1.5713434614238904, + "grad_norm": 4.851032733917236, + "learning_rate": 3.927986297289247e-05, + "loss": 0.6165, + "num_input_tokens_seen": 6125144, + "step": 10550 + }, + { + "epoch": 1.5720881739648496, + "grad_norm": 5.213372230529785, + "learning_rate": 3.9298480786416446e-05, + "loss": 0.723, + "num_input_tokens_seen": 6128344, + "step": 10555 + }, + { + "epoch": 1.5728328865058088, + "grad_norm": 9.853998184204102, + "learning_rate": 3.9317098599940424e-05, + "loss": 0.8683, + "num_input_tokens_seen": 6131384, + "step": 10560 + }, + { + "epoch": 1.573577599046768, + "grad_norm": 8.151308059692383, + "learning_rate": 3.933571641346441e-05, + "loss": 0.7062, + "num_input_tokens_seen": 6134648, + "step": 10565 + }, + { + "epoch": 1.5743223115877272, + "grad_norm": 5.0329179763793945, + "learning_rate": 3.935433422698838e-05, + "loss": 0.6412, + "num_input_tokens_seen": 6137464, + "step": 10570 + }, + { + "epoch": 1.5750670241286864, + "grad_norm": 7.533470153808594, + "learning_rate": 3.937295204051236e-05, + "loss": 0.6472, + "num_input_tokens_seen": 6140408, + "step": 10575 + }, + { + "epoch": 1.5758117366696456, + "grad_norm": 12.19218635559082, + "learning_rate": 3.9391569854036344e-05, + "loss": 0.5869, + "num_input_tokens_seen": 6143160, + "step": 10580 + }, + { + "epoch": 1.5765564492106048, + "grad_norm": 5.842182636260986, + "learning_rate": 3.941018766756032e-05, + "loss": 0.6638, + "num_input_tokens_seen": 6145944, + "step": 10585 + }, + { + "epoch": 1.577301161751564, + "grad_norm": 7.888777732849121, + "learning_rate": 3.94288054810843e-05, + "loss": 0.5164, + "num_input_tokens_seen": 6148888, + "step": 10590 + }, + { + "epoch": 1.5780458742925232, + "grad_norm": 5.612227916717529, + "learning_rate": 3.9447423294608285e-05, + "loss": 0.5806, + "num_input_tokens_seen": 6151672, + "step": 10595 + }, + { + "epoch": 1.5787905868334824, + "grad_norm": 6.5857367515563965, + "learning_rate": 3.946604110813226e-05, + "loss": 0.5344, + "num_input_tokens_seen": 6154520, + "step": 10600 + }, + { + "epoch": 1.5795352993744416, + "grad_norm": 7.528450012207031, + "learning_rate": 3.948465892165624e-05, + "loss": 0.6704, + "num_input_tokens_seen": 6157400, + "step": 10605 + }, + { + "epoch": 1.5802800119154008, + "grad_norm": 7.305152893066406, + "learning_rate": 3.950327673518022e-05, + "loss": 0.7501, + "num_input_tokens_seen": 6160120, + "step": 10610 + }, + { + "epoch": 1.58102472445636, + "grad_norm": 3.772463083267212, + "learning_rate": 3.9521894548704205e-05, + "loss": 0.4234, + "num_input_tokens_seen": 6163032, + "step": 10615 + }, + { + "epoch": 1.5817694369973192, + "grad_norm": 7.26778507232666, + "learning_rate": 3.954051236222818e-05, + "loss": 0.7693, + "num_input_tokens_seen": 6165816, + "step": 10620 + }, + { + "epoch": 1.5825141495382782, + "grad_norm": 7.525229454040527, + "learning_rate": 3.955913017575216e-05, + "loss": 0.6653, + "num_input_tokens_seen": 6168664, + "step": 10625 + }, + { + "epoch": 1.5832588620792374, + "grad_norm": 2.6522114276885986, + "learning_rate": 3.957774798927614e-05, + "loss": 0.5884, + "num_input_tokens_seen": 6171704, + "step": 10630 + }, + { + "epoch": 1.5840035746201966, + "grad_norm": 3.7270348072052, + "learning_rate": 3.9596365802800125e-05, + "loss": 0.5597, + "num_input_tokens_seen": 6174392, + "step": 10635 + }, + { + "epoch": 1.5847482871611558, + "grad_norm": 3.2861826419830322, + "learning_rate": 3.96149836163241e-05, + "loss": 0.6987, + "num_input_tokens_seen": 6177176, + "step": 10640 + }, + { + "epoch": 1.585492999702115, + "grad_norm": 4.837691307067871, + "learning_rate": 3.963360142984808e-05, + "loss": 0.7625, + "num_input_tokens_seen": 6180280, + "step": 10645 + }, + { + "epoch": 1.5862377122430742, + "grad_norm": 3.788445234298706, + "learning_rate": 3.9652219243372066e-05, + "loss": 0.6859, + "num_input_tokens_seen": 6183576, + "step": 10650 + }, + { + "epoch": 1.5869824247840334, + "grad_norm": 5.614121913909912, + "learning_rate": 3.967083705689604e-05, + "loss": 0.7317, + "num_input_tokens_seen": 6186904, + "step": 10655 + }, + { + "epoch": 1.5877271373249926, + "grad_norm": 5.829448699951172, + "learning_rate": 3.9689454870420016e-05, + "loss": 0.6538, + "num_input_tokens_seen": 6189720, + "step": 10660 + }, + { + "epoch": 1.5884718498659516, + "grad_norm": 3.8214826583862305, + "learning_rate": 3.9708072683944e-05, + "loss": 0.6125, + "num_input_tokens_seen": 6192792, + "step": 10665 + }, + { + "epoch": 1.5892165624069108, + "grad_norm": 5.913989067077637, + "learning_rate": 3.972669049746798e-05, + "loss": 0.7301, + "num_input_tokens_seen": 6196184, + "step": 10670 + }, + { + "epoch": 1.58996127494787, + "grad_norm": 7.012304782867432, + "learning_rate": 3.974530831099196e-05, + "loss": 0.703, + "num_input_tokens_seen": 6198968, + "step": 10675 + }, + { + "epoch": 1.5907059874888292, + "grad_norm": 4.406062602996826, + "learning_rate": 3.9763926124515935e-05, + "loss": 0.6769, + "num_input_tokens_seen": 6202040, + "step": 10680 + }, + { + "epoch": 1.5914507000297884, + "grad_norm": 4.173473358154297, + "learning_rate": 3.978254393803992e-05, + "loss": 0.6659, + "num_input_tokens_seen": 6204952, + "step": 10685 + }, + { + "epoch": 1.5921954125707476, + "grad_norm": 3.710676908493042, + "learning_rate": 3.98011617515639e-05, + "loss": 0.5957, + "num_input_tokens_seen": 6207800, + "step": 10690 + }, + { + "epoch": 1.5929401251117068, + "grad_norm": 3.3880910873413086, + "learning_rate": 3.981977956508788e-05, + "loss": 0.5686, + "num_input_tokens_seen": 6210424, + "step": 10695 + }, + { + "epoch": 1.593684837652666, + "grad_norm": 6.411845684051514, + "learning_rate": 3.9838397378611855e-05, + "loss": 0.7068, + "num_input_tokens_seen": 6213080, + "step": 10700 + }, + { + "epoch": 1.5944295501936252, + "grad_norm": 3.3384640216827393, + "learning_rate": 3.985701519213584e-05, + "loss": 0.7034, + "num_input_tokens_seen": 6216088, + "step": 10705 + }, + { + "epoch": 1.5951742627345844, + "grad_norm": 6.738797664642334, + "learning_rate": 3.987563300565982e-05, + "loss": 0.4797, + "num_input_tokens_seen": 6219160, + "step": 10710 + }, + { + "epoch": 1.5959189752755436, + "grad_norm": 5.886651039123535, + "learning_rate": 3.98942508191838e-05, + "loss": 0.6926, + "num_input_tokens_seen": 6222424, + "step": 10715 + }, + { + "epoch": 1.5966636878165028, + "grad_norm": 8.140632629394531, + "learning_rate": 3.991286863270778e-05, + "loss": 0.6792, + "num_input_tokens_seen": 6225400, + "step": 10720 + }, + { + "epoch": 1.597408400357462, + "grad_norm": 5.238716125488281, + "learning_rate": 3.993148644623176e-05, + "loss": 0.5701, + "num_input_tokens_seen": 6227768, + "step": 10725 + }, + { + "epoch": 1.5981531128984212, + "grad_norm": 5.019039154052734, + "learning_rate": 3.995010425975574e-05, + "loss": 0.4174, + "num_input_tokens_seen": 6231000, + "step": 10730 + }, + { + "epoch": 1.5988978254393804, + "grad_norm": 8.43000602722168, + "learning_rate": 3.9968722073279716e-05, + "loss": 0.5862, + "num_input_tokens_seen": 6233656, + "step": 10735 + }, + { + "epoch": 1.5996425379803396, + "grad_norm": 8.544368743896484, + "learning_rate": 3.9987339886803695e-05, + "loss": 0.836, + "num_input_tokens_seen": 6236344, + "step": 10740 + }, + { + "epoch": 1.6003872505212988, + "grad_norm": 6.108889102935791, + "learning_rate": 4.000595770032767e-05, + "loss": 0.7345, + "num_input_tokens_seen": 6239416, + "step": 10745 + }, + { + "epoch": 1.601131963062258, + "grad_norm": 7.868058204650879, + "learning_rate": 4.002457551385165e-05, + "loss": 0.5441, + "num_input_tokens_seen": 6242520, + "step": 10750 + }, + { + "epoch": 1.6018766756032172, + "grad_norm": 4.374707221984863, + "learning_rate": 4.0043193327375636e-05, + "loss": 0.7222, + "num_input_tokens_seen": 6245336, + "step": 10755 + }, + { + "epoch": 1.6026213881441764, + "grad_norm": 5.640923023223877, + "learning_rate": 4.0061811140899614e-05, + "loss": 0.5296, + "num_input_tokens_seen": 6247928, + "step": 10760 + }, + { + "epoch": 1.6033661006851356, + "grad_norm": 9.647085189819336, + "learning_rate": 4.008042895442359e-05, + "loss": 0.6528, + "num_input_tokens_seen": 6250648, + "step": 10765 + }, + { + "epoch": 1.6041108132260948, + "grad_norm": 9.639547348022461, + "learning_rate": 4.009904676794757e-05, + "loss": 0.5192, + "num_input_tokens_seen": 6253368, + "step": 10770 + }, + { + "epoch": 1.604855525767054, + "grad_norm": 6.5172882080078125, + "learning_rate": 4.0117664581471556e-05, + "loss": 0.8, + "num_input_tokens_seen": 6256024, + "step": 10775 + }, + { + "epoch": 1.6056002383080132, + "grad_norm": 5.888789653778076, + "learning_rate": 4.0136282394995534e-05, + "loss": 0.5711, + "num_input_tokens_seen": 6258680, + "step": 10780 + }, + { + "epoch": 1.6063449508489724, + "grad_norm": 6.151035308837891, + "learning_rate": 4.015490020851951e-05, + "loss": 0.6806, + "num_input_tokens_seen": 6261432, + "step": 10785 + }, + { + "epoch": 1.6070896633899316, + "grad_norm": 6.610717296600342, + "learning_rate": 4.017351802204349e-05, + "loss": 0.8773, + "num_input_tokens_seen": 6264440, + "step": 10790 + }, + { + "epoch": 1.6078343759308906, + "grad_norm": 3.819368839263916, + "learning_rate": 4.0192135835567475e-05, + "loss": 0.5942, + "num_input_tokens_seen": 6267000, + "step": 10795 + }, + { + "epoch": 1.6085790884718498, + "grad_norm": 4.290466785430908, + "learning_rate": 4.0210753649091454e-05, + "loss": 0.679, + "num_input_tokens_seen": 6269432, + "step": 10800 + }, + { + "epoch": 1.609323801012809, + "grad_norm": 5.190969944000244, + "learning_rate": 4.022937146261543e-05, + "loss": 0.7192, + "num_input_tokens_seen": 6272376, + "step": 10805 + }, + { + "epoch": 1.6100685135537682, + "grad_norm": 3.395113229751587, + "learning_rate": 4.024798927613942e-05, + "loss": 0.6624, + "num_input_tokens_seen": 6275480, + "step": 10810 + }, + { + "epoch": 1.6108132260947274, + "grad_norm": 3.0460205078125, + "learning_rate": 4.0266607089663395e-05, + "loss": 0.6948, + "num_input_tokens_seen": 6278392, + "step": 10815 + }, + { + "epoch": 1.6115579386356866, + "grad_norm": 6.148211479187012, + "learning_rate": 4.028522490318737e-05, + "loss": 0.5321, + "num_input_tokens_seen": 6281400, + "step": 10820 + }, + { + "epoch": 1.6123026511766458, + "grad_norm": 3.2014594078063965, + "learning_rate": 4.030384271671135e-05, + "loss": 0.6951, + "num_input_tokens_seen": 6284248, + "step": 10825 + }, + { + "epoch": 1.613047363717605, + "grad_norm": 5.276370525360107, + "learning_rate": 4.032246053023533e-05, + "loss": 0.5391, + "num_input_tokens_seen": 6287288, + "step": 10830 + }, + { + "epoch": 1.6137920762585642, + "grad_norm": 3.3304660320281982, + "learning_rate": 4.034107834375931e-05, + "loss": 0.6081, + "num_input_tokens_seen": 6289912, + "step": 10835 + }, + { + "epoch": 1.6145367887995232, + "grad_norm": 6.638852119445801, + "learning_rate": 4.0359696157283286e-05, + "loss": 0.6589, + "num_input_tokens_seen": 6292728, + "step": 10840 + }, + { + "epoch": 1.6152815013404824, + "grad_norm": 9.930513381958008, + "learning_rate": 4.037831397080727e-05, + "loss": 0.6736, + "num_input_tokens_seen": 6295768, + "step": 10845 + }, + { + "epoch": 1.6160262138814416, + "grad_norm": 2.9822216033935547, + "learning_rate": 4.039693178433125e-05, + "loss": 0.6355, + "num_input_tokens_seen": 6298520, + "step": 10850 + }, + { + "epoch": 1.6167709264224008, + "grad_norm": 7.2581024169921875, + "learning_rate": 4.041554959785523e-05, + "loss": 0.7978, + "num_input_tokens_seen": 6301496, + "step": 10855 + }, + { + "epoch": 1.61751563896336, + "grad_norm": 9.220450401306152, + "learning_rate": 4.0434167411379206e-05, + "loss": 0.5084, + "num_input_tokens_seen": 6304248, + "step": 10860 + }, + { + "epoch": 1.6182603515043192, + "grad_norm": 2.8708016872406006, + "learning_rate": 4.045278522490319e-05, + "loss": 0.6492, + "num_input_tokens_seen": 6307000, + "step": 10865 + }, + { + "epoch": 1.6190050640452784, + "grad_norm": 7.276669025421143, + "learning_rate": 4.047140303842717e-05, + "loss": 0.7041, + "num_input_tokens_seen": 6309912, + "step": 10870 + }, + { + "epoch": 1.6197497765862376, + "grad_norm": 2.7028985023498535, + "learning_rate": 4.049002085195115e-05, + "loss": 0.582, + "num_input_tokens_seen": 6312632, + "step": 10875 + }, + { + "epoch": 1.6204944891271968, + "grad_norm": 10.193405151367188, + "learning_rate": 4.050863866547513e-05, + "loss": 0.6729, + "num_input_tokens_seen": 6315352, + "step": 10880 + }, + { + "epoch": 1.621239201668156, + "grad_norm": 7.917550086975098, + "learning_rate": 4.052725647899911e-05, + "loss": 0.5797, + "num_input_tokens_seen": 6317944, + "step": 10885 + }, + { + "epoch": 1.6219839142091153, + "grad_norm": 12.120770454406738, + "learning_rate": 4.054587429252309e-05, + "loss": 0.5354, + "num_input_tokens_seen": 6321016, + "step": 10890 + }, + { + "epoch": 1.6227286267500745, + "grad_norm": 11.918651580810547, + "learning_rate": 4.056449210604707e-05, + "loss": 0.7018, + "num_input_tokens_seen": 6323768, + "step": 10895 + }, + { + "epoch": 1.6234733392910337, + "grad_norm": 4.836981773376465, + "learning_rate": 4.058310991957105e-05, + "loss": 0.5253, + "num_input_tokens_seen": 6326584, + "step": 10900 + }, + { + "epoch": 1.6242180518319929, + "grad_norm": 4.297863960266113, + "learning_rate": 4.060172773309503e-05, + "loss": 0.5514, + "num_input_tokens_seen": 6329496, + "step": 10905 + }, + { + "epoch": 1.624962764372952, + "grad_norm": 6.480803489685059, + "learning_rate": 4.062034554661901e-05, + "loss": 0.6033, + "num_input_tokens_seen": 6332248, + "step": 10910 + }, + { + "epoch": 1.6257074769139113, + "grad_norm": 14.109598159790039, + "learning_rate": 4.063896336014299e-05, + "loss": 0.8242, + "num_input_tokens_seen": 6334904, + "step": 10915 + }, + { + "epoch": 1.6264521894548705, + "grad_norm": 12.098882675170898, + "learning_rate": 4.0657581173666965e-05, + "loss": 0.7649, + "num_input_tokens_seen": 6337592, + "step": 10920 + }, + { + "epoch": 1.6271969019958297, + "grad_norm": 4.677970886230469, + "learning_rate": 4.067619898719094e-05, + "loss": 0.7761, + "num_input_tokens_seen": 6340504, + "step": 10925 + }, + { + "epoch": 1.6279416145367889, + "grad_norm": 5.904734134674072, + "learning_rate": 4.069481680071492e-05, + "loss": 0.5989, + "num_input_tokens_seen": 6343352, + "step": 10930 + }, + { + "epoch": 1.628686327077748, + "grad_norm": 5.146734237670898, + "learning_rate": 4.0713434614238907e-05, + "loss": 0.5961, + "num_input_tokens_seen": 6346840, + "step": 10935 + }, + { + "epoch": 1.6294310396187073, + "grad_norm": 7.25426721572876, + "learning_rate": 4.0732052427762885e-05, + "loss": 0.7352, + "num_input_tokens_seen": 6349752, + "step": 10940 + }, + { + "epoch": 1.6301757521596665, + "grad_norm": 8.84180736541748, + "learning_rate": 4.075067024128686e-05, + "loss": 0.6497, + "num_input_tokens_seen": 6352728, + "step": 10945 + }, + { + "epoch": 1.6309204647006257, + "grad_norm": 5.980107307434082, + "learning_rate": 4.076928805481084e-05, + "loss": 0.6409, + "num_input_tokens_seen": 6355896, + "step": 10950 + }, + { + "epoch": 1.6316651772415849, + "grad_norm": 5.760055065155029, + "learning_rate": 4.0787905868334826e-05, + "loss": 0.5039, + "num_input_tokens_seen": 6358552, + "step": 10955 + }, + { + "epoch": 1.632409889782544, + "grad_norm": 3.296520233154297, + "learning_rate": 4.0806523681858805e-05, + "loss": 0.6814, + "num_input_tokens_seen": 6361560, + "step": 10960 + }, + { + "epoch": 1.6331546023235033, + "grad_norm": 3.8105947971343994, + "learning_rate": 4.082514149538278e-05, + "loss": 0.7011, + "num_input_tokens_seen": 6364696, + "step": 10965 + }, + { + "epoch": 1.6338993148644623, + "grad_norm": 3.218899965286255, + "learning_rate": 4.084375930890677e-05, + "loss": 0.5756, + "num_input_tokens_seen": 6367736, + "step": 10970 + }, + { + "epoch": 1.6346440274054215, + "grad_norm": 5.067338466644287, + "learning_rate": 4.0862377122430746e-05, + "loss": 0.66, + "num_input_tokens_seen": 6370648, + "step": 10975 + }, + { + "epoch": 1.6353887399463807, + "grad_norm": 4.012572765350342, + "learning_rate": 4.0880994935954724e-05, + "loss": 0.7463, + "num_input_tokens_seen": 6373592, + "step": 10980 + }, + { + "epoch": 1.6361334524873399, + "grad_norm": 6.117007732391357, + "learning_rate": 4.08996127494787e-05, + "loss": 0.4977, + "num_input_tokens_seen": 6376408, + "step": 10985 + }, + { + "epoch": 1.636878165028299, + "grad_norm": 3.793076753616333, + "learning_rate": 4.091823056300269e-05, + "loss": 0.816, + "num_input_tokens_seen": 6379384, + "step": 10990 + }, + { + "epoch": 1.6376228775692583, + "grad_norm": 3.1661183834075928, + "learning_rate": 4.0936848376526666e-05, + "loss": 0.6937, + "num_input_tokens_seen": 6382392, + "step": 10995 + }, + { + "epoch": 1.6383675901102175, + "grad_norm": 3.9707438945770264, + "learning_rate": 4.095546619005064e-05, + "loss": 0.66, + "num_input_tokens_seen": 6385272, + "step": 11000 + }, + { + "epoch": 1.6391123026511767, + "grad_norm": 5.049508571624756, + "learning_rate": 4.097408400357462e-05, + "loss": 0.6352, + "num_input_tokens_seen": 6388184, + "step": 11005 + }, + { + "epoch": 1.6398570151921357, + "grad_norm": 3.8871819972991943, + "learning_rate": 4.09927018170986e-05, + "loss": 0.5703, + "num_input_tokens_seen": 6391256, + "step": 11010 + }, + { + "epoch": 1.6406017277330949, + "grad_norm": 5.091839790344238, + "learning_rate": 4.101131963062258e-05, + "loss": 0.6677, + "num_input_tokens_seen": 6393688, + "step": 11015 + }, + { + "epoch": 1.641346440274054, + "grad_norm": 7.144129753112793, + "learning_rate": 4.102993744414656e-05, + "loss": 0.7252, + "num_input_tokens_seen": 6396728, + "step": 11020 + }, + { + "epoch": 1.6420911528150133, + "grad_norm": 5.905922889709473, + "learning_rate": 4.104855525767054e-05, + "loss": 0.5989, + "num_input_tokens_seen": 6399576, + "step": 11025 + }, + { + "epoch": 1.6428358653559725, + "grad_norm": 4.271267890930176, + "learning_rate": 4.106717307119452e-05, + "loss": 0.6991, + "num_input_tokens_seen": 6402680, + "step": 11030 + }, + { + "epoch": 1.6435805778969317, + "grad_norm": 4.052227973937988, + "learning_rate": 4.10857908847185e-05, + "loss": 0.6507, + "num_input_tokens_seen": 6405688, + "step": 11035 + }, + { + "epoch": 1.6443252904378909, + "grad_norm": 7.678982257843018, + "learning_rate": 4.110440869824248e-05, + "loss": 0.7553, + "num_input_tokens_seen": 6408408, + "step": 11040 + }, + { + "epoch": 1.64507000297885, + "grad_norm": 7.49730110168457, + "learning_rate": 4.112302651176646e-05, + "loss": 0.6968, + "num_input_tokens_seen": 6411032, + "step": 11045 + }, + { + "epoch": 1.6458147155198093, + "grad_norm": 5.04691219329834, + "learning_rate": 4.114164432529044e-05, + "loss": 0.7724, + "num_input_tokens_seen": 6413944, + "step": 11050 + }, + { + "epoch": 1.6465594280607685, + "grad_norm": 3.4965617656707764, + "learning_rate": 4.116026213881442e-05, + "loss": 0.6274, + "num_input_tokens_seen": 6417048, + "step": 11055 + }, + { + "epoch": 1.6473041406017277, + "grad_norm": 4.881919860839844, + "learning_rate": 4.11788799523384e-05, + "loss": 0.645, + "num_input_tokens_seen": 6419960, + "step": 11060 + }, + { + "epoch": 1.648048853142687, + "grad_norm": 5.216404914855957, + "learning_rate": 4.119749776586238e-05, + "loss": 0.6133, + "num_input_tokens_seen": 6422584, + "step": 11065 + }, + { + "epoch": 1.648793565683646, + "grad_norm": 5.4122233390808105, + "learning_rate": 4.121611557938636e-05, + "loss": 0.5714, + "num_input_tokens_seen": 6425624, + "step": 11070 + }, + { + "epoch": 1.6495382782246053, + "grad_norm": 3.167827844619751, + "learning_rate": 4.123473339291034e-05, + "loss": 0.5796, + "num_input_tokens_seen": 6428312, + "step": 11075 + }, + { + "epoch": 1.6502829907655645, + "grad_norm": 8.817327499389648, + "learning_rate": 4.125335120643432e-05, + "loss": 0.413, + "num_input_tokens_seen": 6431064, + "step": 11080 + }, + { + "epoch": 1.6510277033065237, + "grad_norm": 6.312938690185547, + "learning_rate": 4.1271969019958294e-05, + "loss": 0.8061, + "num_input_tokens_seen": 6433976, + "step": 11085 + }, + { + "epoch": 1.651772415847483, + "grad_norm": 8.791894912719727, + "learning_rate": 4.129058683348227e-05, + "loss": 0.6556, + "num_input_tokens_seen": 6437112, + "step": 11090 + }, + { + "epoch": 1.6525171283884421, + "grad_norm": 3.914421558380127, + "learning_rate": 4.130920464700626e-05, + "loss": 0.5892, + "num_input_tokens_seen": 6439928, + "step": 11095 + }, + { + "epoch": 1.6532618409294013, + "grad_norm": 6.670164585113525, + "learning_rate": 4.1327822460530236e-05, + "loss": 0.6495, + "num_input_tokens_seen": 6442872, + "step": 11100 + }, + { + "epoch": 1.6540065534703605, + "grad_norm": 2.9864501953125, + "learning_rate": 4.1346440274054214e-05, + "loss": 0.433, + "num_input_tokens_seen": 6445464, + "step": 11105 + }, + { + "epoch": 1.6547512660113197, + "grad_norm": 9.847481727600098, + "learning_rate": 4.13650580875782e-05, + "loss": 0.6382, + "num_input_tokens_seen": 6448312, + "step": 11110 + }, + { + "epoch": 1.655495978552279, + "grad_norm": 8.25719165802002, + "learning_rate": 4.138367590110218e-05, + "loss": 0.7872, + "num_input_tokens_seen": 6450936, + "step": 11115 + }, + { + "epoch": 1.6562406910932381, + "grad_norm": 2.936764717102051, + "learning_rate": 4.1402293714626155e-05, + "loss": 0.6599, + "num_input_tokens_seen": 6453848, + "step": 11120 + }, + { + "epoch": 1.6569854036341973, + "grad_norm": 7.9789018630981445, + "learning_rate": 4.1420911528150134e-05, + "loss": 0.6172, + "num_input_tokens_seen": 6456920, + "step": 11125 + }, + { + "epoch": 1.6577301161751565, + "grad_norm": 3.9750730991363525, + "learning_rate": 4.143952934167412e-05, + "loss": 0.6984, + "num_input_tokens_seen": 6459768, + "step": 11130 + }, + { + "epoch": 1.6584748287161157, + "grad_norm": 6.2208781242370605, + "learning_rate": 4.14581471551981e-05, + "loss": 0.6569, + "num_input_tokens_seen": 6462616, + "step": 11135 + }, + { + "epoch": 1.6592195412570747, + "grad_norm": 4.182359218597412, + "learning_rate": 4.1476764968722075e-05, + "loss": 0.5578, + "num_input_tokens_seen": 6465592, + "step": 11140 + }, + { + "epoch": 1.659964253798034, + "grad_norm": 4.944649696350098, + "learning_rate": 4.149538278224605e-05, + "loss": 0.6044, + "num_input_tokens_seen": 6468504, + "step": 11145 + }, + { + "epoch": 1.6607089663389931, + "grad_norm": 4.1209282875061035, + "learning_rate": 4.151400059577004e-05, + "loss": 0.5943, + "num_input_tokens_seen": 6471320, + "step": 11150 + }, + { + "epoch": 1.6614536788799523, + "grad_norm": 5.70539665222168, + "learning_rate": 4.1532618409294017e-05, + "loss": 0.4895, + "num_input_tokens_seen": 6474136, + "step": 11155 + }, + { + "epoch": 1.6621983914209115, + "grad_norm": 3.746163845062256, + "learning_rate": 4.1551236222817995e-05, + "loss": 0.5522, + "num_input_tokens_seen": 6476984, + "step": 11160 + }, + { + "epoch": 1.6629431039618707, + "grad_norm": 7.055505275726318, + "learning_rate": 4.156985403634197e-05, + "loss": 0.6902, + "num_input_tokens_seen": 6479864, + "step": 11165 + }, + { + "epoch": 1.66368781650283, + "grad_norm": 7.848091125488281, + "learning_rate": 4.158847184986596e-05, + "loss": 0.7386, + "num_input_tokens_seen": 6482680, + "step": 11170 + }, + { + "epoch": 1.6644325290437891, + "grad_norm": 15.895784378051758, + "learning_rate": 4.160708966338993e-05, + "loss": 0.7614, + "num_input_tokens_seen": 6485336, + "step": 11175 + }, + { + "epoch": 1.6651772415847483, + "grad_norm": 8.035216331481934, + "learning_rate": 4.162570747691391e-05, + "loss": 0.6207, + "num_input_tokens_seen": 6488280, + "step": 11180 + }, + { + "epoch": 1.6659219541257073, + "grad_norm": 4.424757957458496, + "learning_rate": 4.164432529043789e-05, + "loss": 0.4986, + "num_input_tokens_seen": 6491128, + "step": 11185 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 9.555779457092285, + "learning_rate": 4.166294310396187e-05, + "loss": 0.6003, + "num_input_tokens_seen": 6494136, + "step": 11190 + }, + { + "epoch": 1.6674113792076257, + "grad_norm": 8.188239097595215, + "learning_rate": 4.168156091748585e-05, + "loss": 0.5148, + "num_input_tokens_seen": 6496824, + "step": 11195 + }, + { + "epoch": 1.668156091748585, + "grad_norm": 4.624139785766602, + "learning_rate": 4.1700178731009834e-05, + "loss": 0.6517, + "num_input_tokens_seen": 6499768, + "step": 11200 + }, + { + "epoch": 1.6689008042895441, + "grad_norm": 4.888116359710693, + "learning_rate": 4.171879654453381e-05, + "loss": 0.6227, + "num_input_tokens_seen": 6502712, + "step": 11205 + }, + { + "epoch": 1.6696455168305033, + "grad_norm": 4.140107154846191, + "learning_rate": 4.173741435805779e-05, + "loss": 0.6435, + "num_input_tokens_seen": 6505464, + "step": 11210 + }, + { + "epoch": 1.6703902293714625, + "grad_norm": 5.382077693939209, + "learning_rate": 4.175603217158177e-05, + "loss": 0.6137, + "num_input_tokens_seen": 6508504, + "step": 11215 + }, + { + "epoch": 1.6711349419124217, + "grad_norm": 9.076622009277344, + "learning_rate": 4.1774649985105754e-05, + "loss": 0.6034, + "num_input_tokens_seen": 6511320, + "step": 11220 + }, + { + "epoch": 1.671879654453381, + "grad_norm": 10.521240234375, + "learning_rate": 4.179326779862973e-05, + "loss": 0.7439, + "num_input_tokens_seen": 6514392, + "step": 11225 + }, + { + "epoch": 1.6726243669943401, + "grad_norm": 4.071663856506348, + "learning_rate": 4.181188561215371e-05, + "loss": 0.5499, + "num_input_tokens_seen": 6517752, + "step": 11230 + }, + { + "epoch": 1.6733690795352993, + "grad_norm": 6.7303667068481445, + "learning_rate": 4.183050342567769e-05, + "loss": 0.4729, + "num_input_tokens_seen": 6520568, + "step": 11235 + }, + { + "epoch": 1.6741137920762585, + "grad_norm": 6.434745788574219, + "learning_rate": 4.1849121239201674e-05, + "loss": 0.806, + "num_input_tokens_seen": 6523128, + "step": 11240 + }, + { + "epoch": 1.6748585046172177, + "grad_norm": 5.178336143493652, + "learning_rate": 4.186773905272565e-05, + "loss": 0.6795, + "num_input_tokens_seen": 6526232, + "step": 11245 + }, + { + "epoch": 1.675603217158177, + "grad_norm": 3.8067996501922607, + "learning_rate": 4.188635686624963e-05, + "loss": 0.6648, + "num_input_tokens_seen": 6528984, + "step": 11250 + }, + { + "epoch": 1.6763479296991362, + "grad_norm": 4.5469207763671875, + "learning_rate": 4.1904974679773615e-05, + "loss": 0.5728, + "num_input_tokens_seen": 6531896, + "step": 11255 + }, + { + "epoch": 1.6770926422400954, + "grad_norm": 5.795848369598389, + "learning_rate": 4.1923592493297587e-05, + "loss": 0.6634, + "num_input_tokens_seen": 6534552, + "step": 11260 + }, + { + "epoch": 1.6778373547810546, + "grad_norm": 5.61321496963501, + "learning_rate": 4.1942210306821565e-05, + "loss": 0.6231, + "num_input_tokens_seen": 6537400, + "step": 11265 + }, + { + "epoch": 1.6785820673220138, + "grad_norm": 7.724826812744141, + "learning_rate": 4.196082812034555e-05, + "loss": 0.6267, + "num_input_tokens_seen": 6540216, + "step": 11270 + }, + { + "epoch": 1.679326779862973, + "grad_norm": 5.028102397918701, + "learning_rate": 4.197944593386953e-05, + "loss": 0.6679, + "num_input_tokens_seen": 6542968, + "step": 11275 + }, + { + "epoch": 1.6800714924039322, + "grad_norm": 4.700109004974365, + "learning_rate": 4.1998063747393506e-05, + "loss": 0.4967, + "num_input_tokens_seen": 6545848, + "step": 11280 + }, + { + "epoch": 1.6808162049448914, + "grad_norm": 5.667383670806885, + "learning_rate": 4.2016681560917485e-05, + "loss": 0.5799, + "num_input_tokens_seen": 6548664, + "step": 11285 + }, + { + "epoch": 1.6815609174858506, + "grad_norm": 5.73134708404541, + "learning_rate": 4.203529937444147e-05, + "loss": 0.6004, + "num_input_tokens_seen": 6551416, + "step": 11290 + }, + { + "epoch": 1.6823056300268098, + "grad_norm": 8.319110870361328, + "learning_rate": 4.205391718796545e-05, + "loss": 0.5718, + "num_input_tokens_seen": 6554040, + "step": 11295 + }, + { + "epoch": 1.683050342567769, + "grad_norm": 1.8856945037841797, + "learning_rate": 4.2072535001489426e-05, + "loss": 0.4251, + "num_input_tokens_seen": 6557048, + "step": 11300 + }, + { + "epoch": 1.6837950551087282, + "grad_norm": 5.8517913818359375, + "learning_rate": 4.2091152815013404e-05, + "loss": 0.8074, + "num_input_tokens_seen": 6559992, + "step": 11305 + }, + { + "epoch": 1.6845397676496874, + "grad_norm": 4.555874824523926, + "learning_rate": 4.210977062853739e-05, + "loss": 0.5444, + "num_input_tokens_seen": 6562712, + "step": 11310 + }, + { + "epoch": 1.6852844801906464, + "grad_norm": 3.017550230026245, + "learning_rate": 4.212838844206137e-05, + "loss": 0.4423, + "num_input_tokens_seen": 6565688, + "step": 11315 + }, + { + "epoch": 1.6860291927316056, + "grad_norm": 2.9560019969940186, + "learning_rate": 4.2147006255585346e-05, + "loss": 0.6358, + "num_input_tokens_seen": 6568536, + "step": 11320 + }, + { + "epoch": 1.6867739052725648, + "grad_norm": 7.427292823791504, + "learning_rate": 4.216562406910933e-05, + "loss": 0.4922, + "num_input_tokens_seen": 6571448, + "step": 11325 + }, + { + "epoch": 1.687518617813524, + "grad_norm": 5.068542957305908, + "learning_rate": 4.218424188263331e-05, + "loss": 0.6296, + "num_input_tokens_seen": 6574616, + "step": 11330 + }, + { + "epoch": 1.6882633303544832, + "grad_norm": 10.133771896362305, + "learning_rate": 4.220285969615729e-05, + "loss": 0.7864, + "num_input_tokens_seen": 6577432, + "step": 11335 + }, + { + "epoch": 1.6890080428954424, + "grad_norm": 6.971652984619141, + "learning_rate": 4.2221477509681265e-05, + "loss": 0.5175, + "num_input_tokens_seen": 6580088, + "step": 11340 + }, + { + "epoch": 1.6897527554364016, + "grad_norm": 4.3712334632873535, + "learning_rate": 4.2240095323205244e-05, + "loss": 0.5828, + "num_input_tokens_seen": 6582648, + "step": 11345 + }, + { + "epoch": 1.6904974679773608, + "grad_norm": 7.396174430847168, + "learning_rate": 4.225871313672922e-05, + "loss": 0.7368, + "num_input_tokens_seen": 6585400, + "step": 11350 + }, + { + "epoch": 1.69124218051832, + "grad_norm": 10.75021743774414, + "learning_rate": 4.22773309502532e-05, + "loss": 0.796, + "num_input_tokens_seen": 6588472, + "step": 11355 + }, + { + "epoch": 1.691986893059279, + "grad_norm": 2.1212902069091797, + "learning_rate": 4.2295948763777185e-05, + "loss": 0.5809, + "num_input_tokens_seen": 6591192, + "step": 11360 + }, + { + "epoch": 1.6927316056002382, + "grad_norm": 2.526535749435425, + "learning_rate": 4.231456657730116e-05, + "loss": 0.6487, + "num_input_tokens_seen": 6593880, + "step": 11365 + }, + { + "epoch": 1.6934763181411974, + "grad_norm": 5.93652868270874, + "learning_rate": 4.233318439082514e-05, + "loss": 0.7475, + "num_input_tokens_seen": 6597016, + "step": 11370 + }, + { + "epoch": 1.6942210306821566, + "grad_norm": 3.7389309406280518, + "learning_rate": 4.235180220434912e-05, + "loss": 0.6102, + "num_input_tokens_seen": 6599896, + "step": 11375 + }, + { + "epoch": 1.6949657432231158, + "grad_norm": 6.722352027893066, + "learning_rate": 4.2370420017873105e-05, + "loss": 0.7287, + "num_input_tokens_seen": 6602808, + "step": 11380 + }, + { + "epoch": 1.695710455764075, + "grad_norm": 5.9816575050354, + "learning_rate": 4.238903783139708e-05, + "loss": 0.5434, + "num_input_tokens_seen": 6605624, + "step": 11385 + }, + { + "epoch": 1.6964551683050342, + "grad_norm": 3.4304070472717285, + "learning_rate": 4.240765564492106e-05, + "loss": 0.601, + "num_input_tokens_seen": 6608504, + "step": 11390 + }, + { + "epoch": 1.6971998808459934, + "grad_norm": 5.959695816040039, + "learning_rate": 4.242627345844504e-05, + "loss": 0.6328, + "num_input_tokens_seen": 6611128, + "step": 11395 + }, + { + "epoch": 1.6979445933869526, + "grad_norm": 6.25883150100708, + "learning_rate": 4.2444891271969025e-05, + "loss": 0.5363, + "num_input_tokens_seen": 6614168, + "step": 11400 + }, + { + "epoch": 1.6986893059279118, + "grad_norm": 4.508364677429199, + "learning_rate": 4.2463509085493e-05, + "loss": 0.5723, + "num_input_tokens_seen": 6617304, + "step": 11405 + }, + { + "epoch": 1.699434018468871, + "grad_norm": 9.657588005065918, + "learning_rate": 4.248212689901698e-05, + "loss": 0.4772, + "num_input_tokens_seen": 6620088, + "step": 11410 + }, + { + "epoch": 1.7001787310098302, + "grad_norm": 5.990655899047852, + "learning_rate": 4.2500744712540966e-05, + "loss": 0.7157, + "num_input_tokens_seen": 6622872, + "step": 11415 + }, + { + "epoch": 1.7009234435507894, + "grad_norm": 7.53114128112793, + "learning_rate": 4.2519362526064944e-05, + "loss": 0.6837, + "num_input_tokens_seen": 6625560, + "step": 11420 + }, + { + "epoch": 1.7016681560917486, + "grad_norm": 7.606106281280518, + "learning_rate": 4.253798033958892e-05, + "loss": 0.6065, + "num_input_tokens_seen": 6628376, + "step": 11425 + }, + { + "epoch": 1.7024128686327078, + "grad_norm": 3.9991068840026855, + "learning_rate": 4.25565981531129e-05, + "loss": 0.4935, + "num_input_tokens_seen": 6631096, + "step": 11430 + }, + { + "epoch": 1.703157581173667, + "grad_norm": 3.852703332901001, + "learning_rate": 4.257521596663688e-05, + "loss": 0.5313, + "num_input_tokens_seen": 6634040, + "step": 11435 + }, + { + "epoch": 1.7039022937146262, + "grad_norm": 2.5702173709869385, + "learning_rate": 4.259383378016086e-05, + "loss": 0.6071, + "num_input_tokens_seen": 6636920, + "step": 11440 + }, + { + "epoch": 1.7046470062555854, + "grad_norm": 6.79258394241333, + "learning_rate": 4.2612451593684835e-05, + "loss": 0.6827, + "num_input_tokens_seen": 6639800, + "step": 11445 + }, + { + "epoch": 1.7053917187965446, + "grad_norm": 4.731743335723877, + "learning_rate": 4.263106940720882e-05, + "loss": 0.5984, + "num_input_tokens_seen": 6642680, + "step": 11450 + }, + { + "epoch": 1.7061364313375038, + "grad_norm": 4.110493183135986, + "learning_rate": 4.26496872207328e-05, + "loss": 0.6068, + "num_input_tokens_seen": 6645784, + "step": 11455 + }, + { + "epoch": 1.706881143878463, + "grad_norm": 2.371140241622925, + "learning_rate": 4.266830503425678e-05, + "loss": 0.5229, + "num_input_tokens_seen": 6649048, + "step": 11460 + }, + { + "epoch": 1.7076258564194222, + "grad_norm": 3.998575448989868, + "learning_rate": 4.2686922847780755e-05, + "loss": 0.4909, + "num_input_tokens_seen": 6651960, + "step": 11465 + }, + { + "epoch": 1.7083705689603814, + "grad_norm": 4.226199150085449, + "learning_rate": 4.270554066130474e-05, + "loss": 0.6292, + "num_input_tokens_seen": 6654968, + "step": 11470 + }, + { + "epoch": 1.7091152815013406, + "grad_norm": 8.819182395935059, + "learning_rate": 4.272415847482872e-05, + "loss": 0.7025, + "num_input_tokens_seen": 6657944, + "step": 11475 + }, + { + "epoch": 1.7098599940422998, + "grad_norm": 4.675013065338135, + "learning_rate": 4.2742776288352697e-05, + "loss": 0.7306, + "num_input_tokens_seen": 6660824, + "step": 11480 + }, + { + "epoch": 1.710604706583259, + "grad_norm": 7.066685676574707, + "learning_rate": 4.276139410187668e-05, + "loss": 0.896, + "num_input_tokens_seen": 6663800, + "step": 11485 + }, + { + "epoch": 1.711349419124218, + "grad_norm": 5.702134609222412, + "learning_rate": 4.278001191540066e-05, + "loss": 0.6865, + "num_input_tokens_seen": 6666872, + "step": 11490 + }, + { + "epoch": 1.7120941316651772, + "grad_norm": 5.6103715896606445, + "learning_rate": 4.279862972892464e-05, + "loss": 0.55, + "num_input_tokens_seen": 6669624, + "step": 11495 + }, + { + "epoch": 1.7128388442061364, + "grad_norm": 4.5111517906188965, + "learning_rate": 4.2817247542448616e-05, + "loss": 0.6778, + "num_input_tokens_seen": 6672248, + "step": 11500 + }, + { + "epoch": 1.7135835567470956, + "grad_norm": 5.713151931762695, + "learning_rate": 4.28358653559726e-05, + "loss": 0.6443, + "num_input_tokens_seen": 6675288, + "step": 11505 + }, + { + "epoch": 1.7143282692880548, + "grad_norm": 3.895073175430298, + "learning_rate": 4.285448316949658e-05, + "loss": 0.5963, + "num_input_tokens_seen": 6678168, + "step": 11510 + }, + { + "epoch": 1.715072981829014, + "grad_norm": 2.877445936203003, + "learning_rate": 4.287310098302056e-05, + "loss": 0.6253, + "num_input_tokens_seen": 6680792, + "step": 11515 + }, + { + "epoch": 1.7158176943699732, + "grad_norm": 3.266453504562378, + "learning_rate": 4.2891718796544536e-05, + "loss": 0.4625, + "num_input_tokens_seen": 6683672, + "step": 11520 + }, + { + "epoch": 1.7165624069109324, + "grad_norm": 12.615340232849121, + "learning_rate": 4.2910336610068514e-05, + "loss": 0.6798, + "num_input_tokens_seen": 6686584, + "step": 11525 + }, + { + "epoch": 1.7173071194518914, + "grad_norm": 6.00240421295166, + "learning_rate": 4.292895442359249e-05, + "loss": 0.7371, + "num_input_tokens_seen": 6689752, + "step": 11530 + }, + { + "epoch": 1.7180518319928506, + "grad_norm": 4.025372505187988, + "learning_rate": 4.294757223711647e-05, + "loss": 0.6033, + "num_input_tokens_seen": 6692568, + "step": 11535 + }, + { + "epoch": 1.7187965445338098, + "grad_norm": 6.1153244972229, + "learning_rate": 4.2966190050640456e-05, + "loss": 0.6775, + "num_input_tokens_seen": 6695544, + "step": 11540 + }, + { + "epoch": 1.719541257074769, + "grad_norm": 6.099351406097412, + "learning_rate": 4.2984807864164434e-05, + "loss": 0.5065, + "num_input_tokens_seen": 6698424, + "step": 11545 + }, + { + "epoch": 1.7202859696157282, + "grad_norm": 5.231560707092285, + "learning_rate": 4.300342567768841e-05, + "loss": 0.7698, + "num_input_tokens_seen": 6701272, + "step": 11550 + }, + { + "epoch": 1.7210306821566874, + "grad_norm": 3.9571871757507324, + "learning_rate": 4.302204349121239e-05, + "loss": 0.4565, + "num_input_tokens_seen": 6704088, + "step": 11555 + }, + { + "epoch": 1.7217753946976466, + "grad_norm": 4.622130393981934, + "learning_rate": 4.3040661304736375e-05, + "loss": 0.6643, + "num_input_tokens_seen": 6706744, + "step": 11560 + }, + { + "epoch": 1.7225201072386058, + "grad_norm": 4.451765060424805, + "learning_rate": 4.3059279118260354e-05, + "loss": 0.6006, + "num_input_tokens_seen": 6709496, + "step": 11565 + }, + { + "epoch": 1.723264819779565, + "grad_norm": 5.557424545288086, + "learning_rate": 4.307789693178433e-05, + "loss": 0.4771, + "num_input_tokens_seen": 6712504, + "step": 11570 + }, + { + "epoch": 1.7240095323205242, + "grad_norm": 7.164791584014893, + "learning_rate": 4.309651474530832e-05, + "loss": 0.7183, + "num_input_tokens_seen": 6715768, + "step": 11575 + }, + { + "epoch": 1.7247542448614834, + "grad_norm": 2.102402925491333, + "learning_rate": 4.3115132558832295e-05, + "loss": 0.4094, + "num_input_tokens_seen": 6718520, + "step": 11580 + }, + { + "epoch": 1.7254989574024426, + "grad_norm": 4.88011360168457, + "learning_rate": 4.313375037235627e-05, + "loss": 0.707, + "num_input_tokens_seen": 6721304, + "step": 11585 + }, + { + "epoch": 1.7262436699434018, + "grad_norm": 4.632388591766357, + "learning_rate": 4.315236818588025e-05, + "loss": 0.5702, + "num_input_tokens_seen": 6724024, + "step": 11590 + }, + { + "epoch": 1.726988382484361, + "grad_norm": 10.719611167907715, + "learning_rate": 4.3170985999404237e-05, + "loss": 0.776, + "num_input_tokens_seen": 6726904, + "step": 11595 + }, + { + "epoch": 1.7277330950253202, + "grad_norm": 5.606206893920898, + "learning_rate": 4.3189603812928215e-05, + "loss": 0.7504, + "num_input_tokens_seen": 6729784, + "step": 11600 + }, + { + "epoch": 1.7284778075662794, + "grad_norm": 3.4907009601593018, + "learning_rate": 4.3208221626452186e-05, + "loss": 0.4936, + "num_input_tokens_seen": 6732504, + "step": 11605 + }, + { + "epoch": 1.7292225201072386, + "grad_norm": 6.123589992523193, + "learning_rate": 4.322683943997617e-05, + "loss": 0.7356, + "num_input_tokens_seen": 6735320, + "step": 11610 + }, + { + "epoch": 1.7299672326481979, + "grad_norm": 9.545370101928711, + "learning_rate": 4.324545725350015e-05, + "loss": 0.6086, + "num_input_tokens_seen": 6738200, + "step": 11615 + }, + { + "epoch": 1.730711945189157, + "grad_norm": 6.928246021270752, + "learning_rate": 4.326407506702413e-05, + "loss": 0.4986, + "num_input_tokens_seen": 6741304, + "step": 11620 + }, + { + "epoch": 1.7314566577301163, + "grad_norm": 7.2640156745910645, + "learning_rate": 4.3282692880548106e-05, + "loss": 0.7659, + "num_input_tokens_seen": 6744184, + "step": 11625 + }, + { + "epoch": 1.7322013702710755, + "grad_norm": 13.85745906829834, + "learning_rate": 4.330131069407209e-05, + "loss": 0.5945, + "num_input_tokens_seen": 6747032, + "step": 11630 + }, + { + "epoch": 1.7329460828120347, + "grad_norm": 9.291582107543945, + "learning_rate": 4.331992850759607e-05, + "loss": 0.6055, + "num_input_tokens_seen": 6749880, + "step": 11635 + }, + { + "epoch": 1.7336907953529939, + "grad_norm": 8.94929313659668, + "learning_rate": 4.333854632112005e-05, + "loss": 0.9047, + "num_input_tokens_seen": 6752728, + "step": 11640 + }, + { + "epoch": 1.734435507893953, + "grad_norm": 4.16254997253418, + "learning_rate": 4.335716413464403e-05, + "loss": 0.6435, + "num_input_tokens_seen": 6755672, + "step": 11645 + }, + { + "epoch": 1.7351802204349123, + "grad_norm": 3.5294017791748047, + "learning_rate": 4.337578194816801e-05, + "loss": 0.5441, + "num_input_tokens_seen": 6758200, + "step": 11650 + }, + { + "epoch": 1.7359249329758715, + "grad_norm": 7.588011741638184, + "learning_rate": 4.339439976169199e-05, + "loss": 0.6883, + "num_input_tokens_seen": 6760888, + "step": 11655 + }, + { + "epoch": 1.7366696455168305, + "grad_norm": 6.844010829925537, + "learning_rate": 4.341301757521597e-05, + "loss": 0.6068, + "num_input_tokens_seen": 6763800, + "step": 11660 + }, + { + "epoch": 1.7374143580577897, + "grad_norm": 3.820482015609741, + "learning_rate": 4.343163538873995e-05, + "loss": 0.5523, + "num_input_tokens_seen": 6766520, + "step": 11665 + }, + { + "epoch": 1.7381590705987489, + "grad_norm": 6.349262237548828, + "learning_rate": 4.345025320226393e-05, + "loss": 0.6664, + "num_input_tokens_seen": 6769560, + "step": 11670 + }, + { + "epoch": 1.738903783139708, + "grad_norm": 11.686894416809082, + "learning_rate": 4.346887101578791e-05, + "loss": 0.6773, + "num_input_tokens_seen": 6772344, + "step": 11675 + }, + { + "epoch": 1.7396484956806673, + "grad_norm": 3.7384984493255615, + "learning_rate": 4.348748882931189e-05, + "loss": 0.6715, + "num_input_tokens_seen": 6774872, + "step": 11680 + }, + { + "epoch": 1.7403932082216265, + "grad_norm": 8.382156372070312, + "learning_rate": 4.350610664283587e-05, + "loss": 0.6266, + "num_input_tokens_seen": 6777880, + "step": 11685 + }, + { + "epoch": 1.7411379207625857, + "grad_norm": 3.4977545738220215, + "learning_rate": 4.352472445635984e-05, + "loss": 0.634, + "num_input_tokens_seen": 6780696, + "step": 11690 + }, + { + "epoch": 1.7418826333035449, + "grad_norm": 8.947086334228516, + "learning_rate": 4.354334226988382e-05, + "loss": 0.7278, + "num_input_tokens_seen": 6783544, + "step": 11695 + }, + { + "epoch": 1.742627345844504, + "grad_norm": 5.0777668952941895, + "learning_rate": 4.3561960083407807e-05, + "loss": 0.7861, + "num_input_tokens_seen": 6786680, + "step": 11700 + }, + { + "epoch": 1.743372058385463, + "grad_norm": 3.581059455871582, + "learning_rate": 4.3580577896931785e-05, + "loss": 0.6248, + "num_input_tokens_seen": 6789688, + "step": 11705 + }, + { + "epoch": 1.7441167709264223, + "grad_norm": 3.7223567962646484, + "learning_rate": 4.359919571045576e-05, + "loss": 0.6074, + "num_input_tokens_seen": 6792952, + "step": 11710 + }, + { + "epoch": 1.7448614834673815, + "grad_norm": 3.963780403137207, + "learning_rate": 4.361781352397975e-05, + "loss": 0.6546, + "num_input_tokens_seen": 6795928, + "step": 11715 + }, + { + "epoch": 1.7456061960083407, + "grad_norm": 4.265133380889893, + "learning_rate": 4.3636431337503726e-05, + "loss": 0.554, + "num_input_tokens_seen": 6799160, + "step": 11720 + }, + { + "epoch": 1.7463509085492999, + "grad_norm": 4.484224796295166, + "learning_rate": 4.3655049151027704e-05, + "loss": 0.6118, + "num_input_tokens_seen": 6802232, + "step": 11725 + }, + { + "epoch": 1.747095621090259, + "grad_norm": 5.488593578338623, + "learning_rate": 4.367366696455168e-05, + "loss": 0.5697, + "num_input_tokens_seen": 6805208, + "step": 11730 + }, + { + "epoch": 1.7478403336312183, + "grad_norm": 6.185169219970703, + "learning_rate": 4.369228477807567e-05, + "loss": 0.6013, + "num_input_tokens_seen": 6808184, + "step": 11735 + }, + { + "epoch": 1.7485850461721775, + "grad_norm": 7.921233654022217, + "learning_rate": 4.3710902591599646e-05, + "loss": 0.6546, + "num_input_tokens_seen": 6811288, + "step": 11740 + }, + { + "epoch": 1.7493297587131367, + "grad_norm": 14.191580772399902, + "learning_rate": 4.3729520405123624e-05, + "loss": 0.5466, + "num_input_tokens_seen": 6814328, + "step": 11745 + }, + { + "epoch": 1.7500744712540959, + "grad_norm": 7.711426258087158, + "learning_rate": 4.37481382186476e-05, + "loss": 0.5567, + "num_input_tokens_seen": 6817240, + "step": 11750 + }, + { + "epoch": 1.750819183795055, + "grad_norm": 3.5539135932922363, + "learning_rate": 4.376675603217159e-05, + "loss": 0.5216, + "num_input_tokens_seen": 6819896, + "step": 11755 + }, + { + "epoch": 1.7515638963360143, + "grad_norm": 12.96808910369873, + "learning_rate": 4.3785373845695566e-05, + "loss": 0.7591, + "num_input_tokens_seen": 6822840, + "step": 11760 + }, + { + "epoch": 1.7523086088769735, + "grad_norm": 4.988254070281982, + "learning_rate": 4.3803991659219544e-05, + "loss": 0.7015, + "num_input_tokens_seen": 6825624, + "step": 11765 + }, + { + "epoch": 1.7530533214179327, + "grad_norm": 6.149806499481201, + "learning_rate": 4.382260947274352e-05, + "loss": 0.463, + "num_input_tokens_seen": 6828344, + "step": 11770 + }, + { + "epoch": 1.7537980339588919, + "grad_norm": 5.261077404022217, + "learning_rate": 4.384122728626751e-05, + "loss": 0.6418, + "num_input_tokens_seen": 6831736, + "step": 11775 + }, + { + "epoch": 1.754542746499851, + "grad_norm": 10.51897144317627, + "learning_rate": 4.385984509979148e-05, + "loss": 0.649, + "num_input_tokens_seen": 6834552, + "step": 11780 + }, + { + "epoch": 1.7552874590408103, + "grad_norm": 5.646700859069824, + "learning_rate": 4.387846291331546e-05, + "loss": 0.3948, + "num_input_tokens_seen": 6837560, + "step": 11785 + }, + { + "epoch": 1.7560321715817695, + "grad_norm": 5.417813301086426, + "learning_rate": 4.389708072683944e-05, + "loss": 0.6659, + "num_input_tokens_seen": 6840280, + "step": 11790 + }, + { + "epoch": 1.7567768841227287, + "grad_norm": 5.996892929077148, + "learning_rate": 4.391569854036342e-05, + "loss": 0.724, + "num_input_tokens_seen": 6843128, + "step": 11795 + }, + { + "epoch": 1.757521596663688, + "grad_norm": 4.521101951599121, + "learning_rate": 4.39343163538874e-05, + "loss": 0.7715, + "num_input_tokens_seen": 6846328, + "step": 11800 + }, + { + "epoch": 1.758266309204647, + "grad_norm": 8.303827285766602, + "learning_rate": 4.395293416741138e-05, + "loss": 0.5859, + "num_input_tokens_seen": 6849592, + "step": 11805 + }, + { + "epoch": 1.7590110217456063, + "grad_norm": 3.8667855262756348, + "learning_rate": 4.397155198093536e-05, + "loss": 0.6745, + "num_input_tokens_seen": 6852408, + "step": 11810 + }, + { + "epoch": 1.7597557342865655, + "grad_norm": 4.273761749267578, + "learning_rate": 4.399016979445934e-05, + "loss": 0.6407, + "num_input_tokens_seen": 6855160, + "step": 11815 + }, + { + "epoch": 1.7605004468275247, + "grad_norm": 6.092915058135986, + "learning_rate": 4.400878760798332e-05, + "loss": 0.6841, + "num_input_tokens_seen": 6857912, + "step": 11820 + }, + { + "epoch": 1.761245159368484, + "grad_norm": 7.058231353759766, + "learning_rate": 4.40274054215073e-05, + "loss": 0.6876, + "num_input_tokens_seen": 6860792, + "step": 11825 + }, + { + "epoch": 1.7619898719094431, + "grad_norm": 8.975815773010254, + "learning_rate": 4.404602323503128e-05, + "loss": 0.7213, + "num_input_tokens_seen": 6863640, + "step": 11830 + }, + { + "epoch": 1.762734584450402, + "grad_norm": 7.665948390960693, + "learning_rate": 4.406464104855526e-05, + "loss": 0.6181, + "num_input_tokens_seen": 6866296, + "step": 11835 + }, + { + "epoch": 1.7634792969913613, + "grad_norm": 4.756035804748535, + "learning_rate": 4.408325886207924e-05, + "loss": 0.6394, + "num_input_tokens_seen": 6869592, + "step": 11840 + }, + { + "epoch": 1.7642240095323205, + "grad_norm": 4.0740251541137695, + "learning_rate": 4.410187667560322e-05, + "loss": 0.5606, + "num_input_tokens_seen": 6872056, + "step": 11845 + }, + { + "epoch": 1.7649687220732797, + "grad_norm": 4.051481246948242, + "learning_rate": 4.41204944891272e-05, + "loss": 0.5815, + "num_input_tokens_seen": 6874968, + "step": 11850 + }, + { + "epoch": 1.765713434614239, + "grad_norm": 6.705667972564697, + "learning_rate": 4.413911230265118e-05, + "loss": 0.4689, + "num_input_tokens_seen": 6877976, + "step": 11855 + }, + { + "epoch": 1.766458147155198, + "grad_norm": 7.690849781036377, + "learning_rate": 4.4157730116175164e-05, + "loss": 0.5671, + "num_input_tokens_seen": 6881016, + "step": 11860 + }, + { + "epoch": 1.7672028596961573, + "grad_norm": 3.1113803386688232, + "learning_rate": 4.4176347929699136e-05, + "loss": 0.6277, + "num_input_tokens_seen": 6883832, + "step": 11865 + }, + { + "epoch": 1.7679475722371165, + "grad_norm": 5.7666168212890625, + "learning_rate": 4.4194965743223114e-05, + "loss": 0.6783, + "num_input_tokens_seen": 6886680, + "step": 11870 + }, + { + "epoch": 1.7686922847780755, + "grad_norm": 2.8146004676818848, + "learning_rate": 4.42135835567471e-05, + "loss": 0.8266, + "num_input_tokens_seen": 6889656, + "step": 11875 + }, + { + "epoch": 1.7694369973190347, + "grad_norm": 5.955087184906006, + "learning_rate": 4.423220137027108e-05, + "loss": 0.5119, + "num_input_tokens_seen": 6892920, + "step": 11880 + }, + { + "epoch": 1.770181709859994, + "grad_norm": 4.370912551879883, + "learning_rate": 4.4250819183795055e-05, + "loss": 0.5099, + "num_input_tokens_seen": 6895928, + "step": 11885 + }, + { + "epoch": 1.770926422400953, + "grad_norm": 9.86129093170166, + "learning_rate": 4.4269436997319034e-05, + "loss": 0.6949, + "num_input_tokens_seen": 6898680, + "step": 11890 + }, + { + "epoch": 1.7716711349419123, + "grad_norm": 4.688813209533691, + "learning_rate": 4.428805481084302e-05, + "loss": 0.5691, + "num_input_tokens_seen": 6901752, + "step": 11895 + }, + { + "epoch": 1.7724158474828715, + "grad_norm": 6.208582401275635, + "learning_rate": 4.4306672624367e-05, + "loss": 0.7226, + "num_input_tokens_seen": 6904472, + "step": 11900 + }, + { + "epoch": 1.7731605600238307, + "grad_norm": 2.8416755199432373, + "learning_rate": 4.4325290437890975e-05, + "loss": 0.6179, + "num_input_tokens_seen": 6907384, + "step": 11905 + }, + { + "epoch": 1.77390527256479, + "grad_norm": 5.2469162940979, + "learning_rate": 4.434390825141495e-05, + "loss": 0.7169, + "num_input_tokens_seen": 6910168, + "step": 11910 + }, + { + "epoch": 1.7746499851057491, + "grad_norm": 11.526326179504395, + "learning_rate": 4.436252606493894e-05, + "loss": 0.8902, + "num_input_tokens_seen": 6912888, + "step": 11915 + }, + { + "epoch": 1.7753946976467083, + "grad_norm": 5.942480087280273, + "learning_rate": 4.4381143878462917e-05, + "loss": 0.5436, + "num_input_tokens_seen": 6915544, + "step": 11920 + }, + { + "epoch": 1.7761394101876675, + "grad_norm": 4.648182392120361, + "learning_rate": 4.4399761691986895e-05, + "loss": 0.6325, + "num_input_tokens_seen": 6918456, + "step": 11925 + }, + { + "epoch": 1.7768841227286267, + "grad_norm": 3.393341064453125, + "learning_rate": 4.441837950551088e-05, + "loss": 0.5614, + "num_input_tokens_seen": 6921464, + "step": 11930 + }, + { + "epoch": 1.777628835269586, + "grad_norm": 4.342916011810303, + "learning_rate": 4.443699731903486e-05, + "loss": 0.784, + "num_input_tokens_seen": 6924216, + "step": 11935 + }, + { + "epoch": 1.7783735478105451, + "grad_norm": 4.558290004730225, + "learning_rate": 4.4455615132558836e-05, + "loss": 0.6642, + "num_input_tokens_seen": 6927032, + "step": 11940 + }, + { + "epoch": 1.7791182603515043, + "grad_norm": 3.8558192253112793, + "learning_rate": 4.4474232946082814e-05, + "loss": 0.543, + "num_input_tokens_seen": 6929912, + "step": 11945 + }, + { + "epoch": 1.7798629728924635, + "grad_norm": 4.11265754699707, + "learning_rate": 4.449285075960679e-05, + "loss": 0.6989, + "num_input_tokens_seen": 6933240, + "step": 11950 + }, + { + "epoch": 1.7806076854334227, + "grad_norm": 3.5687310695648193, + "learning_rate": 4.451146857313077e-05, + "loss": 0.6146, + "num_input_tokens_seen": 6936056, + "step": 11955 + }, + { + "epoch": 1.781352397974382, + "grad_norm": 4.673165321350098, + "learning_rate": 4.453008638665475e-05, + "loss": 0.4548, + "num_input_tokens_seen": 6938744, + "step": 11960 + }, + { + "epoch": 1.7820971105153411, + "grad_norm": 4.200554847717285, + "learning_rate": 4.4548704200178734e-05, + "loss": 0.576, + "num_input_tokens_seen": 6941752, + "step": 11965 + }, + { + "epoch": 1.7828418230563003, + "grad_norm": 14.030159950256348, + "learning_rate": 4.456732201370271e-05, + "loss": 0.5422, + "num_input_tokens_seen": 6944856, + "step": 11970 + }, + { + "epoch": 1.7835865355972595, + "grad_norm": 7.3450541496276855, + "learning_rate": 4.458593982722669e-05, + "loss": 0.8863, + "num_input_tokens_seen": 6947800, + "step": 11975 + }, + { + "epoch": 1.7843312481382188, + "grad_norm": 7.540850639343262, + "learning_rate": 4.460455764075067e-05, + "loss": 0.6643, + "num_input_tokens_seen": 6950520, + "step": 11980 + }, + { + "epoch": 1.785075960679178, + "grad_norm": 5.935235977172852, + "learning_rate": 4.4623175454274654e-05, + "loss": 0.6367, + "num_input_tokens_seen": 6953368, + "step": 11985 + }, + { + "epoch": 1.7858206732201372, + "grad_norm": 3.1787681579589844, + "learning_rate": 4.464179326779863e-05, + "loss": 0.6682, + "num_input_tokens_seen": 6956376, + "step": 11990 + }, + { + "epoch": 1.7865653857610964, + "grad_norm": 3.7997641563415527, + "learning_rate": 4.466041108132261e-05, + "loss": 0.5995, + "num_input_tokens_seen": 6959224, + "step": 11995 + }, + { + "epoch": 1.7873100983020556, + "grad_norm": 5.153004169464111, + "learning_rate": 4.467902889484659e-05, + "loss": 0.6064, + "num_input_tokens_seen": 6962072, + "step": 12000 + }, + { + "epoch": 1.7880548108430145, + "grad_norm": 3.8363380432128906, + "learning_rate": 4.4697646708370574e-05, + "loss": 0.4953, + "num_input_tokens_seen": 6965208, + "step": 12005 + }, + { + "epoch": 1.7887995233839737, + "grad_norm": 4.094592571258545, + "learning_rate": 4.471626452189455e-05, + "loss": 0.789, + "num_input_tokens_seen": 6968280, + "step": 12010 + }, + { + "epoch": 1.789544235924933, + "grad_norm": 2.8305509090423584, + "learning_rate": 4.473488233541853e-05, + "loss": 0.5117, + "num_input_tokens_seen": 6970936, + "step": 12015 + }, + { + "epoch": 1.7902889484658921, + "grad_norm": 6.363861083984375, + "learning_rate": 4.4753500148942515e-05, + "loss": 0.5626, + "num_input_tokens_seen": 6973816, + "step": 12020 + }, + { + "epoch": 1.7910336610068514, + "grad_norm": 3.672292709350586, + "learning_rate": 4.477211796246649e-05, + "loss": 0.7157, + "num_input_tokens_seen": 6976664, + "step": 12025 + }, + { + "epoch": 1.7917783735478106, + "grad_norm": 4.658320903778076, + "learning_rate": 4.479073577599047e-05, + "loss": 0.3884, + "num_input_tokens_seen": 6979704, + "step": 12030 + }, + { + "epoch": 1.7925230860887698, + "grad_norm": 7.757577419281006, + "learning_rate": 4.480935358951445e-05, + "loss": 0.6727, + "num_input_tokens_seen": 6982456, + "step": 12035 + }, + { + "epoch": 1.793267798629729, + "grad_norm": 6.37185525894165, + "learning_rate": 4.482797140303843e-05, + "loss": 0.6087, + "num_input_tokens_seen": 6985176, + "step": 12040 + }, + { + "epoch": 1.7940125111706882, + "grad_norm": 3.3823137283325195, + "learning_rate": 4.4846589216562406e-05, + "loss": 0.415, + "num_input_tokens_seen": 6987960, + "step": 12045 + }, + { + "epoch": 1.7947572237116471, + "grad_norm": 4.712564945220947, + "learning_rate": 4.4865207030086384e-05, + "loss": 0.7185, + "num_input_tokens_seen": 6991064, + "step": 12050 + }, + { + "epoch": 1.7955019362526063, + "grad_norm": 7.790781497955322, + "learning_rate": 4.488382484361037e-05, + "loss": 0.6203, + "num_input_tokens_seen": 6994008, + "step": 12055 + }, + { + "epoch": 1.7962466487935655, + "grad_norm": 13.311107635498047, + "learning_rate": 4.490244265713435e-05, + "loss": 0.7469, + "num_input_tokens_seen": 6997208, + "step": 12060 + }, + { + "epoch": 1.7969913613345248, + "grad_norm": 3.295586585998535, + "learning_rate": 4.4921060470658326e-05, + "loss": 0.6261, + "num_input_tokens_seen": 7000120, + "step": 12065 + }, + { + "epoch": 1.797736073875484, + "grad_norm": 3.6568756103515625, + "learning_rate": 4.4939678284182304e-05, + "loss": 0.6274, + "num_input_tokens_seen": 7003160, + "step": 12070 + }, + { + "epoch": 1.7984807864164432, + "grad_norm": 5.147959232330322, + "learning_rate": 4.495829609770629e-05, + "loss": 0.6401, + "num_input_tokens_seen": 7006008, + "step": 12075 + }, + { + "epoch": 1.7992254989574024, + "grad_norm": 10.339615821838379, + "learning_rate": 4.497691391123027e-05, + "loss": 0.7111, + "num_input_tokens_seen": 7008792, + "step": 12080 + }, + { + "epoch": 1.7999702114983616, + "grad_norm": 4.225924491882324, + "learning_rate": 4.4995531724754246e-05, + "loss": 0.6707, + "num_input_tokens_seen": 7011736, + "step": 12085 + }, + { + "epoch": 1.8007149240393208, + "grad_norm": 6.2707929611206055, + "learning_rate": 4.501414953827823e-05, + "loss": 0.7039, + "num_input_tokens_seen": 7014296, + "step": 12090 + }, + { + "epoch": 1.80145963658028, + "grad_norm": 3.9044508934020996, + "learning_rate": 4.503276735180221e-05, + "loss": 0.7388, + "num_input_tokens_seen": 7017656, + "step": 12095 + }, + { + "epoch": 1.8022043491212392, + "grad_norm": 4.41541051864624, + "learning_rate": 4.505138516532619e-05, + "loss": 0.5825, + "num_input_tokens_seen": 7020536, + "step": 12100 + }, + { + "epoch": 1.8029490616621984, + "grad_norm": 7.854771614074707, + "learning_rate": 4.5070002978850165e-05, + "loss": 0.6028, + "num_input_tokens_seen": 7023672, + "step": 12105 + }, + { + "epoch": 1.8036937742031576, + "grad_norm": 5.408448219299316, + "learning_rate": 4.508862079237415e-05, + "loss": 0.5552, + "num_input_tokens_seen": 7026232, + "step": 12110 + }, + { + "epoch": 1.8044384867441168, + "grad_norm": 10.372976303100586, + "learning_rate": 4.510723860589813e-05, + "loss": 0.7447, + "num_input_tokens_seen": 7029496, + "step": 12115 + }, + { + "epoch": 1.805183199285076, + "grad_norm": 4.403190612792969, + "learning_rate": 4.512585641942211e-05, + "loss": 0.5816, + "num_input_tokens_seen": 7032312, + "step": 12120 + }, + { + "epoch": 1.8059279118260352, + "grad_norm": 4.813129901885986, + "learning_rate": 4.5144474232946085e-05, + "loss": 0.9561, + "num_input_tokens_seen": 7035256, + "step": 12125 + }, + { + "epoch": 1.8066726243669944, + "grad_norm": 3.825820207595825, + "learning_rate": 4.516309204647006e-05, + "loss": 0.6456, + "num_input_tokens_seen": 7038008, + "step": 12130 + }, + { + "epoch": 1.8074173369079536, + "grad_norm": 5.882998943328857, + "learning_rate": 4.518170985999404e-05, + "loss": 0.4498, + "num_input_tokens_seen": 7040824, + "step": 12135 + }, + { + "epoch": 1.8081620494489128, + "grad_norm": 5.668084621429443, + "learning_rate": 4.520032767351802e-05, + "loss": 0.5676, + "num_input_tokens_seen": 7043960, + "step": 12140 + }, + { + "epoch": 1.808906761989872, + "grad_norm": 5.240414142608643, + "learning_rate": 4.5218945487042005e-05, + "loss": 0.6178, + "num_input_tokens_seen": 7046840, + "step": 12145 + }, + { + "epoch": 1.8096514745308312, + "grad_norm": 10.275688171386719, + "learning_rate": 4.523756330056598e-05, + "loss": 0.7283, + "num_input_tokens_seen": 7050104, + "step": 12150 + }, + { + "epoch": 1.8103961870717904, + "grad_norm": 23.055173873901367, + "learning_rate": 4.525618111408996e-05, + "loss": 0.6218, + "num_input_tokens_seen": 7052728, + "step": 12155 + }, + { + "epoch": 1.8111408996127496, + "grad_norm": 6.450142860412598, + "learning_rate": 4.527479892761394e-05, + "loss": 0.5472, + "num_input_tokens_seen": 7055800, + "step": 12160 + }, + { + "epoch": 1.8118856121537088, + "grad_norm": 9.795681953430176, + "learning_rate": 4.5293416741137924e-05, + "loss": 0.7701, + "num_input_tokens_seen": 7058904, + "step": 12165 + }, + { + "epoch": 1.812630324694668, + "grad_norm": 11.870140075683594, + "learning_rate": 4.53120345546619e-05, + "loss": 0.8517, + "num_input_tokens_seen": 7061688, + "step": 12170 + }, + { + "epoch": 1.8133750372356272, + "grad_norm": 3.642911911010742, + "learning_rate": 4.533065236818588e-05, + "loss": 0.6215, + "num_input_tokens_seen": 7064504, + "step": 12175 + }, + { + "epoch": 1.8141197497765862, + "grad_norm": 5.268712997436523, + "learning_rate": 4.5349270181709866e-05, + "loss": 0.6562, + "num_input_tokens_seen": 7067512, + "step": 12180 + }, + { + "epoch": 1.8148644623175454, + "grad_norm": 6.131793975830078, + "learning_rate": 4.5367887995233844e-05, + "loss": 0.5783, + "num_input_tokens_seen": 7070648, + "step": 12185 + }, + { + "epoch": 1.8156091748585046, + "grad_norm": 7.6967082023620605, + "learning_rate": 4.538650580875782e-05, + "loss": 0.5931, + "num_input_tokens_seen": 7073336, + "step": 12190 + }, + { + "epoch": 1.8163538873994638, + "grad_norm": 9.693009376525879, + "learning_rate": 4.54051236222818e-05, + "loss": 0.6242, + "num_input_tokens_seen": 7076152, + "step": 12195 + }, + { + "epoch": 1.817098599940423, + "grad_norm": 5.761893272399902, + "learning_rate": 4.5423741435805786e-05, + "loss": 0.5264, + "num_input_tokens_seen": 7078968, + "step": 12200 + }, + { + "epoch": 1.8178433124813822, + "grad_norm": 3.169485092163086, + "learning_rate": 4.5442359249329764e-05, + "loss": 0.6428, + "num_input_tokens_seen": 7082168, + "step": 12205 + }, + { + "epoch": 1.8185880250223414, + "grad_norm": 5.56063985824585, + "learning_rate": 4.5460977062853735e-05, + "loss": 0.7265, + "num_input_tokens_seen": 7085144, + "step": 12210 + }, + { + "epoch": 1.8193327375633006, + "grad_norm": 4.7633957862854, + "learning_rate": 4.547959487637772e-05, + "loss": 0.6867, + "num_input_tokens_seen": 7087896, + "step": 12215 + }, + { + "epoch": 1.8200774501042598, + "grad_norm": 3.547271728515625, + "learning_rate": 4.54982126899017e-05, + "loss": 0.6744, + "num_input_tokens_seen": 7090808, + "step": 12220 + }, + { + "epoch": 1.8208221626452188, + "grad_norm": 4.264240264892578, + "learning_rate": 4.551683050342568e-05, + "loss": 0.5539, + "num_input_tokens_seen": 7093624, + "step": 12225 + }, + { + "epoch": 1.821566875186178, + "grad_norm": 4.456305980682373, + "learning_rate": 4.5535448316949655e-05, + "loss": 0.6315, + "num_input_tokens_seen": 7096408, + "step": 12230 + }, + { + "epoch": 1.8223115877271372, + "grad_norm": 3.144590377807617, + "learning_rate": 4.555406613047364e-05, + "loss": 0.6704, + "num_input_tokens_seen": 7099192, + "step": 12235 + }, + { + "epoch": 1.8230563002680964, + "grad_norm": 7.575942039489746, + "learning_rate": 4.557268394399762e-05, + "loss": 0.6479, + "num_input_tokens_seen": 7102008, + "step": 12240 + }, + { + "epoch": 1.8238010128090556, + "grad_norm": 4.107391834259033, + "learning_rate": 4.5591301757521596e-05, + "loss": 0.6937, + "num_input_tokens_seen": 7105016, + "step": 12245 + }, + { + "epoch": 1.8245457253500148, + "grad_norm": 6.510260105133057, + "learning_rate": 4.560991957104558e-05, + "loss": 0.6052, + "num_input_tokens_seen": 7107640, + "step": 12250 + }, + { + "epoch": 1.825290437890974, + "grad_norm": 7.90175724029541, + "learning_rate": 4.562853738456956e-05, + "loss": 0.8746, + "num_input_tokens_seen": 7110328, + "step": 12255 + }, + { + "epoch": 1.8260351504319332, + "grad_norm": 3.8362338542938232, + "learning_rate": 4.564715519809354e-05, + "loss": 0.5334, + "num_input_tokens_seen": 7113112, + "step": 12260 + }, + { + "epoch": 1.8267798629728924, + "grad_norm": 4.0526123046875, + "learning_rate": 4.5665773011617516e-05, + "loss": 0.7329, + "num_input_tokens_seen": 7116216, + "step": 12265 + }, + { + "epoch": 1.8275245755138516, + "grad_norm": 3.652106523513794, + "learning_rate": 4.56843908251415e-05, + "loss": 0.481, + "num_input_tokens_seen": 7119096, + "step": 12270 + }, + { + "epoch": 1.8282692880548108, + "grad_norm": 5.7282938957214355, + "learning_rate": 4.570300863866548e-05, + "loss": 0.5643, + "num_input_tokens_seen": 7122104, + "step": 12275 + }, + { + "epoch": 1.82901400059577, + "grad_norm": 4.560911178588867, + "learning_rate": 4.572162645218946e-05, + "loss": 0.5398, + "num_input_tokens_seen": 7125048, + "step": 12280 + }, + { + "epoch": 1.8297587131367292, + "grad_norm": 7.77790641784668, + "learning_rate": 4.5740244265713436e-05, + "loss": 0.5461, + "num_input_tokens_seen": 7127992, + "step": 12285 + }, + { + "epoch": 1.8305034256776884, + "grad_norm": 5.904672145843506, + "learning_rate": 4.575886207923742e-05, + "loss": 0.5559, + "num_input_tokens_seen": 7130744, + "step": 12290 + }, + { + "epoch": 1.8312481382186476, + "grad_norm": 5.333429336547852, + "learning_rate": 4.57774798927614e-05, + "loss": 0.675, + "num_input_tokens_seen": 7133496, + "step": 12295 + }, + { + "epoch": 1.8319928507596068, + "grad_norm": 12.57758617401123, + "learning_rate": 4.579609770628537e-05, + "loss": 0.5262, + "num_input_tokens_seen": 7136504, + "step": 12300 + }, + { + "epoch": 1.832737563300566, + "grad_norm": 6.370972633361816, + "learning_rate": 4.5814715519809356e-05, + "loss": 0.8319, + "num_input_tokens_seen": 7139896, + "step": 12305 + }, + { + "epoch": 1.8334822758415252, + "grad_norm": 10.909812927246094, + "learning_rate": 4.5833333333333334e-05, + "loss": 0.5855, + "num_input_tokens_seen": 7142520, + "step": 12310 + }, + { + "epoch": 1.8342269883824844, + "grad_norm": 4.637127876281738, + "learning_rate": 4.585195114685731e-05, + "loss": 0.6795, + "num_input_tokens_seen": 7145432, + "step": 12315 + }, + { + "epoch": 1.8349717009234436, + "grad_norm": 11.842432975769043, + "learning_rate": 4.58705689603813e-05, + "loss": 0.797, + "num_input_tokens_seen": 7148408, + "step": 12320 + }, + { + "epoch": 1.8357164134644028, + "grad_norm": 4.445321559906006, + "learning_rate": 4.5889186773905275e-05, + "loss": 0.5384, + "num_input_tokens_seen": 7151160, + "step": 12325 + }, + { + "epoch": 1.836461126005362, + "grad_norm": 3.2518184185028076, + "learning_rate": 4.5907804587429254e-05, + "loss": 0.5262, + "num_input_tokens_seen": 7153976, + "step": 12330 + }, + { + "epoch": 1.8372058385463212, + "grad_norm": 5.05765438079834, + "learning_rate": 4.592642240095323e-05, + "loss": 0.6222, + "num_input_tokens_seen": 7156888, + "step": 12335 + }, + { + "epoch": 1.8379505510872804, + "grad_norm": 8.208946228027344, + "learning_rate": 4.594504021447722e-05, + "loss": 0.6777, + "num_input_tokens_seen": 7159800, + "step": 12340 + }, + { + "epoch": 1.8386952636282397, + "grad_norm": 6.391213417053223, + "learning_rate": 4.5963658028001195e-05, + "loss": 0.604, + "num_input_tokens_seen": 7162936, + "step": 12345 + }, + { + "epoch": 1.8394399761691989, + "grad_norm": 6.056371212005615, + "learning_rate": 4.598227584152517e-05, + "loss": 0.7141, + "num_input_tokens_seen": 7165688, + "step": 12350 + }, + { + "epoch": 1.8401846887101578, + "grad_norm": 3.836357831954956, + "learning_rate": 4.600089365504915e-05, + "loss": 0.7232, + "num_input_tokens_seen": 7168248, + "step": 12355 + }, + { + "epoch": 1.840929401251117, + "grad_norm": 3.547393321990967, + "learning_rate": 4.6019511468573136e-05, + "loss": 0.7409, + "num_input_tokens_seen": 7170872, + "step": 12360 + }, + { + "epoch": 1.8416741137920762, + "grad_norm": 3.9752306938171387, + "learning_rate": 4.6038129282097115e-05, + "loss": 0.5905, + "num_input_tokens_seen": 7173656, + "step": 12365 + }, + { + "epoch": 1.8424188263330354, + "grad_norm": 4.336883068084717, + "learning_rate": 4.605674709562109e-05, + "loss": 0.7708, + "num_input_tokens_seen": 7176728, + "step": 12370 + }, + { + "epoch": 1.8431635388739946, + "grad_norm": 4.42314338684082, + "learning_rate": 4.607536490914507e-05, + "loss": 0.6326, + "num_input_tokens_seen": 7179640, + "step": 12375 + }, + { + "epoch": 1.8439082514149538, + "grad_norm": 4.061672210693359, + "learning_rate": 4.6093982722669056e-05, + "loss": 0.703, + "num_input_tokens_seen": 7182424, + "step": 12380 + }, + { + "epoch": 1.844652963955913, + "grad_norm": 4.492058753967285, + "learning_rate": 4.611260053619303e-05, + "loss": 0.8266, + "num_input_tokens_seen": 7185304, + "step": 12385 + }, + { + "epoch": 1.8453976764968723, + "grad_norm": 4.08718729019165, + "learning_rate": 4.6131218349717006e-05, + "loss": 0.693, + "num_input_tokens_seen": 7188248, + "step": 12390 + }, + { + "epoch": 1.8461423890378312, + "grad_norm": 2.4980199337005615, + "learning_rate": 4.614983616324099e-05, + "loss": 0.6187, + "num_input_tokens_seen": 7191192, + "step": 12395 + }, + { + "epoch": 1.8468871015787904, + "grad_norm": 4.249551296234131, + "learning_rate": 4.616845397676497e-05, + "loss": 0.6052, + "num_input_tokens_seen": 7193976, + "step": 12400 + }, + { + "epoch": 1.8476318141197496, + "grad_norm": 2.358276605606079, + "learning_rate": 4.618707179028895e-05, + "loss": 0.5534, + "num_input_tokens_seen": 7196824, + "step": 12405 + }, + { + "epoch": 1.8483765266607088, + "grad_norm": 3.281371593475342, + "learning_rate": 4.620568960381293e-05, + "loss": 0.6024, + "num_input_tokens_seen": 7199672, + "step": 12410 + }, + { + "epoch": 1.849121239201668, + "grad_norm": 3.7525646686553955, + "learning_rate": 4.622430741733691e-05, + "loss": 0.525, + "num_input_tokens_seen": 7202648, + "step": 12415 + }, + { + "epoch": 1.8498659517426272, + "grad_norm": 8.10236644744873, + "learning_rate": 4.624292523086089e-05, + "loss": 0.4726, + "num_input_tokens_seen": 7205688, + "step": 12420 + }, + { + "epoch": 1.8506106642835864, + "grad_norm": 8.03190803527832, + "learning_rate": 4.626154304438487e-05, + "loss": 0.6496, + "num_input_tokens_seen": 7208824, + "step": 12425 + }, + { + "epoch": 1.8513553768245457, + "grad_norm": 9.363776206970215, + "learning_rate": 4.628016085790885e-05, + "loss": 0.7654, + "num_input_tokens_seen": 7211960, + "step": 12430 + }, + { + "epoch": 1.8521000893655049, + "grad_norm": 8.239511489868164, + "learning_rate": 4.629877867143283e-05, + "loss": 0.4651, + "num_input_tokens_seen": 7214776, + "step": 12435 + }, + { + "epoch": 1.852844801906464, + "grad_norm": 7.337381839752197, + "learning_rate": 4.631739648495681e-05, + "loss": 0.6336, + "num_input_tokens_seen": 7218072, + "step": 12440 + }, + { + "epoch": 1.8535895144474233, + "grad_norm": 4.978404521942139, + "learning_rate": 4.633601429848079e-05, + "loss": 0.6342, + "num_input_tokens_seen": 7221272, + "step": 12445 + }, + { + "epoch": 1.8543342269883825, + "grad_norm": 5.510385513305664, + "learning_rate": 4.635463211200477e-05, + "loss": 0.7317, + "num_input_tokens_seen": 7224120, + "step": 12450 + }, + { + "epoch": 1.8550789395293417, + "grad_norm": 4.966899394989014, + "learning_rate": 4.637324992552875e-05, + "loss": 0.8651, + "num_input_tokens_seen": 7227256, + "step": 12455 + }, + { + "epoch": 1.8558236520703009, + "grad_norm": 3.945984363555908, + "learning_rate": 4.639186773905273e-05, + "loss": 0.5784, + "num_input_tokens_seen": 7230520, + "step": 12460 + }, + { + "epoch": 1.85656836461126, + "grad_norm": 2.4968619346618652, + "learning_rate": 4.641048555257671e-05, + "loss": 0.5563, + "num_input_tokens_seen": 7233272, + "step": 12465 + }, + { + "epoch": 1.8573130771522193, + "grad_norm": 5.11669397354126, + "learning_rate": 4.6429103366100685e-05, + "loss": 0.5977, + "num_input_tokens_seen": 7236024, + "step": 12470 + }, + { + "epoch": 1.8580577896931785, + "grad_norm": 3.194843053817749, + "learning_rate": 4.644772117962466e-05, + "loss": 0.7205, + "num_input_tokens_seen": 7238840, + "step": 12475 + }, + { + "epoch": 1.8588025022341377, + "grad_norm": 3.843435525894165, + "learning_rate": 4.646633899314865e-05, + "loss": 0.5314, + "num_input_tokens_seen": 7241784, + "step": 12480 + }, + { + "epoch": 1.8595472147750969, + "grad_norm": 3.1227831840515137, + "learning_rate": 4.6484956806672626e-05, + "loss": 0.6284, + "num_input_tokens_seen": 7244440, + "step": 12485 + }, + { + "epoch": 1.860291927316056, + "grad_norm": 3.9463272094726562, + "learning_rate": 4.6503574620196604e-05, + "loss": 0.6492, + "num_input_tokens_seen": 7247320, + "step": 12490 + }, + { + "epoch": 1.8610366398570153, + "grad_norm": 6.545291900634766, + "learning_rate": 4.652219243372058e-05, + "loss": 0.7819, + "num_input_tokens_seen": 7249944, + "step": 12495 + }, + { + "epoch": 1.8617813523979745, + "grad_norm": 3.638263702392578, + "learning_rate": 4.654081024724457e-05, + "loss": 0.5644, + "num_input_tokens_seen": 7253112, + "step": 12500 + }, + { + "epoch": 1.8625260649389337, + "grad_norm": 8.08830451965332, + "learning_rate": 4.6559428060768546e-05, + "loss": 0.6523, + "num_input_tokens_seen": 7256472, + "step": 12505 + }, + { + "epoch": 1.863270777479893, + "grad_norm": 5.866307258605957, + "learning_rate": 4.6578045874292524e-05, + "loss": 0.5269, + "num_input_tokens_seen": 7259224, + "step": 12510 + }, + { + "epoch": 1.864015490020852, + "grad_norm": 4.291989326477051, + "learning_rate": 4.65966636878165e-05, + "loss": 0.6169, + "num_input_tokens_seen": 7261912, + "step": 12515 + }, + { + "epoch": 1.8647602025618113, + "grad_norm": 4.261879920959473, + "learning_rate": 4.661528150134049e-05, + "loss": 0.6691, + "num_input_tokens_seen": 7264728, + "step": 12520 + }, + { + "epoch": 1.8655049151027703, + "grad_norm": 4.928695201873779, + "learning_rate": 4.6633899314864466e-05, + "loss": 0.495, + "num_input_tokens_seen": 7267672, + "step": 12525 + }, + { + "epoch": 1.8662496276437295, + "grad_norm": 12.360894203186035, + "learning_rate": 4.6652517128388444e-05, + "loss": 0.6437, + "num_input_tokens_seen": 7270552, + "step": 12530 + }, + { + "epoch": 1.8669943401846887, + "grad_norm": 4.220439910888672, + "learning_rate": 4.667113494191243e-05, + "loss": 0.7523, + "num_input_tokens_seen": 7273240, + "step": 12535 + }, + { + "epoch": 1.8677390527256479, + "grad_norm": 4.9522705078125, + "learning_rate": 4.668975275543641e-05, + "loss": 0.5393, + "num_input_tokens_seen": 7276216, + "step": 12540 + }, + { + "epoch": 1.868483765266607, + "grad_norm": 4.45865535736084, + "learning_rate": 4.6708370568960385e-05, + "loss": 0.5943, + "num_input_tokens_seen": 7278904, + "step": 12545 + }, + { + "epoch": 1.8692284778075663, + "grad_norm": 7.295464038848877, + "learning_rate": 4.6726988382484364e-05, + "loss": 0.5597, + "num_input_tokens_seen": 7281720, + "step": 12550 + }, + { + "epoch": 1.8699731903485255, + "grad_norm": 8.103312492370605, + "learning_rate": 4.674560619600835e-05, + "loss": 0.7416, + "num_input_tokens_seen": 7284504, + "step": 12555 + }, + { + "epoch": 1.8707179028894847, + "grad_norm": 7.559497833251953, + "learning_rate": 4.676422400953232e-05, + "loss": 0.7232, + "num_input_tokens_seen": 7287160, + "step": 12560 + }, + { + "epoch": 1.871462615430444, + "grad_norm": 4.835958003997803, + "learning_rate": 4.67828418230563e-05, + "loss": 0.5966, + "num_input_tokens_seen": 7290008, + "step": 12565 + }, + { + "epoch": 1.8722073279714029, + "grad_norm": 5.323870658874512, + "learning_rate": 4.680145963658028e-05, + "loss": 0.4286, + "num_input_tokens_seen": 7292920, + "step": 12570 + }, + { + "epoch": 1.872952040512362, + "grad_norm": 5.4406304359436035, + "learning_rate": 4.682007745010426e-05, + "loss": 0.6086, + "num_input_tokens_seen": 7295928, + "step": 12575 + }, + { + "epoch": 1.8736967530533213, + "grad_norm": 9.2958345413208, + "learning_rate": 4.683869526362824e-05, + "loss": 0.6594, + "num_input_tokens_seen": 7298968, + "step": 12580 + }, + { + "epoch": 1.8744414655942805, + "grad_norm": 4.04008150100708, + "learning_rate": 4.685731307715222e-05, + "loss": 0.5466, + "num_input_tokens_seen": 7302104, + "step": 12585 + }, + { + "epoch": 1.8751861781352397, + "grad_norm": 3.168978214263916, + "learning_rate": 4.68759308906762e-05, + "loss": 0.6096, + "num_input_tokens_seen": 7305080, + "step": 12590 + }, + { + "epoch": 1.875930890676199, + "grad_norm": 4.5565009117126465, + "learning_rate": 4.689454870420018e-05, + "loss": 0.6714, + "num_input_tokens_seen": 7308152, + "step": 12595 + }, + { + "epoch": 1.876675603217158, + "grad_norm": 9.10167121887207, + "learning_rate": 4.691316651772416e-05, + "loss": 0.6034, + "num_input_tokens_seen": 7310840, + "step": 12600 + }, + { + "epoch": 1.8774203157581173, + "grad_norm": 2.4165894985198975, + "learning_rate": 4.693178433124814e-05, + "loss": 0.56, + "num_input_tokens_seen": 7313368, + "step": 12605 + }, + { + "epoch": 1.8781650282990765, + "grad_norm": 7.391448020935059, + "learning_rate": 4.695040214477212e-05, + "loss": 0.641, + "num_input_tokens_seen": 7316728, + "step": 12610 + }, + { + "epoch": 1.8789097408400357, + "grad_norm": 8.675928115844727, + "learning_rate": 4.69690199582961e-05, + "loss": 0.597, + "num_input_tokens_seen": 7319704, + "step": 12615 + }, + { + "epoch": 1.879654453380995, + "grad_norm": 12.229706764221191, + "learning_rate": 4.698763777182008e-05, + "loss": 0.7536, + "num_input_tokens_seen": 7322392, + "step": 12620 + }, + { + "epoch": 1.880399165921954, + "grad_norm": 5.9260663986206055, + "learning_rate": 4.7006255585344064e-05, + "loss": 0.6667, + "num_input_tokens_seen": 7325208, + "step": 12625 + }, + { + "epoch": 1.8811438784629133, + "grad_norm": 3.890897035598755, + "learning_rate": 4.702487339886804e-05, + "loss": 0.6468, + "num_input_tokens_seen": 7328120, + "step": 12630 + }, + { + "epoch": 1.8818885910038725, + "grad_norm": 7.052812099456787, + "learning_rate": 4.704349121239202e-05, + "loss": 0.5943, + "num_input_tokens_seen": 7331096, + "step": 12635 + }, + { + "epoch": 1.8826333035448317, + "grad_norm": 3.6111230850219727, + "learning_rate": 4.7062109025916e-05, + "loss": 0.502, + "num_input_tokens_seen": 7333944, + "step": 12640 + }, + { + "epoch": 1.883378016085791, + "grad_norm": 5.997236728668213, + "learning_rate": 4.708072683943998e-05, + "loss": 0.7763, + "num_input_tokens_seen": 7336824, + "step": 12645 + }, + { + "epoch": 1.8841227286267501, + "grad_norm": 4.6778459548950195, + "learning_rate": 4.7099344652963955e-05, + "loss": 0.4038, + "num_input_tokens_seen": 7339736, + "step": 12650 + }, + { + "epoch": 1.8848674411677093, + "grad_norm": 3.449641466140747, + "learning_rate": 4.7117962466487934e-05, + "loss": 0.6124, + "num_input_tokens_seen": 7342936, + "step": 12655 + }, + { + "epoch": 1.8856121537086685, + "grad_norm": 4.878323554992676, + "learning_rate": 4.713658028001192e-05, + "loss": 0.5062, + "num_input_tokens_seen": 7345752, + "step": 12660 + }, + { + "epoch": 1.8863568662496277, + "grad_norm": 6.740253925323486, + "learning_rate": 4.71551980935359e-05, + "loss": 0.5242, + "num_input_tokens_seen": 7348408, + "step": 12665 + }, + { + "epoch": 1.887101578790587, + "grad_norm": 5.260500431060791, + "learning_rate": 4.7173815907059875e-05, + "loss": 0.4223, + "num_input_tokens_seen": 7351064, + "step": 12670 + }, + { + "epoch": 1.8878462913315461, + "grad_norm": 12.136411666870117, + "learning_rate": 4.719243372058385e-05, + "loss": 0.5207, + "num_input_tokens_seen": 7353624, + "step": 12675 + }, + { + "epoch": 1.8885910038725053, + "grad_norm": 9.891522407531738, + "learning_rate": 4.721105153410784e-05, + "loss": 0.6006, + "num_input_tokens_seen": 7356376, + "step": 12680 + }, + { + "epoch": 1.8893357164134645, + "grad_norm": 7.3438215255737305, + "learning_rate": 4.7229669347631816e-05, + "loss": 0.8169, + "num_input_tokens_seen": 7359256, + "step": 12685 + }, + { + "epoch": 1.8900804289544237, + "grad_norm": 4.895998954772949, + "learning_rate": 4.7248287161155795e-05, + "loss": 0.6071, + "num_input_tokens_seen": 7362392, + "step": 12690 + }, + { + "epoch": 1.890825141495383, + "grad_norm": 6.437453269958496, + "learning_rate": 4.726690497467978e-05, + "loss": 0.7057, + "num_input_tokens_seen": 7365176, + "step": 12695 + }, + { + "epoch": 1.891569854036342, + "grad_norm": 8.169780731201172, + "learning_rate": 4.728552278820376e-05, + "loss": 0.6824, + "num_input_tokens_seen": 7368056, + "step": 12700 + }, + { + "epoch": 1.8923145665773011, + "grad_norm": 5.807445526123047, + "learning_rate": 4.7304140601727736e-05, + "loss": 0.5857, + "num_input_tokens_seen": 7371064, + "step": 12705 + }, + { + "epoch": 1.8930592791182603, + "grad_norm": 2.9792118072509766, + "learning_rate": 4.7322758415251714e-05, + "loss": 0.6971, + "num_input_tokens_seen": 7373912, + "step": 12710 + }, + { + "epoch": 1.8938039916592195, + "grad_norm": 8.401451110839844, + "learning_rate": 4.73413762287757e-05, + "loss": 0.7363, + "num_input_tokens_seen": 7376888, + "step": 12715 + }, + { + "epoch": 1.8945487042001787, + "grad_norm": 7.356899738311768, + "learning_rate": 4.735999404229968e-05, + "loss": 0.6103, + "num_input_tokens_seen": 7379736, + "step": 12720 + }, + { + "epoch": 1.895293416741138, + "grad_norm": 5.026978015899658, + "learning_rate": 4.7378611855823656e-05, + "loss": 0.6124, + "num_input_tokens_seen": 7382776, + "step": 12725 + }, + { + "epoch": 1.8960381292820971, + "grad_norm": 6.7746124267578125, + "learning_rate": 4.7397229669347634e-05, + "loss": 0.5792, + "num_input_tokens_seen": 7385912, + "step": 12730 + }, + { + "epoch": 1.8967828418230563, + "grad_norm": 5.913501739501953, + "learning_rate": 4.741584748287161e-05, + "loss": 0.6004, + "num_input_tokens_seen": 7388728, + "step": 12735 + }, + { + "epoch": 1.8975275543640155, + "grad_norm": 6.688913345336914, + "learning_rate": 4.743446529639559e-05, + "loss": 0.5955, + "num_input_tokens_seen": 7391384, + "step": 12740 + }, + { + "epoch": 1.8982722669049745, + "grad_norm": 4.768941402435303, + "learning_rate": 4.745308310991957e-05, + "loss": 0.7057, + "num_input_tokens_seen": 7394328, + "step": 12745 + }, + { + "epoch": 1.8990169794459337, + "grad_norm": 9.925533294677734, + "learning_rate": 4.7471700923443554e-05, + "loss": 0.7209, + "num_input_tokens_seen": 7397112, + "step": 12750 + }, + { + "epoch": 1.899761691986893, + "grad_norm": 8.51792049407959, + "learning_rate": 4.749031873696753e-05, + "loss": 0.6855, + "num_input_tokens_seen": 7400152, + "step": 12755 + }, + { + "epoch": 1.9005064045278521, + "grad_norm": 5.708010196685791, + "learning_rate": 4.750893655049151e-05, + "loss": 0.5572, + "num_input_tokens_seen": 7403064, + "step": 12760 + }, + { + "epoch": 1.9012511170688113, + "grad_norm": 8.990972518920898, + "learning_rate": 4.752755436401549e-05, + "loss": 0.5483, + "num_input_tokens_seen": 7405720, + "step": 12765 + }, + { + "epoch": 1.9019958296097705, + "grad_norm": 2.874333143234253, + "learning_rate": 4.7546172177539474e-05, + "loss": 0.6752, + "num_input_tokens_seen": 7408536, + "step": 12770 + }, + { + "epoch": 1.9027405421507297, + "grad_norm": 4.8823747634887695, + "learning_rate": 4.756478999106345e-05, + "loss": 0.7139, + "num_input_tokens_seen": 7411736, + "step": 12775 + }, + { + "epoch": 1.903485254691689, + "grad_norm": 4.749825954437256, + "learning_rate": 4.758340780458743e-05, + "loss": 0.5845, + "num_input_tokens_seen": 7414552, + "step": 12780 + }, + { + "epoch": 1.9042299672326481, + "grad_norm": 4.747127532958984, + "learning_rate": 4.7602025618111415e-05, + "loss": 0.6541, + "num_input_tokens_seen": 7417368, + "step": 12785 + }, + { + "epoch": 1.9049746797736073, + "grad_norm": 5.846039295196533, + "learning_rate": 4.762064343163539e-05, + "loss": 0.4357, + "num_input_tokens_seen": 7420504, + "step": 12790 + }, + { + "epoch": 1.9057193923145666, + "grad_norm": 8.718234062194824, + "learning_rate": 4.763926124515937e-05, + "loss": 0.5911, + "num_input_tokens_seen": 7423448, + "step": 12795 + }, + { + "epoch": 1.9064641048555258, + "grad_norm": 4.623010158538818, + "learning_rate": 4.765787905868335e-05, + "loss": 0.6561, + "num_input_tokens_seen": 7426328, + "step": 12800 + }, + { + "epoch": 1.907208817396485, + "grad_norm": 7.291468143463135, + "learning_rate": 4.7676496872207335e-05, + "loss": 0.7681, + "num_input_tokens_seen": 7428824, + "step": 12805 + }, + { + "epoch": 1.9079535299374442, + "grad_norm": 2.5542683601379395, + "learning_rate": 4.769511468573131e-05, + "loss": 0.6232, + "num_input_tokens_seen": 7432184, + "step": 12810 + }, + { + "epoch": 1.9086982424784034, + "grad_norm": 2.1891143321990967, + "learning_rate": 4.7713732499255284e-05, + "loss": 0.5096, + "num_input_tokens_seen": 7435160, + "step": 12815 + }, + { + "epoch": 1.9094429550193626, + "grad_norm": 7.707122802734375, + "learning_rate": 4.773235031277927e-05, + "loss": 0.6096, + "num_input_tokens_seen": 7437944, + "step": 12820 + }, + { + "epoch": 1.9101876675603218, + "grad_norm": 5.932345867156982, + "learning_rate": 4.775096812630325e-05, + "loss": 0.7202, + "num_input_tokens_seen": 7440952, + "step": 12825 + }, + { + "epoch": 1.910932380101281, + "grad_norm": 3.5268161296844482, + "learning_rate": 4.7769585939827226e-05, + "loss": 0.6289, + "num_input_tokens_seen": 7444216, + "step": 12830 + }, + { + "epoch": 1.9116770926422402, + "grad_norm": 9.117827415466309, + "learning_rate": 4.7788203753351204e-05, + "loss": 0.4934, + "num_input_tokens_seen": 7447160, + "step": 12835 + }, + { + "epoch": 1.9124218051831994, + "grad_norm": 5.451147079467773, + "learning_rate": 4.780682156687519e-05, + "loss": 0.5596, + "num_input_tokens_seen": 7450008, + "step": 12840 + }, + { + "epoch": 1.9131665177241586, + "grad_norm": 5.063117980957031, + "learning_rate": 4.782543938039917e-05, + "loss": 0.6293, + "num_input_tokens_seen": 7453048, + "step": 12845 + }, + { + "epoch": 1.9139112302651178, + "grad_norm": 6.180686950683594, + "learning_rate": 4.7844057193923146e-05, + "loss": 0.9461, + "num_input_tokens_seen": 7456152, + "step": 12850 + }, + { + "epoch": 1.914655942806077, + "grad_norm": 4.891303062438965, + "learning_rate": 4.786267500744713e-05, + "loss": 0.6697, + "num_input_tokens_seen": 7459288, + "step": 12855 + }, + { + "epoch": 1.9154006553470362, + "grad_norm": 3.9424827098846436, + "learning_rate": 4.788129282097111e-05, + "loss": 0.65, + "num_input_tokens_seen": 7462168, + "step": 12860 + }, + { + "epoch": 1.9161453678879954, + "grad_norm": 4.9055256843566895, + "learning_rate": 4.789991063449509e-05, + "loss": 0.4692, + "num_input_tokens_seen": 7464888, + "step": 12865 + }, + { + "epoch": 1.9168900804289544, + "grad_norm": 2.540562868118286, + "learning_rate": 4.7918528448019065e-05, + "loss": 0.5202, + "num_input_tokens_seen": 7468120, + "step": 12870 + }, + { + "epoch": 1.9176347929699136, + "grad_norm": 3.446700096130371, + "learning_rate": 4.793714626154305e-05, + "loss": 0.6426, + "num_input_tokens_seen": 7471288, + "step": 12875 + }, + { + "epoch": 1.9183795055108728, + "grad_norm": 6.529088020324707, + "learning_rate": 4.795576407506703e-05, + "loss": 0.5848, + "num_input_tokens_seen": 7474520, + "step": 12880 + }, + { + "epoch": 1.919124218051832, + "grad_norm": 3.6026418209075928, + "learning_rate": 4.797438188859101e-05, + "loss": 0.6589, + "num_input_tokens_seen": 7477240, + "step": 12885 + }, + { + "epoch": 1.9198689305927912, + "grad_norm": 8.842243194580078, + "learning_rate": 4.7992999702114985e-05, + "loss": 0.6315, + "num_input_tokens_seen": 7480184, + "step": 12890 + }, + { + "epoch": 1.9206136431337504, + "grad_norm": 6.886911392211914, + "learning_rate": 4.801161751563897e-05, + "loss": 0.7881, + "num_input_tokens_seen": 7483128, + "step": 12895 + }, + { + "epoch": 1.9213583556747096, + "grad_norm": 4.373748302459717, + "learning_rate": 4.803023532916295e-05, + "loss": 0.7107, + "num_input_tokens_seen": 7486264, + "step": 12900 + }, + { + "epoch": 1.9221030682156688, + "grad_norm": 4.885788440704346, + "learning_rate": 4.804885314268692e-05, + "loss": 0.6127, + "num_input_tokens_seen": 7488984, + "step": 12905 + }, + { + "epoch": 1.922847780756628, + "grad_norm": 5.418909072875977, + "learning_rate": 4.8067470956210905e-05, + "loss": 0.5957, + "num_input_tokens_seen": 7491576, + "step": 12910 + }, + { + "epoch": 1.923592493297587, + "grad_norm": 3.2288200855255127, + "learning_rate": 4.808608876973488e-05, + "loss": 0.5693, + "num_input_tokens_seen": 7494616, + "step": 12915 + }, + { + "epoch": 1.9243372058385462, + "grad_norm": 6.804723262786865, + "learning_rate": 4.810470658325886e-05, + "loss": 0.6218, + "num_input_tokens_seen": 7497144, + "step": 12920 + }, + { + "epoch": 1.9250819183795054, + "grad_norm": 5.268540859222412, + "learning_rate": 4.8123324396782846e-05, + "loss": 0.5075, + "num_input_tokens_seen": 7499896, + "step": 12925 + }, + { + "epoch": 1.9258266309204646, + "grad_norm": 9.802830696105957, + "learning_rate": 4.8141942210306824e-05, + "loss": 0.5109, + "num_input_tokens_seen": 7502680, + "step": 12930 + }, + { + "epoch": 1.9265713434614238, + "grad_norm": 6.123415470123291, + "learning_rate": 4.81605600238308e-05, + "loss": 0.5882, + "num_input_tokens_seen": 7505432, + "step": 12935 + }, + { + "epoch": 1.927316056002383, + "grad_norm": 5.500946998596191, + "learning_rate": 4.817917783735478e-05, + "loss": 0.4473, + "num_input_tokens_seen": 7508376, + "step": 12940 + }, + { + "epoch": 1.9280607685433422, + "grad_norm": 6.891738414764404, + "learning_rate": 4.8197795650878766e-05, + "loss": 0.404, + "num_input_tokens_seen": 7511384, + "step": 12945 + }, + { + "epoch": 1.9288054810843014, + "grad_norm": 11.126585960388184, + "learning_rate": 4.8216413464402744e-05, + "loss": 0.7379, + "num_input_tokens_seen": 7514360, + "step": 12950 + }, + { + "epoch": 1.9295501936252606, + "grad_norm": 5.634708881378174, + "learning_rate": 4.823503127792672e-05, + "loss": 0.6019, + "num_input_tokens_seen": 7517176, + "step": 12955 + }, + { + "epoch": 1.9302949061662198, + "grad_norm": 5.925952911376953, + "learning_rate": 4.82536490914507e-05, + "loss": 0.5757, + "num_input_tokens_seen": 7520120, + "step": 12960 + }, + { + "epoch": 1.931039618707179, + "grad_norm": 5.480795860290527, + "learning_rate": 4.8272266904974686e-05, + "loss": 0.6853, + "num_input_tokens_seen": 7523224, + "step": 12965 + }, + { + "epoch": 1.9317843312481382, + "grad_norm": 4.197751998901367, + "learning_rate": 4.8290884718498664e-05, + "loss": 0.5657, + "num_input_tokens_seen": 7525752, + "step": 12970 + }, + { + "epoch": 1.9325290437890974, + "grad_norm": 4.8223700523376465, + "learning_rate": 4.830950253202264e-05, + "loss": 0.7637, + "num_input_tokens_seen": 7528504, + "step": 12975 + }, + { + "epoch": 1.9332737563300566, + "grad_norm": 6.754553318023682, + "learning_rate": 4.832812034554662e-05, + "loss": 0.6767, + "num_input_tokens_seen": 7531416, + "step": 12980 + }, + { + "epoch": 1.9340184688710158, + "grad_norm": 2.5686800479888916, + "learning_rate": 4.8346738159070605e-05, + "loss": 0.4565, + "num_input_tokens_seen": 7534200, + "step": 12985 + }, + { + "epoch": 1.934763181411975, + "grad_norm": 9.872787475585938, + "learning_rate": 4.836535597259458e-05, + "loss": 0.5603, + "num_input_tokens_seen": 7537112, + "step": 12990 + }, + { + "epoch": 1.9355078939529342, + "grad_norm": 8.892083168029785, + "learning_rate": 4.8383973786118555e-05, + "loss": 0.6282, + "num_input_tokens_seen": 7539704, + "step": 12995 + }, + { + "epoch": 1.9362526064938934, + "grad_norm": 8.511700630187988, + "learning_rate": 4.840259159964254e-05, + "loss": 0.5157, + "num_input_tokens_seen": 7542456, + "step": 13000 + }, + { + "epoch": 1.9369973190348526, + "grad_norm": 9.817156791687012, + "learning_rate": 4.842120941316652e-05, + "loss": 0.6698, + "num_input_tokens_seen": 7545336, + "step": 13005 + }, + { + "epoch": 1.9377420315758118, + "grad_norm": 3.221287965774536, + "learning_rate": 4.8439827226690496e-05, + "loss": 0.6438, + "num_input_tokens_seen": 7548248, + "step": 13010 + }, + { + "epoch": 1.938486744116771, + "grad_norm": 3.9870805740356445, + "learning_rate": 4.845844504021448e-05, + "loss": 0.5749, + "num_input_tokens_seen": 7551192, + "step": 13015 + }, + { + "epoch": 1.9392314566577302, + "grad_norm": 6.194468021392822, + "learning_rate": 4.847706285373846e-05, + "loss": 0.7545, + "num_input_tokens_seen": 7554328, + "step": 13020 + }, + { + "epoch": 1.9399761691986894, + "grad_norm": 5.72884464263916, + "learning_rate": 4.849568066726244e-05, + "loss": 0.5434, + "num_input_tokens_seen": 7557432, + "step": 13025 + }, + { + "epoch": 1.9407208817396486, + "grad_norm": 5.1958537101745605, + "learning_rate": 4.8514298480786416e-05, + "loss": 0.82, + "num_input_tokens_seen": 7560504, + "step": 13030 + }, + { + "epoch": 1.9414655942806078, + "grad_norm": 3.133334159851074, + "learning_rate": 4.85329162943104e-05, + "loss": 0.5635, + "num_input_tokens_seen": 7563128, + "step": 13035 + }, + { + "epoch": 1.942210306821567, + "grad_norm": 4.409267425537109, + "learning_rate": 4.855153410783438e-05, + "loss": 0.624, + "num_input_tokens_seen": 7566232, + "step": 13040 + }, + { + "epoch": 1.942955019362526, + "grad_norm": 5.628040790557861, + "learning_rate": 4.857015192135836e-05, + "loss": 0.6954, + "num_input_tokens_seen": 7569400, + "step": 13045 + }, + { + "epoch": 1.9436997319034852, + "grad_norm": 3.5552353858947754, + "learning_rate": 4.8588769734882336e-05, + "loss": 0.7528, + "num_input_tokens_seen": 7572376, + "step": 13050 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 8.057907104492188, + "learning_rate": 4.860738754840632e-05, + "loss": 0.6168, + "num_input_tokens_seen": 7575160, + "step": 13055 + }, + { + "epoch": 1.9451891569854036, + "grad_norm": 6.746297359466553, + "learning_rate": 4.86260053619303e-05, + "loss": 0.5561, + "num_input_tokens_seen": 7578072, + "step": 13060 + }, + { + "epoch": 1.9459338695263628, + "grad_norm": 4.203850746154785, + "learning_rate": 4.864462317545428e-05, + "loss": 0.5328, + "num_input_tokens_seen": 7580696, + "step": 13065 + }, + { + "epoch": 1.946678582067322, + "grad_norm": 4.422901630401611, + "learning_rate": 4.866324098897826e-05, + "loss": 0.718, + "num_input_tokens_seen": 7583640, + "step": 13070 + }, + { + "epoch": 1.9474232946082812, + "grad_norm": 11.687323570251465, + "learning_rate": 4.8681858802502234e-05, + "loss": 0.5975, + "num_input_tokens_seen": 7586616, + "step": 13075 + }, + { + "epoch": 1.9481680071492404, + "grad_norm": 3.1808347702026367, + "learning_rate": 4.870047661602621e-05, + "loss": 0.5436, + "num_input_tokens_seen": 7589304, + "step": 13080 + }, + { + "epoch": 1.9489127196901996, + "grad_norm": 3.196286916732788, + "learning_rate": 4.87190944295502e-05, + "loss": 0.5542, + "num_input_tokens_seen": 7592280, + "step": 13085 + }, + { + "epoch": 1.9496574322311586, + "grad_norm": 6.048326015472412, + "learning_rate": 4.8737712243074175e-05, + "loss": 0.7229, + "num_input_tokens_seen": 7595352, + "step": 13090 + }, + { + "epoch": 1.9504021447721178, + "grad_norm": 2.4612624645233154, + "learning_rate": 4.8756330056598153e-05, + "loss": 0.5093, + "num_input_tokens_seen": 7598040, + "step": 13095 + }, + { + "epoch": 1.951146857313077, + "grad_norm": 12.80179500579834, + "learning_rate": 4.877494787012213e-05, + "loss": 0.8161, + "num_input_tokens_seen": 7600760, + "step": 13100 + }, + { + "epoch": 1.9518915698540362, + "grad_norm": 6.257133483886719, + "learning_rate": 4.879356568364612e-05, + "loss": 0.5985, + "num_input_tokens_seen": 7603736, + "step": 13105 + }, + { + "epoch": 1.9526362823949954, + "grad_norm": 5.857337951660156, + "learning_rate": 4.8812183497170095e-05, + "loss": 0.514, + "num_input_tokens_seen": 7607000, + "step": 13110 + }, + { + "epoch": 1.9533809949359546, + "grad_norm": 5.35329532623291, + "learning_rate": 4.883080131069407e-05, + "loss": 0.5739, + "num_input_tokens_seen": 7609752, + "step": 13115 + }, + { + "epoch": 1.9541257074769138, + "grad_norm": 5.023961067199707, + "learning_rate": 4.884941912421805e-05, + "loss": 0.5333, + "num_input_tokens_seen": 7612600, + "step": 13120 + }, + { + "epoch": 1.954870420017873, + "grad_norm": 6.453316688537598, + "learning_rate": 4.8868036937742036e-05, + "loss": 0.9204, + "num_input_tokens_seen": 7615480, + "step": 13125 + }, + { + "epoch": 1.9556151325588322, + "grad_norm": 4.639744281768799, + "learning_rate": 4.8886654751266015e-05, + "loss": 0.6841, + "num_input_tokens_seen": 7618488, + "step": 13130 + }, + { + "epoch": 1.9563598450997914, + "grad_norm": 8.783773422241211, + "learning_rate": 4.890527256478999e-05, + "loss": 0.5669, + "num_input_tokens_seen": 7621304, + "step": 13135 + }, + { + "epoch": 1.9571045576407506, + "grad_norm": 5.744968891143799, + "learning_rate": 4.892389037831398e-05, + "loss": 0.6504, + "num_input_tokens_seen": 7624184, + "step": 13140 + }, + { + "epoch": 1.9578492701817098, + "grad_norm": 4.226619720458984, + "learning_rate": 4.8942508191837956e-05, + "loss": 0.7156, + "num_input_tokens_seen": 7626936, + "step": 13145 + }, + { + "epoch": 1.958593982722669, + "grad_norm": 4.257082939147949, + "learning_rate": 4.8961126005361934e-05, + "loss": 0.5164, + "num_input_tokens_seen": 7629752, + "step": 13150 + }, + { + "epoch": 1.9593386952636282, + "grad_norm": 5.965811729431152, + "learning_rate": 4.897974381888591e-05, + "loss": 0.5608, + "num_input_tokens_seen": 7632696, + "step": 13155 + }, + { + "epoch": 1.9600834078045875, + "grad_norm": 3.0770561695098877, + "learning_rate": 4.89983616324099e-05, + "loss": 0.4029, + "num_input_tokens_seen": 7635480, + "step": 13160 + }, + { + "epoch": 1.9608281203455467, + "grad_norm": 3.943382740020752, + "learning_rate": 4.901697944593387e-05, + "loss": 0.5647, + "num_input_tokens_seen": 7638520, + "step": 13165 + }, + { + "epoch": 1.9615728328865059, + "grad_norm": 8.189173698425293, + "learning_rate": 4.903559725945785e-05, + "loss": 0.4854, + "num_input_tokens_seen": 7641464, + "step": 13170 + }, + { + "epoch": 1.962317545427465, + "grad_norm": 3.4897220134735107, + "learning_rate": 4.905421507298183e-05, + "loss": 0.5345, + "num_input_tokens_seen": 7644280, + "step": 13175 + }, + { + "epoch": 1.9630622579684243, + "grad_norm": 5.181923866271973, + "learning_rate": 4.907283288650581e-05, + "loss": 0.6163, + "num_input_tokens_seen": 7646968, + "step": 13180 + }, + { + "epoch": 1.9638069705093835, + "grad_norm": 9.744804382324219, + "learning_rate": 4.909145070002979e-05, + "loss": 0.6928, + "num_input_tokens_seen": 7649816, + "step": 13185 + }, + { + "epoch": 1.9645516830503427, + "grad_norm": 4.926015853881836, + "learning_rate": 4.911006851355377e-05, + "loss": 0.5953, + "num_input_tokens_seen": 7653048, + "step": 13190 + }, + { + "epoch": 1.9652963955913019, + "grad_norm": 5.950814247131348, + "learning_rate": 4.912868632707775e-05, + "loss": 0.6075, + "num_input_tokens_seen": 7655992, + "step": 13195 + }, + { + "epoch": 1.966041108132261, + "grad_norm": 12.539837837219238, + "learning_rate": 4.914730414060173e-05, + "loss": 0.682, + "num_input_tokens_seen": 7658840, + "step": 13200 + }, + { + "epoch": 1.9667858206732203, + "grad_norm": 9.15200424194336, + "learning_rate": 4.916592195412571e-05, + "loss": 0.5215, + "num_input_tokens_seen": 7661912, + "step": 13205 + }, + { + "epoch": 1.9675305332141795, + "grad_norm": 8.336627006530762, + "learning_rate": 4.918453976764969e-05, + "loss": 0.5665, + "num_input_tokens_seen": 7664792, + "step": 13210 + }, + { + "epoch": 1.9682752457551387, + "grad_norm": 4.388415336608887, + "learning_rate": 4.920315758117367e-05, + "loss": 0.3267, + "num_input_tokens_seen": 7667448, + "step": 13215 + }, + { + "epoch": 1.9690199582960977, + "grad_norm": 18.75495719909668, + "learning_rate": 4.922177539469765e-05, + "loss": 0.7693, + "num_input_tokens_seen": 7670232, + "step": 13220 + }, + { + "epoch": 1.9697646708370569, + "grad_norm": 10.72925853729248, + "learning_rate": 4.924039320822163e-05, + "loss": 1.0, + "num_input_tokens_seen": 7672984, + "step": 13225 + }, + { + "epoch": 1.970509383378016, + "grad_norm": 20.4044132232666, + "learning_rate": 4.925901102174561e-05, + "loss": 0.5418, + "num_input_tokens_seen": 7675704, + "step": 13230 + }, + { + "epoch": 1.9712540959189753, + "grad_norm": 7.6132049560546875, + "learning_rate": 4.927762883526959e-05, + "loss": 0.6964, + "num_input_tokens_seen": 7679000, + "step": 13235 + }, + { + "epoch": 1.9719988084599345, + "grad_norm": 5.980737686157227, + "learning_rate": 4.929624664879357e-05, + "loss": 0.6191, + "num_input_tokens_seen": 7681880, + "step": 13240 + }, + { + "epoch": 1.9727435210008937, + "grad_norm": 4.3771891593933105, + "learning_rate": 4.931486446231755e-05, + "loss": 0.6844, + "num_input_tokens_seen": 7684952, + "step": 13245 + }, + { + "epoch": 1.9734882335418529, + "grad_norm": 4.212878704071045, + "learning_rate": 4.9333482275841526e-05, + "loss": 0.5967, + "num_input_tokens_seen": 7687768, + "step": 13250 + }, + { + "epoch": 1.974232946082812, + "grad_norm": 5.57409143447876, + "learning_rate": 4.9352100089365504e-05, + "loss": 0.6295, + "num_input_tokens_seen": 7690680, + "step": 13255 + }, + { + "epoch": 1.974977658623771, + "grad_norm": 3.19882869720459, + "learning_rate": 4.937071790288948e-05, + "loss": 0.582, + "num_input_tokens_seen": 7693560, + "step": 13260 + }, + { + "epoch": 1.9757223711647303, + "grad_norm": 5.860116481781006, + "learning_rate": 4.938933571641347e-05, + "loss": 0.656, + "num_input_tokens_seen": 7696536, + "step": 13265 + }, + { + "epoch": 1.9764670837056895, + "grad_norm": 4.772298336029053, + "learning_rate": 4.9407953529937446e-05, + "loss": 0.7168, + "num_input_tokens_seen": 7699224, + "step": 13270 + }, + { + "epoch": 1.9772117962466487, + "grad_norm": 3.313297748565674, + "learning_rate": 4.9426571343461424e-05, + "loss": 0.5156, + "num_input_tokens_seen": 7702648, + "step": 13275 + }, + { + "epoch": 1.9779565087876079, + "grad_norm": 6.6272053718566895, + "learning_rate": 4.94451891569854e-05, + "loss": 0.5347, + "num_input_tokens_seen": 7705528, + "step": 13280 + }, + { + "epoch": 1.978701221328567, + "grad_norm": 6.899621486663818, + "learning_rate": 4.946380697050939e-05, + "loss": 0.6204, + "num_input_tokens_seen": 7708376, + "step": 13285 + }, + { + "epoch": 1.9794459338695263, + "grad_norm": 9.009224891662598, + "learning_rate": 4.9482424784033366e-05, + "loss": 0.5656, + "num_input_tokens_seen": 7711256, + "step": 13290 + }, + { + "epoch": 1.9801906464104855, + "grad_norm": 5.331570148468018, + "learning_rate": 4.9501042597557344e-05, + "loss": 0.7629, + "num_input_tokens_seen": 7714360, + "step": 13295 + }, + { + "epoch": 1.9809353589514447, + "grad_norm": 7.173440933227539, + "learning_rate": 4.951966041108133e-05, + "loss": 0.6549, + "num_input_tokens_seen": 7717016, + "step": 13300 + }, + { + "epoch": 1.9816800714924039, + "grad_norm": 5.156104564666748, + "learning_rate": 4.953827822460531e-05, + "loss": 0.5926, + "num_input_tokens_seen": 7720440, + "step": 13305 + }, + { + "epoch": 1.982424784033363, + "grad_norm": 7.400082111358643, + "learning_rate": 4.9556896038129285e-05, + "loss": 0.659, + "num_input_tokens_seen": 7723672, + "step": 13310 + }, + { + "epoch": 1.9831694965743223, + "grad_norm": 7.21546745300293, + "learning_rate": 4.9575513851653263e-05, + "loss": 0.7822, + "num_input_tokens_seen": 7726360, + "step": 13315 + }, + { + "epoch": 1.9839142091152815, + "grad_norm": 8.031098365783691, + "learning_rate": 4.959413166517725e-05, + "loss": 0.5569, + "num_input_tokens_seen": 7729016, + "step": 13320 + }, + { + "epoch": 1.9846589216562407, + "grad_norm": 5.952142715454102, + "learning_rate": 4.961274947870123e-05, + "loss": 0.5945, + "num_input_tokens_seen": 7731864, + "step": 13325 + }, + { + "epoch": 1.9854036341972, + "grad_norm": 9.842682838439941, + "learning_rate": 4.9631367292225205e-05, + "loss": 0.6367, + "num_input_tokens_seen": 7734552, + "step": 13330 + }, + { + "epoch": 1.986148346738159, + "grad_norm": 7.957555294036865, + "learning_rate": 4.964998510574918e-05, + "loss": 0.7687, + "num_input_tokens_seen": 7737528, + "step": 13335 + }, + { + "epoch": 1.9868930592791183, + "grad_norm": 4.062140941619873, + "learning_rate": 4.966860291927316e-05, + "loss": 0.6367, + "num_input_tokens_seen": 7740568, + "step": 13340 + }, + { + "epoch": 1.9876377718200775, + "grad_norm": 7.076771259307861, + "learning_rate": 4.968722073279714e-05, + "loss": 0.6744, + "num_input_tokens_seen": 7743704, + "step": 13345 + }, + { + "epoch": 1.9883824843610367, + "grad_norm": 3.6759743690490723, + "learning_rate": 4.970583854632112e-05, + "loss": 0.5306, + "num_input_tokens_seen": 7746648, + "step": 13350 + }, + { + "epoch": 1.989127196901996, + "grad_norm": 5.929739952087402, + "learning_rate": 4.97244563598451e-05, + "loss": 0.4882, + "num_input_tokens_seen": 7749624, + "step": 13355 + }, + { + "epoch": 1.9898719094429551, + "grad_norm": 9.313572883605957, + "learning_rate": 4.974307417336908e-05, + "loss": 0.8207, + "num_input_tokens_seen": 7752184, + "step": 13360 + }, + { + "epoch": 1.9906166219839143, + "grad_norm": 4.068609237670898, + "learning_rate": 4.976169198689306e-05, + "loss": 0.561, + "num_input_tokens_seen": 7754936, + "step": 13365 + }, + { + "epoch": 1.9913613345248735, + "grad_norm": 6.355746269226074, + "learning_rate": 4.978030980041704e-05, + "loss": 0.6854, + "num_input_tokens_seen": 7757688, + "step": 13370 + }, + { + "epoch": 1.9921060470658327, + "grad_norm": 5.646906852722168, + "learning_rate": 4.979892761394102e-05, + "loss": 0.5496, + "num_input_tokens_seen": 7760472, + "step": 13375 + }, + { + "epoch": 1.992850759606792, + "grad_norm": 5.956446647644043, + "learning_rate": 4.9817545427465e-05, + "loss": 0.6824, + "num_input_tokens_seen": 7763576, + "step": 13380 + }, + { + "epoch": 1.9935954721477511, + "grad_norm": 6.352950096130371, + "learning_rate": 4.983616324098898e-05, + "loss": 0.6054, + "num_input_tokens_seen": 7766488, + "step": 13385 + }, + { + "epoch": 1.99434018468871, + "grad_norm": 4.378606796264648, + "learning_rate": 4.9854781054512964e-05, + "loss": 0.5929, + "num_input_tokens_seen": 7769432, + "step": 13390 + }, + { + "epoch": 1.9950848972296693, + "grad_norm": 6.738806247711182, + "learning_rate": 4.987339886803694e-05, + "loss": 0.6207, + "num_input_tokens_seen": 7771992, + "step": 13395 + }, + { + "epoch": 1.9958296097706285, + "grad_norm": 7.701402187347412, + "learning_rate": 4.989201668156092e-05, + "loss": 0.6077, + "num_input_tokens_seen": 7774808, + "step": 13400 + }, + { + "epoch": 1.9965743223115877, + "grad_norm": 4.60798978805542, + "learning_rate": 4.99106344950849e-05, + "loss": 0.5663, + "num_input_tokens_seen": 7777624, + "step": 13405 + }, + { + "epoch": 1.997319034852547, + "grad_norm": 6.352741718292236, + "learning_rate": 4.9929252308608884e-05, + "loss": 0.5525, + "num_input_tokens_seen": 7780696, + "step": 13410 + }, + { + "epoch": 1.9980637473935061, + "grad_norm": 5.918435096740723, + "learning_rate": 4.994787012213286e-05, + "loss": 0.5793, + "num_input_tokens_seen": 7783736, + "step": 13415 + }, + { + "epoch": 1.9988084599344653, + "grad_norm": 9.411706924438477, + "learning_rate": 4.996648793565684e-05, + "loss": 0.8297, + "num_input_tokens_seen": 7786520, + "step": 13420 + }, + { + "epoch": 1.9995531724754245, + "grad_norm": 6.208799839019775, + "learning_rate": 4.998510574918082e-05, + "loss": 0.6586, + "num_input_tokens_seen": 7789368, + "step": 13425 + }, + { + "epoch": 2.0, + "eval_loss": 0.6468682885169983, + "eval_runtime": 51.2693, + "eval_samples_per_second": 58.203, + "eval_steps_per_second": 14.551, + "num_input_tokens_seen": 7790784, + "step": 13428 + }, + { + "epoch": 2.0002978850163835, + "grad_norm": 4.061120510101318, + "learning_rate": 4.999999999155301e-05, + "loss": 0.6472, + "num_input_tokens_seen": 7792000, + "step": 13430 + }, + { + "epoch": 2.0010425975573427, + "grad_norm": 3.0161781311035156, + "learning_rate": 4.9999999695908296e-05, + "loss": 0.5689, + "num_input_tokens_seen": 7794848, + "step": 13435 + }, + { + "epoch": 2.001787310098302, + "grad_norm": 4.787919998168945, + "learning_rate": 4.9999998977913995e-05, + "loss": 0.6092, + "num_input_tokens_seen": 7797760, + "step": 13440 + }, + { + "epoch": 2.002532022639261, + "grad_norm": 4.2400054931640625, + "learning_rate": 4.999999783757012e-05, + "loss": 0.5812, + "num_input_tokens_seen": 7800896, + "step": 13445 + }, + { + "epoch": 2.0032767351802203, + "grad_norm": 7.234805107116699, + "learning_rate": 4.999999627487669e-05, + "loss": 0.558, + "num_input_tokens_seen": 7803936, + "step": 13450 + }, + { + "epoch": 2.0040214477211795, + "grad_norm": 6.776047706604004, + "learning_rate": 4.999999428983374e-05, + "loss": 0.7003, + "num_input_tokens_seen": 7806784, + "step": 13455 + }, + { + "epoch": 2.0047661602621387, + "grad_norm": 5.051130294799805, + "learning_rate": 4.999999188244129e-05, + "loss": 0.4599, + "num_input_tokens_seen": 7809664, + "step": 13460 + }, + { + "epoch": 2.005510872803098, + "grad_norm": 11.780901908874512, + "learning_rate": 4.999998905269938e-05, + "loss": 0.6073, + "num_input_tokens_seen": 7812448, + "step": 13465 + }, + { + "epoch": 2.006255585344057, + "grad_norm": 5.338559627532959, + "learning_rate": 4.9999985800608076e-05, + "loss": 0.4255, + "num_input_tokens_seen": 7815456, + "step": 13470 + }, + { + "epoch": 2.0070002978850163, + "grad_norm": 6.358265399932861, + "learning_rate": 4.9999982126167414e-05, + "loss": 0.6, + "num_input_tokens_seen": 7818432, + "step": 13475 + }, + { + "epoch": 2.0077450104259755, + "grad_norm": 3.8529622554779053, + "learning_rate": 4.9999978029377456e-05, + "loss": 0.5987, + "num_input_tokens_seen": 7821152, + "step": 13480 + }, + { + "epoch": 2.0084897229669347, + "grad_norm": 5.649214267730713, + "learning_rate": 4.9999973510238284e-05, + "loss": 0.5339, + "num_input_tokens_seen": 7824288, + "step": 13485 + }, + { + "epoch": 2.009234435507894, + "grad_norm": 5.218984127044678, + "learning_rate": 4.999996856874997e-05, + "loss": 0.4667, + "num_input_tokens_seen": 7827136, + "step": 13490 + }, + { + "epoch": 2.009979148048853, + "grad_norm": 4.291970729827881, + "learning_rate": 4.99999632049126e-05, + "loss": 0.409, + "num_input_tokens_seen": 7829984, + "step": 13495 + }, + { + "epoch": 2.0107238605898123, + "grad_norm": 9.29696273803711, + "learning_rate": 4.999995741872625e-05, + "loss": 0.4809, + "num_input_tokens_seen": 7832928, + "step": 13500 + }, + { + "epoch": 2.0114685731307715, + "grad_norm": 5.088766098022461, + "learning_rate": 4.999995121019103e-05, + "loss": 0.6762, + "num_input_tokens_seen": 7835872, + "step": 13505 + }, + { + "epoch": 2.0122132856717307, + "grad_norm": 10.890259742736816, + "learning_rate": 4.999994457930705e-05, + "loss": 0.6884, + "num_input_tokens_seen": 7838784, + "step": 13510 + }, + { + "epoch": 2.01295799821269, + "grad_norm": 7.353583812713623, + "learning_rate": 4.999993752607441e-05, + "loss": 0.5719, + "num_input_tokens_seen": 7841792, + "step": 13515 + }, + { + "epoch": 2.013702710753649, + "grad_norm": 5.768372058868408, + "learning_rate": 4.999993005049324e-05, + "loss": 0.6149, + "num_input_tokens_seen": 7844768, + "step": 13520 + }, + { + "epoch": 2.0144474232946084, + "grad_norm": 5.792473316192627, + "learning_rate": 4.999992215256365e-05, + "loss": 0.5755, + "num_input_tokens_seen": 7848224, + "step": 13525 + }, + { + "epoch": 2.0151921358355676, + "grad_norm": 6.082853317260742, + "learning_rate": 4.9999913832285796e-05, + "loss": 0.4102, + "num_input_tokens_seen": 7851072, + "step": 13530 + }, + { + "epoch": 2.0159368483765268, + "grad_norm": 4.5590972900390625, + "learning_rate": 4.99999050896598e-05, + "loss": 0.399, + "num_input_tokens_seen": 7853984, + "step": 13535 + }, + { + "epoch": 2.016681560917486, + "grad_norm": 6.10585880279541, + "learning_rate": 4.999989592468582e-05, + "loss": 0.5727, + "num_input_tokens_seen": 7856640, + "step": 13540 + }, + { + "epoch": 2.017426273458445, + "grad_norm": 3.914299488067627, + "learning_rate": 4.9999886337364004e-05, + "loss": 0.4748, + "num_input_tokens_seen": 7859392, + "step": 13545 + }, + { + "epoch": 2.0181709859994044, + "grad_norm": 3.9800519943237305, + "learning_rate": 4.999987632769452e-05, + "loss": 0.3364, + "num_input_tokens_seen": 7862016, + "step": 13550 + }, + { + "epoch": 2.0189156985403636, + "grad_norm": 26.396020889282227, + "learning_rate": 4.9999865895677534e-05, + "loss": 0.3826, + "num_input_tokens_seen": 7864640, + "step": 13555 + }, + { + "epoch": 2.0196604110813228, + "grad_norm": 7.3549885749816895, + "learning_rate": 4.999985504131322e-05, + "loss": 0.5028, + "num_input_tokens_seen": 7867488, + "step": 13560 + }, + { + "epoch": 2.020405123622282, + "grad_norm": 10.1882963180542, + "learning_rate": 4.999984376460176e-05, + "loss": 0.7014, + "num_input_tokens_seen": 7870496, + "step": 13565 + }, + { + "epoch": 2.021149836163241, + "grad_norm": 9.880688667297363, + "learning_rate": 4.999983206554335e-05, + "loss": 0.5356, + "num_input_tokens_seen": 7873344, + "step": 13570 + }, + { + "epoch": 2.0218945487042004, + "grad_norm": 11.975808143615723, + "learning_rate": 4.9999819944138194e-05, + "loss": 0.8088, + "num_input_tokens_seen": 7876512, + "step": 13575 + }, + { + "epoch": 2.0226392612451596, + "grad_norm": 4.733309745788574, + "learning_rate": 4.999980740038648e-05, + "loss": 0.4485, + "num_input_tokens_seen": 7879360, + "step": 13580 + }, + { + "epoch": 2.0233839737861183, + "grad_norm": 22.258588790893555, + "learning_rate": 4.9999794434288434e-05, + "loss": 0.4871, + "num_input_tokens_seen": 7881792, + "step": 13585 + }, + { + "epoch": 2.0241286863270775, + "grad_norm": 7.1815080642700195, + "learning_rate": 4.9999781045844266e-05, + "loss": 0.5042, + "num_input_tokens_seen": 7884960, + "step": 13590 + }, + { + "epoch": 2.0248733988680367, + "grad_norm": 7.118896961212158, + "learning_rate": 4.999976723505421e-05, + "loss": 0.594, + "num_input_tokens_seen": 7887776, + "step": 13595 + }, + { + "epoch": 2.025618111408996, + "grad_norm": 4.515633583068848, + "learning_rate": 4.999975300191849e-05, + "loss": 0.5689, + "num_input_tokens_seen": 7890880, + "step": 13600 + }, + { + "epoch": 2.026362823949955, + "grad_norm": 10.145580291748047, + "learning_rate": 4.9999738346437355e-05, + "loss": 0.742, + "num_input_tokens_seen": 7894336, + "step": 13605 + }, + { + "epoch": 2.0271075364909144, + "grad_norm": 15.129510879516602, + "learning_rate": 4.9999723268611046e-05, + "loss": 0.6746, + "num_input_tokens_seen": 7897664, + "step": 13610 + }, + { + "epoch": 2.0278522490318736, + "grad_norm": 5.618595600128174, + "learning_rate": 4.9999707768439824e-05, + "loss": 0.4312, + "num_input_tokens_seen": 7900544, + "step": 13615 + }, + { + "epoch": 2.0285969615728328, + "grad_norm": 5.363778114318848, + "learning_rate": 4.999969184592395e-05, + "loss": 0.4848, + "num_input_tokens_seen": 7903392, + "step": 13620 + }, + { + "epoch": 2.029341674113792, + "grad_norm": 4.3231611251831055, + "learning_rate": 4.999967550106368e-05, + "loss": 0.4994, + "num_input_tokens_seen": 7906336, + "step": 13625 + }, + { + "epoch": 2.030086386654751, + "grad_norm": 10.734672546386719, + "learning_rate": 4.999965873385931e-05, + "loss": 0.5116, + "num_input_tokens_seen": 7909088, + "step": 13630 + }, + { + "epoch": 2.0308310991957104, + "grad_norm": 11.522080421447754, + "learning_rate": 4.999964154431112e-05, + "loss": 0.4488, + "num_input_tokens_seen": 7912000, + "step": 13635 + }, + { + "epoch": 2.0315758117366696, + "grad_norm": 6.0150957107543945, + "learning_rate": 4.999962393241938e-05, + "loss": 0.4925, + "num_input_tokens_seen": 7915072, + "step": 13640 + }, + { + "epoch": 2.0323205242776288, + "grad_norm": 8.220385551452637, + "learning_rate": 4.999960589818441e-05, + "loss": 0.5701, + "num_input_tokens_seen": 7917952, + "step": 13645 + }, + { + "epoch": 2.033065236818588, + "grad_norm": 13.579400062561035, + "learning_rate": 4.999958744160651e-05, + "loss": 0.4907, + "num_input_tokens_seen": 7920704, + "step": 13650 + }, + { + "epoch": 2.033809949359547, + "grad_norm": 10.365190505981445, + "learning_rate": 4.9999568562685986e-05, + "loss": 0.607, + "num_input_tokens_seen": 7923392, + "step": 13655 + }, + { + "epoch": 2.0345546619005064, + "grad_norm": 11.359854698181152, + "learning_rate": 4.999954926142316e-05, + "loss": 0.57, + "num_input_tokens_seen": 7926112, + "step": 13660 + }, + { + "epoch": 2.0352993744414656, + "grad_norm": 4.834968566894531, + "learning_rate": 4.999952953781836e-05, + "loss": 0.5736, + "num_input_tokens_seen": 7929472, + "step": 13665 + }, + { + "epoch": 2.036044086982425, + "grad_norm": 5.824040412902832, + "learning_rate": 4.9999509391871905e-05, + "loss": 0.4719, + "num_input_tokens_seen": 7932224, + "step": 13670 + }, + { + "epoch": 2.036788799523384, + "grad_norm": 16.30296516418457, + "learning_rate": 4.999948882358416e-05, + "loss": 0.4697, + "num_input_tokens_seen": 7934944, + "step": 13675 + }, + { + "epoch": 2.037533512064343, + "grad_norm": 4.665271759033203, + "learning_rate": 4.9999467832955454e-05, + "loss": 0.5861, + "num_input_tokens_seen": 7937920, + "step": 13680 + }, + { + "epoch": 2.0382782246053024, + "grad_norm": 12.73347282409668, + "learning_rate": 4.999944641998615e-05, + "loss": 0.4534, + "num_input_tokens_seen": 7940832, + "step": 13685 + }, + { + "epoch": 2.0390229371462616, + "grad_norm": 11.936064720153809, + "learning_rate": 4.99994245846766e-05, + "loss": 0.5238, + "num_input_tokens_seen": 7944192, + "step": 13690 + }, + { + "epoch": 2.039767649687221, + "grad_norm": 14.407795906066895, + "learning_rate": 4.999940232702719e-05, + "loss": 0.5471, + "num_input_tokens_seen": 7946944, + "step": 13695 + }, + { + "epoch": 2.04051236222818, + "grad_norm": 6.5216474533081055, + "learning_rate": 4.999937964703828e-05, + "loss": 0.5317, + "num_input_tokens_seen": 7949920, + "step": 13700 + }, + { + "epoch": 2.041257074769139, + "grad_norm": 22.768342971801758, + "learning_rate": 4.999935654471026e-05, + "loss": 0.4616, + "num_input_tokens_seen": 7952736, + "step": 13705 + }, + { + "epoch": 2.0420017873100984, + "grad_norm": 10.500631332397461, + "learning_rate": 4.999933302004352e-05, + "loss": 0.5328, + "num_input_tokens_seen": 7955392, + "step": 13710 + }, + { + "epoch": 2.0427464998510576, + "grad_norm": 19.2142333984375, + "learning_rate": 4.999930907303846e-05, + "loss": 0.4889, + "num_input_tokens_seen": 7958080, + "step": 13715 + }, + { + "epoch": 2.043491212392017, + "grad_norm": 9.483284950256348, + "learning_rate": 4.9999284703695474e-05, + "loss": 0.627, + "num_input_tokens_seen": 7960960, + "step": 13720 + }, + { + "epoch": 2.044235924932976, + "grad_norm": 4.191361904144287, + "learning_rate": 4.9999259912014986e-05, + "loss": 0.4408, + "num_input_tokens_seen": 7963744, + "step": 13725 + }, + { + "epoch": 2.044980637473935, + "grad_norm": 6.045959949493408, + "learning_rate": 4.999923469799741e-05, + "loss": 0.7331, + "num_input_tokens_seen": 7966336, + "step": 13730 + }, + { + "epoch": 2.0457253500148944, + "grad_norm": 8.170378684997559, + "learning_rate": 4.9999209061643174e-05, + "loss": 0.6632, + "num_input_tokens_seen": 7969088, + "step": 13735 + }, + { + "epoch": 2.0464700625558536, + "grad_norm": 6.414699077606201, + "learning_rate": 4.99991830029527e-05, + "loss": 0.4949, + "num_input_tokens_seen": 7972160, + "step": 13740 + }, + { + "epoch": 2.047214775096813, + "grad_norm": 5.550452709197998, + "learning_rate": 4.999915652192645e-05, + "loss": 0.5567, + "num_input_tokens_seen": 7975072, + "step": 13745 + }, + { + "epoch": 2.047959487637772, + "grad_norm": 6.687952995300293, + "learning_rate": 4.9999129618564844e-05, + "loss": 0.7013, + "num_input_tokens_seen": 7977792, + "step": 13750 + }, + { + "epoch": 2.0487042001787312, + "grad_norm": 8.205970764160156, + "learning_rate": 4.999910229286836e-05, + "loss": 0.4252, + "num_input_tokens_seen": 7980672, + "step": 13755 + }, + { + "epoch": 2.04944891271969, + "grad_norm": 5.832874774932861, + "learning_rate": 4.999907454483745e-05, + "loss": 0.4873, + "num_input_tokens_seen": 7983456, + "step": 13760 + }, + { + "epoch": 2.050193625260649, + "grad_norm": 4.69923734664917, + "learning_rate": 4.999904637447258e-05, + "loss": 0.5219, + "num_input_tokens_seen": 7986496, + "step": 13765 + }, + { + "epoch": 2.0509383378016084, + "grad_norm": 7.8558573722839355, + "learning_rate": 4.9999017781774236e-05, + "loss": 0.5266, + "num_input_tokens_seen": 7989504, + "step": 13770 + }, + { + "epoch": 2.0516830503425676, + "grad_norm": 6.495645999908447, + "learning_rate": 4.999898876674289e-05, + "loss": 0.5873, + "num_input_tokens_seen": 7992064, + "step": 13775 + }, + { + "epoch": 2.052427762883527, + "grad_norm": 6.397319316864014, + "learning_rate": 4.9998959329379036e-05, + "loss": 0.5601, + "num_input_tokens_seen": 7994592, + "step": 13780 + }, + { + "epoch": 2.053172475424486, + "grad_norm": 9.73226261138916, + "learning_rate": 4.999892946968318e-05, + "loss": 0.5873, + "num_input_tokens_seen": 7997664, + "step": 13785 + }, + { + "epoch": 2.053917187965445, + "grad_norm": 9.148930549621582, + "learning_rate": 4.999889918765581e-05, + "loss": 0.5534, + "num_input_tokens_seen": 8000736, + "step": 13790 + }, + { + "epoch": 2.0546619005064044, + "grad_norm": 6.900266170501709, + "learning_rate": 4.999886848329744e-05, + "loss": 0.6256, + "num_input_tokens_seen": 8003488, + "step": 13795 + }, + { + "epoch": 2.0554066130473636, + "grad_norm": 6.461239337921143, + "learning_rate": 4.999883735660861e-05, + "loss": 0.4942, + "num_input_tokens_seen": 8006400, + "step": 13800 + }, + { + "epoch": 2.056151325588323, + "grad_norm": 7.178962707519531, + "learning_rate": 4.999880580758982e-05, + "loss": 0.4322, + "num_input_tokens_seen": 8009312, + "step": 13805 + }, + { + "epoch": 2.056896038129282, + "grad_norm": 4.438699722290039, + "learning_rate": 4.999877383624162e-05, + "loss": 0.3458, + "num_input_tokens_seen": 8012128, + "step": 13810 + }, + { + "epoch": 2.057640750670241, + "grad_norm": 6.634634017944336, + "learning_rate": 4.9998741442564535e-05, + "loss": 0.4478, + "num_input_tokens_seen": 8014944, + "step": 13815 + }, + { + "epoch": 2.0583854632112004, + "grad_norm": 5.958606243133545, + "learning_rate": 4.999870862655913e-05, + "loss": 0.5404, + "num_input_tokens_seen": 8017568, + "step": 13820 + }, + { + "epoch": 2.0591301757521596, + "grad_norm": 4.236353397369385, + "learning_rate": 4.999867538822595e-05, + "loss": 0.5783, + "num_input_tokens_seen": 8020672, + "step": 13825 + }, + { + "epoch": 2.059874888293119, + "grad_norm": 5.5943284034729, + "learning_rate": 4.999864172756554e-05, + "loss": 0.4456, + "num_input_tokens_seen": 8023520, + "step": 13830 + }, + { + "epoch": 2.060619600834078, + "grad_norm": 7.014066696166992, + "learning_rate": 4.9998607644578505e-05, + "loss": 0.4228, + "num_input_tokens_seen": 8026528, + "step": 13835 + }, + { + "epoch": 2.0613643133750372, + "grad_norm": 10.536420822143555, + "learning_rate": 4.9998573139265395e-05, + "loss": 0.6983, + "num_input_tokens_seen": 8029440, + "step": 13840 + }, + { + "epoch": 2.0621090259159964, + "grad_norm": 12.337493896484375, + "learning_rate": 4.99985382116268e-05, + "loss": 0.7148, + "num_input_tokens_seen": 8032160, + "step": 13845 + }, + { + "epoch": 2.0628537384569556, + "grad_norm": 6.739989280700684, + "learning_rate": 4.999850286166331e-05, + "loss": 0.533, + "num_input_tokens_seen": 8034912, + "step": 13850 + }, + { + "epoch": 2.063598450997915, + "grad_norm": 5.415397644042969, + "learning_rate": 4.999846708937552e-05, + "loss": 0.4841, + "num_input_tokens_seen": 8037920, + "step": 13855 + }, + { + "epoch": 2.064343163538874, + "grad_norm": 10.104103088378906, + "learning_rate": 4.9998430894764034e-05, + "loss": 0.6587, + "num_input_tokens_seen": 8040800, + "step": 13860 + }, + { + "epoch": 2.0650878760798332, + "grad_norm": 5.107353210449219, + "learning_rate": 4.9998394277829466e-05, + "loss": 0.4769, + "num_input_tokens_seen": 8044000, + "step": 13865 + }, + { + "epoch": 2.0658325886207924, + "grad_norm": 7.073644638061523, + "learning_rate": 4.9998357238572435e-05, + "loss": 0.5438, + "num_input_tokens_seen": 8046784, + "step": 13870 + }, + { + "epoch": 2.0665773011617516, + "grad_norm": 8.61697769165039, + "learning_rate": 4.9998319776993566e-05, + "loss": 0.6421, + "num_input_tokens_seen": 8049376, + "step": 13875 + }, + { + "epoch": 2.067322013702711, + "grad_norm": 9.102777481079102, + "learning_rate": 4.999828189309349e-05, + "loss": 0.4126, + "num_input_tokens_seen": 8052160, + "step": 13880 + }, + { + "epoch": 2.06806672624367, + "grad_norm": 6.00195837020874, + "learning_rate": 4.999824358687285e-05, + "loss": 0.4738, + "num_input_tokens_seen": 8055072, + "step": 13885 + }, + { + "epoch": 2.0688114387846293, + "grad_norm": 24.8004150390625, + "learning_rate": 4.9998204858332295e-05, + "loss": 0.5617, + "num_input_tokens_seen": 8057984, + "step": 13890 + }, + { + "epoch": 2.0695561513255885, + "grad_norm": 4.309549331665039, + "learning_rate": 4.999816570747247e-05, + "loss": 0.4786, + "num_input_tokens_seen": 8060640, + "step": 13895 + }, + { + "epoch": 2.0703008638665477, + "grad_norm": 8.476490020751953, + "learning_rate": 4.999812613429404e-05, + "loss": 0.4991, + "num_input_tokens_seen": 8063648, + "step": 13900 + }, + { + "epoch": 2.071045576407507, + "grad_norm": 4.725999355316162, + "learning_rate": 4.9998086138797685e-05, + "loss": 0.6694, + "num_input_tokens_seen": 8066624, + "step": 13905 + }, + { + "epoch": 2.071790288948466, + "grad_norm": 2.852064371109009, + "learning_rate": 4.9998045720984065e-05, + "loss": 0.3575, + "num_input_tokens_seen": 8069664, + "step": 13910 + }, + { + "epoch": 2.0725350014894253, + "grad_norm": 7.802427291870117, + "learning_rate": 4.999800488085388e-05, + "loss": 0.5275, + "num_input_tokens_seen": 8072672, + "step": 13915 + }, + { + "epoch": 2.0732797140303845, + "grad_norm": 7.900514602661133, + "learning_rate": 4.9997963618407794e-05, + "loss": 0.5637, + "num_input_tokens_seen": 8075744, + "step": 13920 + }, + { + "epoch": 2.0740244265713437, + "grad_norm": 14.836126327514648, + "learning_rate": 4.999792193364653e-05, + "loss": 0.8227, + "num_input_tokens_seen": 8079040, + "step": 13925 + }, + { + "epoch": 2.074769139112303, + "grad_norm": 10.484004020690918, + "learning_rate": 4.999787982657077e-05, + "loss": 0.5365, + "num_input_tokens_seen": 8081984, + "step": 13930 + }, + { + "epoch": 2.0755138516532616, + "grad_norm": 10.779911994934082, + "learning_rate": 4.999783729718125e-05, + "loss": 0.5, + "num_input_tokens_seen": 8084864, + "step": 13935 + }, + { + "epoch": 2.076258564194221, + "grad_norm": 7.3112077713012695, + "learning_rate": 4.999779434547867e-05, + "loss": 0.5767, + "num_input_tokens_seen": 8087776, + "step": 13940 + }, + { + "epoch": 2.07700327673518, + "grad_norm": 5.709752559661865, + "learning_rate": 4.999775097146376e-05, + "loss": 0.4262, + "num_input_tokens_seen": 8090560, + "step": 13945 + }, + { + "epoch": 2.0777479892761392, + "grad_norm": 9.336882591247559, + "learning_rate": 4.999770717513726e-05, + "loss": 0.4867, + "num_input_tokens_seen": 8093408, + "step": 13950 + }, + { + "epoch": 2.0784927018170984, + "grad_norm": 3.9581851959228516, + "learning_rate": 4.99976629564999e-05, + "loss": 0.5857, + "num_input_tokens_seen": 8096320, + "step": 13955 + }, + { + "epoch": 2.0792374143580576, + "grad_norm": 6.5717854499816895, + "learning_rate": 4.999761831555243e-05, + "loss": 0.5167, + "num_input_tokens_seen": 8099136, + "step": 13960 + }, + { + "epoch": 2.079982126899017, + "grad_norm": 6.315347194671631, + "learning_rate": 4.9997573252295604e-05, + "loss": 0.3295, + "num_input_tokens_seen": 8102080, + "step": 13965 + }, + { + "epoch": 2.080726839439976, + "grad_norm": 6.933663845062256, + "learning_rate": 4.999752776673018e-05, + "loss": 0.5918, + "num_input_tokens_seen": 8104800, + "step": 13970 + }, + { + "epoch": 2.0814715519809353, + "grad_norm": 8.533441543579102, + "learning_rate": 4.999748185885694e-05, + "loss": 0.4541, + "num_input_tokens_seen": 8108064, + "step": 13975 + }, + { + "epoch": 2.0822162645218945, + "grad_norm": 3.028834104537964, + "learning_rate": 4.999743552867665e-05, + "loss": 0.2382, + "num_input_tokens_seen": 8110976, + "step": 13980 + }, + { + "epoch": 2.0829609770628537, + "grad_norm": 4.723057270050049, + "learning_rate": 4.999738877619009e-05, + "loss": 0.6755, + "num_input_tokens_seen": 8113824, + "step": 13985 + }, + { + "epoch": 2.083705689603813, + "grad_norm": 5.498496055603027, + "learning_rate": 4.999734160139805e-05, + "loss": 0.5337, + "num_input_tokens_seen": 8116608, + "step": 13990 + }, + { + "epoch": 2.084450402144772, + "grad_norm": 8.141018867492676, + "learning_rate": 4.999729400430133e-05, + "loss": 0.4988, + "num_input_tokens_seen": 8119520, + "step": 13995 + }, + { + "epoch": 2.0851951146857313, + "grad_norm": 6.546530246734619, + "learning_rate": 4.9997245984900745e-05, + "loss": 0.4916, + "num_input_tokens_seen": 8122208, + "step": 14000 + }, + { + "epoch": 2.0859398272266905, + "grad_norm": 4.0596723556518555, + "learning_rate": 4.999719754319708e-05, + "loss": 0.49, + "num_input_tokens_seen": 8125440, + "step": 14005 + }, + { + "epoch": 2.0866845397676497, + "grad_norm": 13.288451194763184, + "learning_rate": 4.9997148679191174e-05, + "loss": 0.6562, + "num_input_tokens_seen": 8128128, + "step": 14010 + }, + { + "epoch": 2.087429252308609, + "grad_norm": 5.954975128173828, + "learning_rate": 4.999709939288385e-05, + "loss": 0.4826, + "num_input_tokens_seen": 8131104, + "step": 14015 + }, + { + "epoch": 2.088173964849568, + "grad_norm": 3.1763081550598145, + "learning_rate": 4.9997049684275936e-05, + "loss": 0.3734, + "num_input_tokens_seen": 8133952, + "step": 14020 + }, + { + "epoch": 2.0889186773905273, + "grad_norm": 10.033167839050293, + "learning_rate": 4.999699955336827e-05, + "loss": 0.7915, + "num_input_tokens_seen": 8136992, + "step": 14025 + }, + { + "epoch": 2.0896633899314865, + "grad_norm": 9.726927757263184, + "learning_rate": 4.9996949000161705e-05, + "loss": 0.6368, + "num_input_tokens_seen": 8139744, + "step": 14030 + }, + { + "epoch": 2.0904081024724457, + "grad_norm": 5.358009338378906, + "learning_rate": 4.99968980246571e-05, + "loss": 0.5871, + "num_input_tokens_seen": 8142688, + "step": 14035 + }, + { + "epoch": 2.091152815013405, + "grad_norm": 6.619868755340576, + "learning_rate": 4.99968466268553e-05, + "loss": 0.5999, + "num_input_tokens_seen": 8145632, + "step": 14040 + }, + { + "epoch": 2.091897527554364, + "grad_norm": 14.791997909545898, + "learning_rate": 4.999679480675719e-05, + "loss": 0.6554, + "num_input_tokens_seen": 8148736, + "step": 14045 + }, + { + "epoch": 2.0926422400953233, + "grad_norm": 2.9561755657196045, + "learning_rate": 4.9996742564363616e-05, + "loss": 0.3425, + "num_input_tokens_seen": 8151648, + "step": 14050 + }, + { + "epoch": 2.0933869526362825, + "grad_norm": 5.594529628753662, + "learning_rate": 4.99966898996755e-05, + "loss": 0.6248, + "num_input_tokens_seen": 8154560, + "step": 14055 + }, + { + "epoch": 2.0941316651772417, + "grad_norm": 11.529151916503906, + "learning_rate": 4.999663681269372e-05, + "loss": 0.5408, + "num_input_tokens_seen": 8157568, + "step": 14060 + }, + { + "epoch": 2.094876377718201, + "grad_norm": 5.557284355163574, + "learning_rate": 4.999658330341915e-05, + "loss": 0.4298, + "num_input_tokens_seen": 8160384, + "step": 14065 + }, + { + "epoch": 2.09562109025916, + "grad_norm": 5.774513244628906, + "learning_rate": 4.9996529371852716e-05, + "loss": 0.5086, + "num_input_tokens_seen": 8163296, + "step": 14070 + }, + { + "epoch": 2.0963658028001193, + "grad_norm": 13.20856761932373, + "learning_rate": 4.999647501799532e-05, + "loss": 0.5747, + "num_input_tokens_seen": 8166304, + "step": 14075 + }, + { + "epoch": 2.0971105153410785, + "grad_norm": 3.5870718955993652, + "learning_rate": 4.99964202418479e-05, + "loss": 0.5569, + "num_input_tokens_seen": 8169600, + "step": 14080 + }, + { + "epoch": 2.0978552278820377, + "grad_norm": 9.601762771606445, + "learning_rate": 4.999636504341135e-05, + "loss": 0.6495, + "num_input_tokens_seen": 8172160, + "step": 14085 + }, + { + "epoch": 2.098599940422997, + "grad_norm": 20.699953079223633, + "learning_rate": 4.9996309422686624e-05, + "loss": 0.4556, + "num_input_tokens_seen": 8175264, + "step": 14090 + }, + { + "epoch": 2.099344652963956, + "grad_norm": 4.582831859588623, + "learning_rate": 4.999625337967465e-05, + "loss": 0.4004, + "num_input_tokens_seen": 8178144, + "step": 14095 + }, + { + "epoch": 2.1000893655049153, + "grad_norm": 8.372200965881348, + "learning_rate": 4.999619691437638e-05, + "loss": 0.5444, + "num_input_tokens_seen": 8181088, + "step": 14100 + }, + { + "epoch": 2.1008340780458745, + "grad_norm": 4.389862537384033, + "learning_rate": 4.9996140026792774e-05, + "loss": 0.544, + "num_input_tokens_seen": 8183840, + "step": 14105 + }, + { + "epoch": 2.1015787905868333, + "grad_norm": 7.061047554016113, + "learning_rate": 4.999608271692479e-05, + "loss": 0.6489, + "num_input_tokens_seen": 8186944, + "step": 14110 + }, + { + "epoch": 2.1023235031277925, + "grad_norm": 4.170165538787842, + "learning_rate": 4.999602498477338e-05, + "loss": 0.5801, + "num_input_tokens_seen": 8190144, + "step": 14115 + }, + { + "epoch": 2.1030682156687517, + "grad_norm": 6.176851272583008, + "learning_rate": 4.999596683033955e-05, + "loss": 0.4053, + "num_input_tokens_seen": 8193056, + "step": 14120 + }, + { + "epoch": 2.103812928209711, + "grad_norm": 6.356723308563232, + "learning_rate": 4.999590825362425e-05, + "loss": 0.5791, + "num_input_tokens_seen": 8196032, + "step": 14125 + }, + { + "epoch": 2.10455764075067, + "grad_norm": 7.206398010253906, + "learning_rate": 4.999584925462849e-05, + "loss": 0.5603, + "num_input_tokens_seen": 8198880, + "step": 14130 + }, + { + "epoch": 2.1053023532916293, + "grad_norm": 2.494659900665283, + "learning_rate": 4.999578983335327e-05, + "loss": 0.6699, + "num_input_tokens_seen": 8201728, + "step": 14135 + }, + { + "epoch": 2.1060470658325885, + "grad_norm": 2.4296305179595947, + "learning_rate": 4.999572998979957e-05, + "loss": 0.5076, + "num_input_tokens_seen": 8204512, + "step": 14140 + }, + { + "epoch": 2.1067917783735477, + "grad_norm": 6.397665023803711, + "learning_rate": 4.9995669723968426e-05, + "loss": 0.3188, + "num_input_tokens_seen": 8207392, + "step": 14145 + }, + { + "epoch": 2.107536490914507, + "grad_norm": 8.143342971801758, + "learning_rate": 4.9995609035860845e-05, + "loss": 0.5309, + "num_input_tokens_seen": 8210304, + "step": 14150 + }, + { + "epoch": 2.108281203455466, + "grad_norm": 6.858109474182129, + "learning_rate": 4.9995547925477856e-05, + "loss": 0.5871, + "num_input_tokens_seen": 8213248, + "step": 14155 + }, + { + "epoch": 2.1090259159964253, + "grad_norm": 6.939999103546143, + "learning_rate": 4.999548639282048e-05, + "loss": 0.481, + "num_input_tokens_seen": 8216000, + "step": 14160 + }, + { + "epoch": 2.1097706285373845, + "grad_norm": 19.07619285583496, + "learning_rate": 4.9995424437889774e-05, + "loss": 0.5771, + "num_input_tokens_seen": 8218752, + "step": 14165 + }, + { + "epoch": 2.1105153410783437, + "grad_norm": 10.181623458862305, + "learning_rate": 4.999536206068678e-05, + "loss": 0.5184, + "num_input_tokens_seen": 8221824, + "step": 14170 + }, + { + "epoch": 2.111260053619303, + "grad_norm": 5.075578689575195, + "learning_rate": 4.9995299261212536e-05, + "loss": 0.6424, + "num_input_tokens_seen": 8224512, + "step": 14175 + }, + { + "epoch": 2.112004766160262, + "grad_norm": 9.359231948852539, + "learning_rate": 4.999523603946812e-05, + "loss": 0.5705, + "num_input_tokens_seen": 8227520, + "step": 14180 + }, + { + "epoch": 2.1127494787012213, + "grad_norm": 8.490419387817383, + "learning_rate": 4.9995172395454606e-05, + "loss": 0.4795, + "num_input_tokens_seen": 8230304, + "step": 14185 + }, + { + "epoch": 2.1134941912421805, + "grad_norm": 7.629395961761475, + "learning_rate": 4.999510832917304e-05, + "loss": 0.3533, + "num_input_tokens_seen": 8233024, + "step": 14190 + }, + { + "epoch": 2.1142389037831397, + "grad_norm": 4.518339157104492, + "learning_rate": 4.9995043840624536e-05, + "loss": 0.5469, + "num_input_tokens_seen": 8236032, + "step": 14195 + }, + { + "epoch": 2.114983616324099, + "grad_norm": 7.922678470611572, + "learning_rate": 4.999497892981017e-05, + "loss": 0.4608, + "num_input_tokens_seen": 8238656, + "step": 14200 + }, + { + "epoch": 2.115728328865058, + "grad_norm": 9.940962791442871, + "learning_rate": 4.999491359673103e-05, + "loss": 0.5381, + "num_input_tokens_seen": 8241440, + "step": 14205 + }, + { + "epoch": 2.1164730414060173, + "grad_norm": 2.632568359375, + "learning_rate": 4.999484784138823e-05, + "loss": 0.3678, + "num_input_tokens_seen": 8244416, + "step": 14210 + }, + { + "epoch": 2.1172177539469765, + "grad_norm": 11.91773509979248, + "learning_rate": 4.9994781663782884e-05, + "loss": 0.469, + "num_input_tokens_seen": 8247296, + "step": 14215 + }, + { + "epoch": 2.1179624664879357, + "grad_norm": 5.649469375610352, + "learning_rate": 4.99947150639161e-05, + "loss": 0.4633, + "num_input_tokens_seen": 8250304, + "step": 14220 + }, + { + "epoch": 2.118707179028895, + "grad_norm": 6.7494964599609375, + "learning_rate": 4.9994648041789016e-05, + "loss": 0.3439, + "num_input_tokens_seen": 8253472, + "step": 14225 + }, + { + "epoch": 2.119451891569854, + "grad_norm": 5.033598899841309, + "learning_rate": 4.999458059740275e-05, + "loss": 0.408, + "num_input_tokens_seen": 8256288, + "step": 14230 + }, + { + "epoch": 2.1201966041108133, + "grad_norm": 10.966415405273438, + "learning_rate": 4.9994512730758454e-05, + "loss": 0.6983, + "num_input_tokens_seen": 8259104, + "step": 14235 + }, + { + "epoch": 2.1209413166517725, + "grad_norm": 12.271732330322266, + "learning_rate": 4.999444444185727e-05, + "loss": 0.5817, + "num_input_tokens_seen": 8261984, + "step": 14240 + }, + { + "epoch": 2.1216860291927317, + "grad_norm": 7.848992347717285, + "learning_rate": 4.999437573070034e-05, + "loss": 0.4196, + "num_input_tokens_seen": 8264768, + "step": 14245 + }, + { + "epoch": 2.122430741733691, + "grad_norm": 4.796075820922852, + "learning_rate": 4.999430659728884e-05, + "loss": 0.5274, + "num_input_tokens_seen": 8267840, + "step": 14250 + }, + { + "epoch": 2.12317545427465, + "grad_norm": 11.1924467086792, + "learning_rate": 4.9994237041623935e-05, + "loss": 0.631, + "num_input_tokens_seen": 8270784, + "step": 14255 + }, + { + "epoch": 2.1239201668156094, + "grad_norm": 9.541632652282715, + "learning_rate": 4.99941670637068e-05, + "loss": 0.4638, + "num_input_tokens_seen": 8273952, + "step": 14260 + }, + { + "epoch": 2.1246648793565686, + "grad_norm": 12.135838508605957, + "learning_rate": 4.999409666353861e-05, + "loss": 0.5588, + "num_input_tokens_seen": 8277088, + "step": 14265 + }, + { + "epoch": 2.1254095918975278, + "grad_norm": 5.017489910125732, + "learning_rate": 4.999402584112057e-05, + "loss": 0.4169, + "num_input_tokens_seen": 8280000, + "step": 14270 + }, + { + "epoch": 2.1261543044384865, + "grad_norm": 4.544531345367432, + "learning_rate": 4.999395459645385e-05, + "loss": 0.6274, + "num_input_tokens_seen": 8282784, + "step": 14275 + }, + { + "epoch": 2.126899016979446, + "grad_norm": 7.457905292510986, + "learning_rate": 4.999388292953968e-05, + "loss": 0.4656, + "num_input_tokens_seen": 8285792, + "step": 14280 + }, + { + "epoch": 2.127643729520405, + "grad_norm": 5.394796848297119, + "learning_rate": 4.999381084037926e-05, + "loss": 0.4525, + "num_input_tokens_seen": 8288544, + "step": 14285 + }, + { + "epoch": 2.128388442061364, + "grad_norm": 6.2662811279296875, + "learning_rate": 4.999373832897381e-05, + "loss": 0.34, + "num_input_tokens_seen": 8291200, + "step": 14290 + }, + { + "epoch": 2.1291331546023233, + "grad_norm": 5.786048889160156, + "learning_rate": 4.9993665395324554e-05, + "loss": 0.5591, + "num_input_tokens_seen": 8294208, + "step": 14295 + }, + { + "epoch": 2.1298778671432825, + "grad_norm": 9.34676742553711, + "learning_rate": 4.999359203943272e-05, + "loss": 0.4809, + "num_input_tokens_seen": 8297248, + "step": 14300 + }, + { + "epoch": 2.1306225796842417, + "grad_norm": 11.368597984313965, + "learning_rate": 4.999351826129955e-05, + "loss": 0.6986, + "num_input_tokens_seen": 8300064, + "step": 14305 + }, + { + "epoch": 2.131367292225201, + "grad_norm": 10.443925857543945, + "learning_rate": 4.9993444060926296e-05, + "loss": 0.7018, + "num_input_tokens_seen": 8302912, + "step": 14310 + }, + { + "epoch": 2.13211200476616, + "grad_norm": 7.585705757141113, + "learning_rate": 4.9993369438314204e-05, + "loss": 0.7398, + "num_input_tokens_seen": 8305888, + "step": 14315 + }, + { + "epoch": 2.1328567173071193, + "grad_norm": 10.560093879699707, + "learning_rate": 4.9993294393464536e-05, + "loss": 0.4544, + "num_input_tokens_seen": 8308448, + "step": 14320 + }, + { + "epoch": 2.1336014298480785, + "grad_norm": 5.680571556091309, + "learning_rate": 4.999321892637856e-05, + "loss": 0.4405, + "num_input_tokens_seen": 8311424, + "step": 14325 + }, + { + "epoch": 2.1343461423890377, + "grad_norm": 8.245468139648438, + "learning_rate": 4.9993143037057554e-05, + "loss": 0.7553, + "num_input_tokens_seen": 8314208, + "step": 14330 + }, + { + "epoch": 2.135090854929997, + "grad_norm": 7.352914333343506, + "learning_rate": 4.99930667255028e-05, + "loss": 0.5002, + "num_input_tokens_seen": 8317216, + "step": 14335 + }, + { + "epoch": 2.135835567470956, + "grad_norm": 5.178259372711182, + "learning_rate": 4.999298999171559e-05, + "loss": 0.6682, + "num_input_tokens_seen": 8319936, + "step": 14340 + }, + { + "epoch": 2.1365802800119154, + "grad_norm": 3.9069290161132812, + "learning_rate": 4.99929128356972e-05, + "loss": 0.5285, + "num_input_tokens_seen": 8322912, + "step": 14345 + }, + { + "epoch": 2.1373249925528746, + "grad_norm": 10.093354225158691, + "learning_rate": 4.9992835257448965e-05, + "loss": 0.5183, + "num_input_tokens_seen": 8326432, + "step": 14350 + }, + { + "epoch": 2.1380697050938338, + "grad_norm": 5.20806360244751, + "learning_rate": 4.999275725697218e-05, + "loss": 0.42, + "num_input_tokens_seen": 8329408, + "step": 14355 + }, + { + "epoch": 2.138814417634793, + "grad_norm": 5.099167346954346, + "learning_rate": 4.9992678834268154e-05, + "loss": 0.344, + "num_input_tokens_seen": 8332256, + "step": 14360 + }, + { + "epoch": 2.139559130175752, + "grad_norm": 15.104972839355469, + "learning_rate": 4.999259998933822e-05, + "loss": 0.4206, + "num_input_tokens_seen": 8335232, + "step": 14365 + }, + { + "epoch": 2.1403038427167114, + "grad_norm": 7.2143940925598145, + "learning_rate": 4.9992520722183714e-05, + "loss": 0.4794, + "num_input_tokens_seen": 8337920, + "step": 14370 + }, + { + "epoch": 2.1410485552576706, + "grad_norm": 20.678598403930664, + "learning_rate": 4.999244103280597e-05, + "loss": 0.5753, + "num_input_tokens_seen": 8340480, + "step": 14375 + }, + { + "epoch": 2.1417932677986298, + "grad_norm": 6.513937950134277, + "learning_rate": 4.999236092120634e-05, + "loss": 0.6093, + "num_input_tokens_seen": 8343520, + "step": 14380 + }, + { + "epoch": 2.142537980339589, + "grad_norm": 7.755355358123779, + "learning_rate": 4.999228038738617e-05, + "loss": 0.5614, + "num_input_tokens_seen": 8346592, + "step": 14385 + }, + { + "epoch": 2.143282692880548, + "grad_norm": 7.119178771972656, + "learning_rate": 4.999219943134683e-05, + "loss": 0.4415, + "num_input_tokens_seen": 8349504, + "step": 14390 + }, + { + "epoch": 2.1440274054215074, + "grad_norm": 6.843456745147705, + "learning_rate": 4.9992118053089675e-05, + "loss": 0.5081, + "num_input_tokens_seen": 8352384, + "step": 14395 + }, + { + "epoch": 2.1447721179624666, + "grad_norm": 5.5691752433776855, + "learning_rate": 4.999203625261609e-05, + "loss": 0.4456, + "num_input_tokens_seen": 8355584, + "step": 14400 + }, + { + "epoch": 2.145516830503426, + "grad_norm": 11.96141529083252, + "learning_rate": 4.999195402992745e-05, + "loss": 0.6061, + "num_input_tokens_seen": 8358624, + "step": 14405 + }, + { + "epoch": 2.146261543044385, + "grad_norm": 8.102926254272461, + "learning_rate": 4.999187138502515e-05, + "loss": 0.4875, + "num_input_tokens_seen": 8361472, + "step": 14410 + }, + { + "epoch": 2.147006255585344, + "grad_norm": 4.762436866760254, + "learning_rate": 4.999178831791058e-05, + "loss": 0.4058, + "num_input_tokens_seen": 8364384, + "step": 14415 + }, + { + "epoch": 2.1477509681263034, + "grad_norm": 30.850114822387695, + "learning_rate": 4.999170482858515e-05, + "loss": 0.5131, + "num_input_tokens_seen": 8367136, + "step": 14420 + }, + { + "epoch": 2.1484956806672626, + "grad_norm": 6.374943256378174, + "learning_rate": 4.999162091705026e-05, + "loss": 0.6318, + "num_input_tokens_seen": 8370112, + "step": 14425 + }, + { + "epoch": 2.149240393208222, + "grad_norm": 3.6602623462677, + "learning_rate": 4.9991536583307344e-05, + "loss": 0.5657, + "num_input_tokens_seen": 8373024, + "step": 14430 + }, + { + "epoch": 2.149985105749181, + "grad_norm": 14.626864433288574, + "learning_rate": 4.999145182735782e-05, + "loss": 0.3825, + "num_input_tokens_seen": 8375648, + "step": 14435 + }, + { + "epoch": 2.15072981829014, + "grad_norm": 6.890417575836182, + "learning_rate": 4.999136664920311e-05, + "loss": 0.5521, + "num_input_tokens_seen": 8378656, + "step": 14440 + }, + { + "epoch": 2.1514745308310994, + "grad_norm": 6.263740062713623, + "learning_rate": 4.999128104884466e-05, + "loss": 0.5624, + "num_input_tokens_seen": 8381408, + "step": 14445 + }, + { + "epoch": 2.152219243372058, + "grad_norm": 12.430770874023438, + "learning_rate": 4.999119502628392e-05, + "loss": 0.5269, + "num_input_tokens_seen": 8384416, + "step": 14450 + }, + { + "epoch": 2.1529639559130174, + "grad_norm": 8.689655303955078, + "learning_rate": 4.999110858152234e-05, + "loss": 0.7502, + "num_input_tokens_seen": 8387712, + "step": 14455 + }, + { + "epoch": 2.1537086684539766, + "grad_norm": 3.6042604446411133, + "learning_rate": 4.999102171456138e-05, + "loss": 0.4666, + "num_input_tokens_seen": 8390688, + "step": 14460 + }, + { + "epoch": 2.1544533809949358, + "grad_norm": 7.285758018493652, + "learning_rate": 4.999093442540251e-05, + "loss": 0.5881, + "num_input_tokens_seen": 8393664, + "step": 14465 + }, + { + "epoch": 2.155198093535895, + "grad_norm": 6.057789325714111, + "learning_rate": 4.9990846714047204e-05, + "loss": 0.5108, + "num_input_tokens_seen": 8396480, + "step": 14470 + }, + { + "epoch": 2.155942806076854, + "grad_norm": 5.726284027099609, + "learning_rate": 4.9990758580496935e-05, + "loss": 0.536, + "num_input_tokens_seen": 8399264, + "step": 14475 + }, + { + "epoch": 2.1566875186178134, + "grad_norm": 6.958995819091797, + "learning_rate": 4.99906700247532e-05, + "loss": 0.457, + "num_input_tokens_seen": 8402016, + "step": 14480 + }, + { + "epoch": 2.1574322311587726, + "grad_norm": 6.471965312957764, + "learning_rate": 4.99905810468175e-05, + "loss": 0.381, + "num_input_tokens_seen": 8404672, + "step": 14485 + }, + { + "epoch": 2.158176943699732, + "grad_norm": 2.2408220767974854, + "learning_rate": 4.999049164669133e-05, + "loss": 0.4759, + "num_input_tokens_seen": 8407328, + "step": 14490 + }, + { + "epoch": 2.158921656240691, + "grad_norm": 5.037514686584473, + "learning_rate": 4.9990401824376196e-05, + "loss": 0.5221, + "num_input_tokens_seen": 8410240, + "step": 14495 + }, + { + "epoch": 2.15966636878165, + "grad_norm": 22.663135528564453, + "learning_rate": 4.999031157987364e-05, + "loss": 0.5454, + "num_input_tokens_seen": 8413152, + "step": 14500 + }, + { + "epoch": 2.1604110813226094, + "grad_norm": 6.161334037780762, + "learning_rate": 4.9990220913185146e-05, + "loss": 0.478, + "num_input_tokens_seen": 8416160, + "step": 14505 + }, + { + "epoch": 2.1611557938635686, + "grad_norm": 8.565131187438965, + "learning_rate": 4.9990129824312285e-05, + "loss": 0.4308, + "num_input_tokens_seen": 8418976, + "step": 14510 + }, + { + "epoch": 2.161900506404528, + "grad_norm": 7.223117351531982, + "learning_rate": 4.9990038313256573e-05, + "loss": 0.4598, + "num_input_tokens_seen": 8421856, + "step": 14515 + }, + { + "epoch": 2.162645218945487, + "grad_norm": 6.083220481872559, + "learning_rate": 4.998994638001957e-05, + "loss": 0.4411, + "num_input_tokens_seen": 8424448, + "step": 14520 + }, + { + "epoch": 2.163389931486446, + "grad_norm": 9.736961364746094, + "learning_rate": 4.998985402460281e-05, + "loss": 0.3648, + "num_input_tokens_seen": 8427392, + "step": 14525 + }, + { + "epoch": 2.1641346440274054, + "grad_norm": 12.481714248657227, + "learning_rate": 4.998976124700787e-05, + "loss": 0.5747, + "num_input_tokens_seen": 8430528, + "step": 14530 + }, + { + "epoch": 2.1648793565683646, + "grad_norm": 20.78035545349121, + "learning_rate": 4.9989668047236316e-05, + "loss": 0.4917, + "num_input_tokens_seen": 8433408, + "step": 14535 + }, + { + "epoch": 2.165624069109324, + "grad_norm": 6.803805351257324, + "learning_rate": 4.998957442528972e-05, + "loss": 0.4569, + "num_input_tokens_seen": 8436480, + "step": 14540 + }, + { + "epoch": 2.166368781650283, + "grad_norm": 6.456179141998291, + "learning_rate": 4.998948038116965e-05, + "loss": 0.6437, + "num_input_tokens_seen": 8439200, + "step": 14545 + }, + { + "epoch": 2.167113494191242, + "grad_norm": 4.809442520141602, + "learning_rate": 4.9989385914877717e-05, + "loss": 0.461, + "num_input_tokens_seen": 8442048, + "step": 14550 + }, + { + "epoch": 2.1678582067322014, + "grad_norm": 3.990797758102417, + "learning_rate": 4.998929102641551e-05, + "loss": 0.4955, + "num_input_tokens_seen": 8444832, + "step": 14555 + }, + { + "epoch": 2.1686029192731606, + "grad_norm": 11.244812965393066, + "learning_rate": 4.998919571578462e-05, + "loss": 0.4913, + "num_input_tokens_seen": 8447616, + "step": 14560 + }, + { + "epoch": 2.16934763181412, + "grad_norm": 4.8545098304748535, + "learning_rate": 4.998909998298668e-05, + "loss": 0.4066, + "num_input_tokens_seen": 8450656, + "step": 14565 + }, + { + "epoch": 2.170092344355079, + "grad_norm": 20.723094940185547, + "learning_rate": 4.998900382802327e-05, + "loss": 0.8567, + "num_input_tokens_seen": 8453536, + "step": 14570 + }, + { + "epoch": 2.1708370568960382, + "grad_norm": 4.375345706939697, + "learning_rate": 4.9988907250896056e-05, + "loss": 0.5705, + "num_input_tokens_seen": 8456576, + "step": 14575 + }, + { + "epoch": 2.1715817694369974, + "grad_norm": 5.250986576080322, + "learning_rate": 4.998881025160665e-05, + "loss": 0.5819, + "num_input_tokens_seen": 8459296, + "step": 14580 + }, + { + "epoch": 2.1723264819779566, + "grad_norm": 7.5028276443481445, + "learning_rate": 4.9988712830156694e-05, + "loss": 0.3955, + "num_input_tokens_seen": 8462272, + "step": 14585 + }, + { + "epoch": 2.173071194518916, + "grad_norm": 5.130731105804443, + "learning_rate": 4.998861498654782e-05, + "loss": 0.628, + "num_input_tokens_seen": 8465152, + "step": 14590 + }, + { + "epoch": 2.173815907059875, + "grad_norm": 4.697448253631592, + "learning_rate": 4.9988516720781705e-05, + "loss": 0.5247, + "num_input_tokens_seen": 8468032, + "step": 14595 + }, + { + "epoch": 2.1745606196008342, + "grad_norm": 11.796954154968262, + "learning_rate": 4.998841803286e-05, + "loss": 0.4255, + "num_input_tokens_seen": 8470784, + "step": 14600 + }, + { + "epoch": 2.1753053321417934, + "grad_norm": 5.583069324493408, + "learning_rate": 4.9988318922784364e-05, + "loss": 0.6685, + "num_input_tokens_seen": 8473568, + "step": 14605 + }, + { + "epoch": 2.1760500446827526, + "grad_norm": 12.164819717407227, + "learning_rate": 4.9988219390556466e-05, + "loss": 0.533, + "num_input_tokens_seen": 8476512, + "step": 14610 + }, + { + "epoch": 2.176794757223712, + "grad_norm": 7.480849266052246, + "learning_rate": 4.998811943617801e-05, + "loss": 0.568, + "num_input_tokens_seen": 8479392, + "step": 14615 + }, + { + "epoch": 2.177539469764671, + "grad_norm": 3.309083938598633, + "learning_rate": 4.998801905965067e-05, + "loss": 0.6598, + "num_input_tokens_seen": 8482240, + "step": 14620 + }, + { + "epoch": 2.17828418230563, + "grad_norm": 7.361867427825928, + "learning_rate": 4.998791826097615e-05, + "loss": 0.6222, + "num_input_tokens_seen": 8484960, + "step": 14625 + }, + { + "epoch": 2.179028894846589, + "grad_norm": 9.948698997497559, + "learning_rate": 4.998781704015614e-05, + "loss": 0.5629, + "num_input_tokens_seen": 8487616, + "step": 14630 + }, + { + "epoch": 2.179773607387548, + "grad_norm": 4.246376991271973, + "learning_rate": 4.998771539719236e-05, + "loss": 0.5546, + "num_input_tokens_seen": 8490368, + "step": 14635 + }, + { + "epoch": 2.1805183199285074, + "grad_norm": 10.880427360534668, + "learning_rate": 4.998761333208652e-05, + "loss": 0.5426, + "num_input_tokens_seen": 8493216, + "step": 14640 + }, + { + "epoch": 2.1812630324694666, + "grad_norm": 4.915191650390625, + "learning_rate": 4.9987510844840354e-05, + "loss": 0.5305, + "num_input_tokens_seen": 8496128, + "step": 14645 + }, + { + "epoch": 2.182007745010426, + "grad_norm": 6.932729244232178, + "learning_rate": 4.998740793545559e-05, + "loss": 0.4984, + "num_input_tokens_seen": 8498848, + "step": 14650 + }, + { + "epoch": 2.182752457551385, + "grad_norm": 12.887442588806152, + "learning_rate": 4.998730460393397e-05, + "loss": 0.3606, + "num_input_tokens_seen": 8501856, + "step": 14655 + }, + { + "epoch": 2.1834971700923442, + "grad_norm": 3.8378446102142334, + "learning_rate": 4.998720085027723e-05, + "loss": 0.5963, + "num_input_tokens_seen": 8504704, + "step": 14660 + }, + { + "epoch": 2.1842418826333034, + "grad_norm": 2.1547908782958984, + "learning_rate": 4.998709667448712e-05, + "loss": 0.4153, + "num_input_tokens_seen": 8507360, + "step": 14665 + }, + { + "epoch": 2.1849865951742626, + "grad_norm": 5.732864856719971, + "learning_rate": 4.998699207656542e-05, + "loss": 0.6157, + "num_input_tokens_seen": 8510400, + "step": 14670 + }, + { + "epoch": 2.185731307715222, + "grad_norm": 14.968467712402344, + "learning_rate": 4.9986887056513874e-05, + "loss": 0.611, + "num_input_tokens_seen": 8513184, + "step": 14675 + }, + { + "epoch": 2.186476020256181, + "grad_norm": 9.648078918457031, + "learning_rate": 4.998678161433427e-05, + "loss": 0.658, + "num_input_tokens_seen": 8516704, + "step": 14680 + }, + { + "epoch": 2.1872207327971402, + "grad_norm": 7.209280014038086, + "learning_rate": 4.998667575002839e-05, + "loss": 0.5306, + "num_input_tokens_seen": 8519584, + "step": 14685 + }, + { + "epoch": 2.1879654453380994, + "grad_norm": 5.669667720794678, + "learning_rate": 4.998656946359801e-05, + "loss": 0.6782, + "num_input_tokens_seen": 8522144, + "step": 14690 + }, + { + "epoch": 2.1887101578790586, + "grad_norm": 6.72018575668335, + "learning_rate": 4.998646275504494e-05, + "loss": 0.5239, + "num_input_tokens_seen": 8525248, + "step": 14695 + }, + { + "epoch": 2.189454870420018, + "grad_norm": 4.614031791687012, + "learning_rate": 4.998635562437098e-05, + "loss": 0.49, + "num_input_tokens_seen": 8527904, + "step": 14700 + }, + { + "epoch": 2.190199582960977, + "grad_norm": 6.14945650100708, + "learning_rate": 4.9986248071577934e-05, + "loss": 0.4885, + "num_input_tokens_seen": 8530720, + "step": 14705 + }, + { + "epoch": 2.1909442955019363, + "grad_norm": 9.852653503417969, + "learning_rate": 4.998614009666762e-05, + "loss": 0.6512, + "num_input_tokens_seen": 8533344, + "step": 14710 + }, + { + "epoch": 2.1916890080428955, + "grad_norm": 7.554182052612305, + "learning_rate": 4.9986031699641866e-05, + "loss": 0.5169, + "num_input_tokens_seen": 8536032, + "step": 14715 + }, + { + "epoch": 2.1924337205838547, + "grad_norm": 9.158299446105957, + "learning_rate": 4.99859228805025e-05, + "loss": 0.5808, + "num_input_tokens_seen": 8539040, + "step": 14720 + }, + { + "epoch": 2.193178433124814, + "grad_norm": 3.364818572998047, + "learning_rate": 4.9985813639251355e-05, + "loss": 0.5585, + "num_input_tokens_seen": 8542048, + "step": 14725 + }, + { + "epoch": 2.193923145665773, + "grad_norm": 10.560331344604492, + "learning_rate": 4.9985703975890294e-05, + "loss": 0.5971, + "num_input_tokens_seen": 8544672, + "step": 14730 + }, + { + "epoch": 2.1946678582067323, + "grad_norm": 6.482458591461182, + "learning_rate": 4.998559389042115e-05, + "loss": 0.5574, + "num_input_tokens_seen": 8547456, + "step": 14735 + }, + { + "epoch": 2.1954125707476915, + "grad_norm": 9.140132904052734, + "learning_rate": 4.99854833828458e-05, + "loss": 0.4932, + "num_input_tokens_seen": 8550304, + "step": 14740 + }, + { + "epoch": 2.1961572832886507, + "grad_norm": 11.449177742004395, + "learning_rate": 4.998537245316609e-05, + "loss": 0.5197, + "num_input_tokens_seen": 8553408, + "step": 14745 + }, + { + "epoch": 2.19690199582961, + "grad_norm": 6.713076114654541, + "learning_rate": 4.998526110138392e-05, + "loss": 0.6446, + "num_input_tokens_seen": 8556128, + "step": 14750 + }, + { + "epoch": 2.197646708370569, + "grad_norm": 5.168856143951416, + "learning_rate": 4.9985149327501146e-05, + "loss": 0.4252, + "num_input_tokens_seen": 8558976, + "step": 14755 + }, + { + "epoch": 2.1983914209115283, + "grad_norm": 7.1779046058654785, + "learning_rate": 4.998503713151967e-05, + "loss": 0.4237, + "num_input_tokens_seen": 8561888, + "step": 14760 + }, + { + "epoch": 2.1991361334524875, + "grad_norm": 8.660825729370117, + "learning_rate": 4.9984924513441397e-05, + "loss": 0.6223, + "num_input_tokens_seen": 8564640, + "step": 14765 + }, + { + "epoch": 2.1998808459934467, + "grad_norm": 15.190290451049805, + "learning_rate": 4.9984811473268214e-05, + "loss": 0.5525, + "num_input_tokens_seen": 8567456, + "step": 14770 + }, + { + "epoch": 2.200625558534406, + "grad_norm": 24.822715759277344, + "learning_rate": 4.998469801100203e-05, + "loss": 0.9062, + "num_input_tokens_seen": 8570528, + "step": 14775 + }, + { + "epoch": 2.201370271075365, + "grad_norm": 8.286845207214355, + "learning_rate": 4.998458412664476e-05, + "loss": 0.3679, + "num_input_tokens_seen": 8573664, + "step": 14780 + }, + { + "epoch": 2.2021149836163243, + "grad_norm": 6.863988399505615, + "learning_rate": 4.9984469820198345e-05, + "loss": 0.4406, + "num_input_tokens_seen": 8576704, + "step": 14785 + }, + { + "epoch": 2.202859696157283, + "grad_norm": 11.242212295532227, + "learning_rate": 4.9984355091664705e-05, + "loss": 0.6024, + "num_input_tokens_seen": 8579552, + "step": 14790 + }, + { + "epoch": 2.2036044086982427, + "grad_norm": 14.864794731140137, + "learning_rate": 4.9984239941045766e-05, + "loss": 0.6309, + "num_input_tokens_seen": 8582400, + "step": 14795 + }, + { + "epoch": 2.2043491212392015, + "grad_norm": 21.646249771118164, + "learning_rate": 4.99841243683435e-05, + "loss": 0.6187, + "num_input_tokens_seen": 8585344, + "step": 14800 + }, + { + "epoch": 2.2050938337801607, + "grad_norm": 4.411581516265869, + "learning_rate": 4.998400837355984e-05, + "loss": 0.7584, + "num_input_tokens_seen": 8588320, + "step": 14805 + }, + { + "epoch": 2.20583854632112, + "grad_norm": 9.258537292480469, + "learning_rate": 4.998389195669675e-05, + "loss": 0.6835, + "num_input_tokens_seen": 8591264, + "step": 14810 + }, + { + "epoch": 2.206583258862079, + "grad_norm": 5.3486762046813965, + "learning_rate": 4.998377511775621e-05, + "loss": 0.5536, + "num_input_tokens_seen": 8594176, + "step": 14815 + }, + { + "epoch": 2.2073279714030383, + "grad_norm": 6.457184791564941, + "learning_rate": 4.9983657856740165e-05, + "loss": 0.5573, + "num_input_tokens_seen": 8597280, + "step": 14820 + }, + { + "epoch": 2.2080726839439975, + "grad_norm": 6.968055248260498, + "learning_rate": 4.9983540173650614e-05, + "loss": 0.2445, + "num_input_tokens_seen": 8600192, + "step": 14825 + }, + { + "epoch": 2.2088173964849567, + "grad_norm": 7.162648677825928, + "learning_rate": 4.9983422068489546e-05, + "loss": 0.5536, + "num_input_tokens_seen": 8603232, + "step": 14830 + }, + { + "epoch": 2.209562109025916, + "grad_norm": 12.24746036529541, + "learning_rate": 4.998330354125896e-05, + "loss": 0.8162, + "num_input_tokens_seen": 8606176, + "step": 14835 + }, + { + "epoch": 2.210306821566875, + "grad_norm": 13.030911445617676, + "learning_rate": 4.998318459196085e-05, + "loss": 0.4821, + "num_input_tokens_seen": 8609120, + "step": 14840 + }, + { + "epoch": 2.2110515341078343, + "grad_norm": 3.147897720336914, + "learning_rate": 4.998306522059723e-05, + "loss": 0.5523, + "num_input_tokens_seen": 8611936, + "step": 14845 + }, + { + "epoch": 2.2117962466487935, + "grad_norm": 7.563390731811523, + "learning_rate": 4.9982945427170115e-05, + "loss": 0.5325, + "num_input_tokens_seen": 8614784, + "step": 14850 + }, + { + "epoch": 2.2125409591897527, + "grad_norm": 9.419598579406738, + "learning_rate": 4.998282521168153e-05, + "loss": 0.5341, + "num_input_tokens_seen": 8617888, + "step": 14855 + }, + { + "epoch": 2.213285671730712, + "grad_norm": 4.788609504699707, + "learning_rate": 4.9982704574133497e-05, + "loss": 0.5865, + "num_input_tokens_seen": 8620896, + "step": 14860 + }, + { + "epoch": 2.214030384271671, + "grad_norm": 7.583703994750977, + "learning_rate": 4.998258351452806e-05, + "loss": 0.5333, + "num_input_tokens_seen": 8623808, + "step": 14865 + }, + { + "epoch": 2.2147750968126303, + "grad_norm": 5.179026126861572, + "learning_rate": 4.998246203286727e-05, + "loss": 0.6006, + "num_input_tokens_seen": 8626688, + "step": 14870 + }, + { + "epoch": 2.2155198093535895, + "grad_norm": 14.74863052368164, + "learning_rate": 4.9982340129153185e-05, + "loss": 0.561, + "num_input_tokens_seen": 8629536, + "step": 14875 + }, + { + "epoch": 2.2162645218945487, + "grad_norm": 5.661761283874512, + "learning_rate": 4.9982217803387844e-05, + "loss": 0.4718, + "num_input_tokens_seen": 8632480, + "step": 14880 + }, + { + "epoch": 2.217009234435508, + "grad_norm": 7.765168190002441, + "learning_rate": 4.998209505557333e-05, + "loss": 0.5099, + "num_input_tokens_seen": 8635520, + "step": 14885 + }, + { + "epoch": 2.217753946976467, + "grad_norm": 4.6525397300720215, + "learning_rate": 4.99819718857117e-05, + "loss": 0.7293, + "num_input_tokens_seen": 8638752, + "step": 14890 + }, + { + "epoch": 2.2184986595174263, + "grad_norm": 10.976317405700684, + "learning_rate": 4.998184829380505e-05, + "loss": 0.6559, + "num_input_tokens_seen": 8641824, + "step": 14895 + }, + { + "epoch": 2.2192433720583855, + "grad_norm": 14.33455753326416, + "learning_rate": 4.9981724279855466e-05, + "loss": 0.4048, + "num_input_tokens_seen": 8644928, + "step": 14900 + }, + { + "epoch": 2.2199880845993447, + "grad_norm": 9.016510963439941, + "learning_rate": 4.998159984386504e-05, + "loss": 0.7886, + "num_input_tokens_seen": 8648000, + "step": 14905 + }, + { + "epoch": 2.220732797140304, + "grad_norm": 8.278648376464844, + "learning_rate": 4.9981474985835875e-05, + "loss": 0.4368, + "num_input_tokens_seen": 8651008, + "step": 14910 + }, + { + "epoch": 2.221477509681263, + "grad_norm": 8.028736114501953, + "learning_rate": 4.9981349705770074e-05, + "loss": 0.4325, + "num_input_tokens_seen": 8653696, + "step": 14915 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 9.484582901000977, + "learning_rate": 4.998122400366977e-05, + "loss": 0.4724, + "num_input_tokens_seen": 8656576, + "step": 14920 + }, + { + "epoch": 2.2229669347631815, + "grad_norm": 10.360629081726074, + "learning_rate": 4.998109787953708e-05, + "loss": 0.6576, + "num_input_tokens_seen": 8659488, + "step": 14925 + }, + { + "epoch": 2.2237116473041407, + "grad_norm": 11.790396690368652, + "learning_rate": 4.998097133337412e-05, + "loss": 0.5661, + "num_input_tokens_seen": 8662208, + "step": 14930 + }, + { + "epoch": 2.2244563598451, + "grad_norm": 12.406560897827148, + "learning_rate": 4.998084436518303e-05, + "loss": 0.4911, + "num_input_tokens_seen": 8664896, + "step": 14935 + }, + { + "epoch": 2.225201072386059, + "grad_norm": 4.09456729888916, + "learning_rate": 4.998071697496598e-05, + "loss": 0.5578, + "num_input_tokens_seen": 8667808, + "step": 14940 + }, + { + "epoch": 2.2259457849270183, + "grad_norm": 4.865762233734131, + "learning_rate": 4.99805891627251e-05, + "loss": 0.5428, + "num_input_tokens_seen": 8670880, + "step": 14945 + }, + { + "epoch": 2.2266904974679775, + "grad_norm": 5.621167182922363, + "learning_rate": 4.998046092846256e-05, + "loss": 0.5236, + "num_input_tokens_seen": 8673856, + "step": 14950 + }, + { + "epoch": 2.2274352100089367, + "grad_norm": 5.614425182342529, + "learning_rate": 4.998033227218052e-05, + "loss": 0.3721, + "num_input_tokens_seen": 8676736, + "step": 14955 + }, + { + "epoch": 2.228179922549896, + "grad_norm": 7.068256378173828, + "learning_rate": 4.998020319388115e-05, + "loss": 0.508, + "num_input_tokens_seen": 8679872, + "step": 14960 + }, + { + "epoch": 2.2289246350908547, + "grad_norm": 13.391153335571289, + "learning_rate": 4.998007369356664e-05, + "loss": 0.643, + "num_input_tokens_seen": 8682496, + "step": 14965 + }, + { + "epoch": 2.2296693476318143, + "grad_norm": 7.8304572105407715, + "learning_rate": 4.997994377123917e-05, + "loss": 0.5269, + "num_input_tokens_seen": 8685280, + "step": 14970 + }, + { + "epoch": 2.230414060172773, + "grad_norm": 7.773122787475586, + "learning_rate": 4.997981342690095e-05, + "loss": 0.573, + "num_input_tokens_seen": 8688128, + "step": 14975 + }, + { + "epoch": 2.2311587727137323, + "grad_norm": 8.440600395202637, + "learning_rate": 4.9979682660554154e-05, + "loss": 0.5293, + "num_input_tokens_seen": 8691200, + "step": 14980 + }, + { + "epoch": 2.2319034852546915, + "grad_norm": 10.128728866577148, + "learning_rate": 4.997955147220101e-05, + "loss": 0.6518, + "num_input_tokens_seen": 8693888, + "step": 14985 + }, + { + "epoch": 2.2326481977956507, + "grad_norm": 5.975476264953613, + "learning_rate": 4.997941986184375e-05, + "loss": 0.4283, + "num_input_tokens_seen": 8696448, + "step": 14990 + }, + { + "epoch": 2.23339291033661, + "grad_norm": 4.736008167266846, + "learning_rate": 4.9979287829484555e-05, + "loss": 0.3224, + "num_input_tokens_seen": 8699360, + "step": 14995 + }, + { + "epoch": 2.234137622877569, + "grad_norm": 8.65848159790039, + "learning_rate": 4.99791553751257e-05, + "loss": 0.5726, + "num_input_tokens_seen": 8702272, + "step": 15000 + }, + { + "epoch": 2.2348823354185283, + "grad_norm": 8.860891342163086, + "learning_rate": 4.997902249876939e-05, + "loss": 0.4443, + "num_input_tokens_seen": 8705472, + "step": 15005 + }, + { + "epoch": 2.2356270479594875, + "grad_norm": 5.111254692077637, + "learning_rate": 4.997888920041789e-05, + "loss": 0.5846, + "num_input_tokens_seen": 8708192, + "step": 15010 + }, + { + "epoch": 2.2363717605004467, + "grad_norm": 10.244353294372559, + "learning_rate": 4.997875548007343e-05, + "loss": 0.543, + "num_input_tokens_seen": 8711072, + "step": 15015 + }, + { + "epoch": 2.237116473041406, + "grad_norm": 12.102142333984375, + "learning_rate": 4.99786213377383e-05, + "loss": 0.6636, + "num_input_tokens_seen": 8713792, + "step": 15020 + }, + { + "epoch": 2.237861185582365, + "grad_norm": 5.424324035644531, + "learning_rate": 4.997848677341474e-05, + "loss": 0.4945, + "num_input_tokens_seen": 8716480, + "step": 15025 + }, + { + "epoch": 2.2386058981233243, + "grad_norm": 8.730977058410645, + "learning_rate": 4.997835178710504e-05, + "loss": 0.4911, + "num_input_tokens_seen": 8719200, + "step": 15030 + }, + { + "epoch": 2.2393506106642835, + "grad_norm": 8.213769912719727, + "learning_rate": 4.997821637881147e-05, + "loss": 0.6229, + "num_input_tokens_seen": 8722112, + "step": 15035 + }, + { + "epoch": 2.2400953232052427, + "grad_norm": 12.360196113586426, + "learning_rate": 4.997808054853632e-05, + "loss": 0.4163, + "num_input_tokens_seen": 8724800, + "step": 15040 + }, + { + "epoch": 2.240840035746202, + "grad_norm": 6.688088893890381, + "learning_rate": 4.9977944296281895e-05, + "loss": 0.5064, + "num_input_tokens_seen": 8727968, + "step": 15045 + }, + { + "epoch": 2.241584748287161, + "grad_norm": 12.523374557495117, + "learning_rate": 4.997780762205047e-05, + "loss": 0.7151, + "num_input_tokens_seen": 8730848, + "step": 15050 + }, + { + "epoch": 2.2423294608281203, + "grad_norm": 4.682248115539551, + "learning_rate": 4.997767052584439e-05, + "loss": 0.5118, + "num_input_tokens_seen": 8733728, + "step": 15055 + }, + { + "epoch": 2.2430741733690795, + "grad_norm": 3.924868583679199, + "learning_rate": 4.9977533007665944e-05, + "loss": 0.4918, + "num_input_tokens_seen": 8736704, + "step": 15060 + }, + { + "epoch": 2.2438188859100388, + "grad_norm": 8.699564933776855, + "learning_rate": 4.9977395067517464e-05, + "loss": 0.4763, + "num_input_tokens_seen": 8739392, + "step": 15065 + }, + { + "epoch": 2.244563598450998, + "grad_norm": 9.727534294128418, + "learning_rate": 4.997725670540128e-05, + "loss": 0.5334, + "num_input_tokens_seen": 8742528, + "step": 15070 + }, + { + "epoch": 2.245308310991957, + "grad_norm": 5.436422348022461, + "learning_rate": 4.997711792131973e-05, + "loss": 0.422, + "num_input_tokens_seen": 8745504, + "step": 15075 + }, + { + "epoch": 2.2460530235329164, + "grad_norm": 6.655827045440674, + "learning_rate": 4.9976978715275155e-05, + "loss": 0.4702, + "num_input_tokens_seen": 8748128, + "step": 15080 + }, + { + "epoch": 2.2467977360738756, + "grad_norm": 1.0682382583618164, + "learning_rate": 4.997683908726991e-05, + "loss": 0.4002, + "num_input_tokens_seen": 8750880, + "step": 15085 + }, + { + "epoch": 2.2475424486148348, + "grad_norm": 10.852532386779785, + "learning_rate": 4.9976699037306356e-05, + "loss": 0.5089, + "num_input_tokens_seen": 8753792, + "step": 15090 + }, + { + "epoch": 2.248287161155794, + "grad_norm": 9.109911918640137, + "learning_rate": 4.997655856538686e-05, + "loss": 0.4805, + "num_input_tokens_seen": 8756608, + "step": 15095 + }, + { + "epoch": 2.249031873696753, + "grad_norm": 17.43606948852539, + "learning_rate": 4.9976417671513787e-05, + "loss": 0.6409, + "num_input_tokens_seen": 8759456, + "step": 15100 + }, + { + "epoch": 2.2497765862377124, + "grad_norm": 13.403923034667969, + "learning_rate": 4.997627635568953e-05, + "loss": 0.5614, + "num_input_tokens_seen": 8762208, + "step": 15105 + }, + { + "epoch": 2.2505212987786716, + "grad_norm": 3.9616429805755615, + "learning_rate": 4.997613461791646e-05, + "loss": 0.5345, + "num_input_tokens_seen": 8765152, + "step": 15110 + }, + { + "epoch": 2.2512660113196308, + "grad_norm": 6.8942646980285645, + "learning_rate": 4.9975992458196986e-05, + "loss": 0.2888, + "num_input_tokens_seen": 8768032, + "step": 15115 + }, + { + "epoch": 2.25201072386059, + "grad_norm": 14.81148910522461, + "learning_rate": 4.99758498765335e-05, + "loss": 0.5971, + "num_input_tokens_seen": 8771104, + "step": 15120 + }, + { + "epoch": 2.252755436401549, + "grad_norm": 13.529821395874023, + "learning_rate": 4.997570687292842e-05, + "loss": 0.6772, + "num_input_tokens_seen": 8773856, + "step": 15125 + }, + { + "epoch": 2.2535001489425084, + "grad_norm": 8.200620651245117, + "learning_rate": 4.9975563447384156e-05, + "loss": 0.6075, + "num_input_tokens_seen": 8776512, + "step": 15130 + }, + { + "epoch": 2.2542448614834676, + "grad_norm": 7.864182949066162, + "learning_rate": 4.997541959990313e-05, + "loss": 0.4276, + "num_input_tokens_seen": 8779328, + "step": 15135 + }, + { + "epoch": 2.2549895740244263, + "grad_norm": 8.738018989562988, + "learning_rate": 4.997527533048777e-05, + "loss": 0.6221, + "num_input_tokens_seen": 8782400, + "step": 15140 + }, + { + "epoch": 2.255734286565386, + "grad_norm": 13.806419372558594, + "learning_rate": 4.997513063914052e-05, + "loss": 0.7734, + "num_input_tokens_seen": 8785504, + "step": 15145 + }, + { + "epoch": 2.2564789991063448, + "grad_norm": 2.8496315479278564, + "learning_rate": 4.997498552586382e-05, + "loss": 0.3834, + "num_input_tokens_seen": 8788480, + "step": 15150 + }, + { + "epoch": 2.257223711647304, + "grad_norm": 13.217519760131836, + "learning_rate": 4.9974839990660124e-05, + "loss": 1.0125, + "num_input_tokens_seen": 8791072, + "step": 15155 + }, + { + "epoch": 2.257968424188263, + "grad_norm": 8.68526840209961, + "learning_rate": 4.997469403353189e-05, + "loss": 0.6598, + "num_input_tokens_seen": 8793920, + "step": 15160 + }, + { + "epoch": 2.2587131367292224, + "grad_norm": 9.925299644470215, + "learning_rate": 4.9974547654481585e-05, + "loss": 0.6661, + "num_input_tokens_seen": 8796704, + "step": 15165 + }, + { + "epoch": 2.2594578492701816, + "grad_norm": 2.8884785175323486, + "learning_rate": 4.997440085351168e-05, + "loss": 0.4984, + "num_input_tokens_seen": 8799360, + "step": 15170 + }, + { + "epoch": 2.2602025618111408, + "grad_norm": 6.929009914398193, + "learning_rate": 4.9974253630624654e-05, + "loss": 0.4887, + "num_input_tokens_seen": 8802016, + "step": 15175 + }, + { + "epoch": 2.2609472743521, + "grad_norm": 6.455765247344971, + "learning_rate": 4.9974105985822996e-05, + "loss": 0.5288, + "num_input_tokens_seen": 8804800, + "step": 15180 + }, + { + "epoch": 2.261691986893059, + "grad_norm": 12.635636329650879, + "learning_rate": 4.997395791910919e-05, + "loss": 0.6249, + "num_input_tokens_seen": 8807968, + "step": 15185 + }, + { + "epoch": 2.2624366994340184, + "grad_norm": 6.708652496337891, + "learning_rate": 4.997380943048576e-05, + "loss": 0.5331, + "num_input_tokens_seen": 8811168, + "step": 15190 + }, + { + "epoch": 2.2631814119749776, + "grad_norm": 8.336541175842285, + "learning_rate": 4.99736605199552e-05, + "loss": 0.9043, + "num_input_tokens_seen": 8813824, + "step": 15195 + }, + { + "epoch": 2.2639261245159368, + "grad_norm": 11.08983325958252, + "learning_rate": 4.9973511187520025e-05, + "loss": 0.4855, + "num_input_tokens_seen": 8816640, + "step": 15200 + }, + { + "epoch": 2.264670837056896, + "grad_norm": 7.067614555358887, + "learning_rate": 4.9973361433182764e-05, + "loss": 0.5763, + "num_input_tokens_seen": 8819616, + "step": 15205 + }, + { + "epoch": 2.265415549597855, + "grad_norm": 5.889827251434326, + "learning_rate": 4.997321125694594e-05, + "loss": 0.7091, + "num_input_tokens_seen": 8822560, + "step": 15210 + }, + { + "epoch": 2.2661602621388144, + "grad_norm": 13.258037567138672, + "learning_rate": 4.99730606588121e-05, + "loss": 0.5309, + "num_input_tokens_seen": 8825280, + "step": 15215 + }, + { + "epoch": 2.2669049746797736, + "grad_norm": 7.869856834411621, + "learning_rate": 4.997290963878377e-05, + "loss": 0.5874, + "num_input_tokens_seen": 8828160, + "step": 15220 + }, + { + "epoch": 2.267649687220733, + "grad_norm": 5.36057186126709, + "learning_rate": 4.9972758196863524e-05, + "loss": 0.5975, + "num_input_tokens_seen": 8830848, + "step": 15225 + }, + { + "epoch": 2.268394399761692, + "grad_norm": 7.840786933898926, + "learning_rate": 4.9972606333053903e-05, + "loss": 0.5467, + "num_input_tokens_seen": 8833920, + "step": 15230 + }, + { + "epoch": 2.269139112302651, + "grad_norm": 6.076071262359619, + "learning_rate": 4.997245404735748e-05, + "loss": 0.2813, + "num_input_tokens_seen": 8836800, + "step": 15235 + }, + { + "epoch": 2.2698838248436104, + "grad_norm": 8.082761764526367, + "learning_rate": 4.997230133977683e-05, + "loss": 0.7071, + "num_input_tokens_seen": 8840032, + "step": 15240 + }, + { + "epoch": 2.2706285373845696, + "grad_norm": 7.761185169219971, + "learning_rate": 4.997214821031453e-05, + "loss": 0.8988, + "num_input_tokens_seen": 8842848, + "step": 15245 + }, + { + "epoch": 2.271373249925529, + "grad_norm": 3.0468852519989014, + "learning_rate": 4.997199465897316e-05, + "loss": 0.4814, + "num_input_tokens_seen": 8845632, + "step": 15250 + }, + { + "epoch": 2.272117962466488, + "grad_norm": 13.414215087890625, + "learning_rate": 4.9971840685755324e-05, + "loss": 0.46, + "num_input_tokens_seen": 8848128, + "step": 15255 + }, + { + "epoch": 2.272862675007447, + "grad_norm": 5.830506801605225, + "learning_rate": 4.997168629066362e-05, + "loss": 0.4012, + "num_input_tokens_seen": 8851328, + "step": 15260 + }, + { + "epoch": 2.2736073875484064, + "grad_norm": 12.585317611694336, + "learning_rate": 4.9971531473700654e-05, + "loss": 0.8017, + "num_input_tokens_seen": 8854368, + "step": 15265 + }, + { + "epoch": 2.2743521000893656, + "grad_norm": 6.239933013916016, + "learning_rate": 4.997137623486905e-05, + "loss": 0.4728, + "num_input_tokens_seen": 8857056, + "step": 15270 + }, + { + "epoch": 2.275096812630325, + "grad_norm": 8.811898231506348, + "learning_rate": 4.9971220574171415e-05, + "loss": 0.4355, + "num_input_tokens_seen": 8859872, + "step": 15275 + }, + { + "epoch": 2.275841525171284, + "grad_norm": 4.413240432739258, + "learning_rate": 4.9971064491610396e-05, + "loss": 0.5579, + "num_input_tokens_seen": 8862848, + "step": 15280 + }, + { + "epoch": 2.276586237712243, + "grad_norm": 8.45430850982666, + "learning_rate": 4.997090798718862e-05, + "loss": 0.6668, + "num_input_tokens_seen": 8866080, + "step": 15285 + }, + { + "epoch": 2.2773309502532024, + "grad_norm": 3.5666205883026123, + "learning_rate": 4.9970751060908735e-05, + "loss": 0.4873, + "num_input_tokens_seen": 8869056, + "step": 15290 + }, + { + "epoch": 2.2780756627941616, + "grad_norm": 6.6021504402160645, + "learning_rate": 4.997059371277339e-05, + "loss": 0.4705, + "num_input_tokens_seen": 8871808, + "step": 15295 + }, + { + "epoch": 2.278820375335121, + "grad_norm": 6.466133117675781, + "learning_rate": 4.997043594278523e-05, + "loss": 0.4454, + "num_input_tokens_seen": 8874752, + "step": 15300 + }, + { + "epoch": 2.2795650878760796, + "grad_norm": 6.317530632019043, + "learning_rate": 4.997027775094695e-05, + "loss": 0.5715, + "num_input_tokens_seen": 8877440, + "step": 15305 + }, + { + "epoch": 2.2803098004170392, + "grad_norm": 9.3051176071167, + "learning_rate": 4.99701191372612e-05, + "loss": 0.3826, + "num_input_tokens_seen": 8880192, + "step": 15310 + }, + { + "epoch": 2.281054512957998, + "grad_norm": 6.387197017669678, + "learning_rate": 4.9969960101730664e-05, + "loss": 0.6208, + "num_input_tokens_seen": 8882624, + "step": 15315 + }, + { + "epoch": 2.2817992254989576, + "grad_norm": 17.997594833374023, + "learning_rate": 4.996980064435803e-05, + "loss": 0.554, + "num_input_tokens_seen": 8885472, + "step": 15320 + }, + { + "epoch": 2.2825439380399164, + "grad_norm": 5.3740925788879395, + "learning_rate": 4.9969640765145996e-05, + "loss": 0.4604, + "num_input_tokens_seen": 8888448, + "step": 15325 + }, + { + "epoch": 2.2832886505808756, + "grad_norm": 6.932991027832031, + "learning_rate": 4.9969480464097255e-05, + "loss": 0.4794, + "num_input_tokens_seen": 8891200, + "step": 15330 + }, + { + "epoch": 2.284033363121835, + "grad_norm": 11.261340141296387, + "learning_rate": 4.9969319741214525e-05, + "loss": 0.4496, + "num_input_tokens_seen": 8894144, + "step": 15335 + }, + { + "epoch": 2.284778075662794, + "grad_norm": 12.12867259979248, + "learning_rate": 4.996915859650051e-05, + "loss": 0.4903, + "num_input_tokens_seen": 8896864, + "step": 15340 + }, + { + "epoch": 2.285522788203753, + "grad_norm": 10.433257102966309, + "learning_rate": 4.996899702995794e-05, + "loss": 0.4617, + "num_input_tokens_seen": 8899488, + "step": 15345 + }, + { + "epoch": 2.2862675007447124, + "grad_norm": 15.420564651489258, + "learning_rate": 4.9968835041589546e-05, + "loss": 0.393, + "num_input_tokens_seen": 8902368, + "step": 15350 + }, + { + "epoch": 2.2870122132856716, + "grad_norm": 1.9566245079040527, + "learning_rate": 4.996867263139806e-05, + "loss": 0.3863, + "num_input_tokens_seen": 8905312, + "step": 15355 + }, + { + "epoch": 2.287756925826631, + "grad_norm": 13.84511661529541, + "learning_rate": 4.996850979938622e-05, + "loss": 0.7071, + "num_input_tokens_seen": 8908192, + "step": 15360 + }, + { + "epoch": 2.28850163836759, + "grad_norm": 4.8452606201171875, + "learning_rate": 4.996834654555679e-05, + "loss": 0.6979, + "num_input_tokens_seen": 8911040, + "step": 15365 + }, + { + "epoch": 2.289246350908549, + "grad_norm": 21.160648345947266, + "learning_rate": 4.9968182869912525e-05, + "loss": 0.7128, + "num_input_tokens_seen": 8913664, + "step": 15370 + }, + { + "epoch": 2.2899910634495084, + "grad_norm": 5.2511162757873535, + "learning_rate": 4.9968018772456185e-05, + "loss": 0.458, + "num_input_tokens_seen": 8916736, + "step": 15375 + }, + { + "epoch": 2.2907357759904676, + "grad_norm": 9.869563102722168, + "learning_rate": 4.9967854253190536e-05, + "loss": 0.4542, + "num_input_tokens_seen": 8919264, + "step": 15380 + }, + { + "epoch": 2.291480488531427, + "grad_norm": 6.094022274017334, + "learning_rate": 4.996768931211837e-05, + "loss": 0.4867, + "num_input_tokens_seen": 8922368, + "step": 15385 + }, + { + "epoch": 2.292225201072386, + "grad_norm": 5.1798224449157715, + "learning_rate": 4.996752394924247e-05, + "loss": 0.5342, + "num_input_tokens_seen": 8925248, + "step": 15390 + }, + { + "epoch": 2.2929699136133452, + "grad_norm": 3.303628921508789, + "learning_rate": 4.996735816456564e-05, + "loss": 0.5163, + "num_input_tokens_seen": 8928160, + "step": 15395 + }, + { + "epoch": 2.2937146261543044, + "grad_norm": 13.905928611755371, + "learning_rate": 4.9967191958090656e-05, + "loss": 0.4444, + "num_input_tokens_seen": 8931104, + "step": 15400 + }, + { + "epoch": 2.2944593386952636, + "grad_norm": 8.15903091430664, + "learning_rate": 4.996702532982034e-05, + "loss": 0.4322, + "num_input_tokens_seen": 8933984, + "step": 15405 + }, + { + "epoch": 2.295204051236223, + "grad_norm": 10.652965545654297, + "learning_rate": 4.99668582797575e-05, + "loss": 0.5614, + "num_input_tokens_seen": 8936832, + "step": 15410 + }, + { + "epoch": 2.295948763777182, + "grad_norm": 12.303750991821289, + "learning_rate": 4.996669080790498e-05, + "loss": 0.5491, + "num_input_tokens_seen": 8939616, + "step": 15415 + }, + { + "epoch": 2.2966934763181412, + "grad_norm": 10.389885902404785, + "learning_rate": 4.996652291426559e-05, + "loss": 0.6722, + "num_input_tokens_seen": 8942720, + "step": 15420 + }, + { + "epoch": 2.2974381888591004, + "grad_norm": 7.063166618347168, + "learning_rate": 4.996635459884216e-05, + "loss": 0.3605, + "num_input_tokens_seen": 8945504, + "step": 15425 + }, + { + "epoch": 2.2981829014000597, + "grad_norm": 11.308326721191406, + "learning_rate": 4.996618586163755e-05, + "loss": 0.5289, + "num_input_tokens_seen": 8948160, + "step": 15430 + }, + { + "epoch": 2.298927613941019, + "grad_norm": 3.3200037479400635, + "learning_rate": 4.996601670265461e-05, + "loss": 0.8047, + "num_input_tokens_seen": 8950912, + "step": 15435 + }, + { + "epoch": 2.299672326481978, + "grad_norm": 5.288147449493408, + "learning_rate": 4.996584712189618e-05, + "loss": 0.6603, + "num_input_tokens_seen": 8954048, + "step": 15440 + }, + { + "epoch": 2.3004170390229373, + "grad_norm": 7.1121134757995605, + "learning_rate": 4.996567711936515e-05, + "loss": 0.54, + "num_input_tokens_seen": 8956832, + "step": 15445 + }, + { + "epoch": 2.3011617515638965, + "grad_norm": 2.470385789871216, + "learning_rate": 4.996550669506438e-05, + "loss": 0.4112, + "num_input_tokens_seen": 8959808, + "step": 15450 + }, + { + "epoch": 2.3019064641048557, + "grad_norm": 5.951114654541016, + "learning_rate": 4.996533584899674e-05, + "loss": 0.6768, + "num_input_tokens_seen": 8962752, + "step": 15455 + }, + { + "epoch": 2.302651176645815, + "grad_norm": 6.828399658203125, + "learning_rate": 4.996516458116512e-05, + "loss": 0.6904, + "num_input_tokens_seen": 8965824, + "step": 15460 + }, + { + "epoch": 2.303395889186774, + "grad_norm": 7.270219326019287, + "learning_rate": 4.9964992891572425e-05, + "loss": 0.3737, + "num_input_tokens_seen": 8968608, + "step": 15465 + }, + { + "epoch": 2.3041406017277333, + "grad_norm": 14.161460876464844, + "learning_rate": 4.996482078022155e-05, + "loss": 0.5871, + "num_input_tokens_seen": 8971488, + "step": 15470 + }, + { + "epoch": 2.3048853142686925, + "grad_norm": 7.364686489105225, + "learning_rate": 4.9964648247115395e-05, + "loss": 0.6008, + "num_input_tokens_seen": 8974368, + "step": 15475 + }, + { + "epoch": 2.3056300268096512, + "grad_norm": 7.195587158203125, + "learning_rate": 4.9964475292256884e-05, + "loss": 0.6695, + "num_input_tokens_seen": 8977280, + "step": 15480 + }, + { + "epoch": 2.306374739350611, + "grad_norm": 6.293237686157227, + "learning_rate": 4.996430191564894e-05, + "loss": 0.4378, + "num_input_tokens_seen": 8980256, + "step": 15485 + }, + { + "epoch": 2.3071194518915696, + "grad_norm": 16.245317459106445, + "learning_rate": 4.996412811729448e-05, + "loss": 0.4841, + "num_input_tokens_seen": 8982912, + "step": 15490 + }, + { + "epoch": 2.3078641644325293, + "grad_norm": 10.283628463745117, + "learning_rate": 4.996395389719646e-05, + "loss": 0.4285, + "num_input_tokens_seen": 8985824, + "step": 15495 + }, + { + "epoch": 2.308608876973488, + "grad_norm": 8.446724891662598, + "learning_rate": 4.99637792553578e-05, + "loss": 0.57, + "num_input_tokens_seen": 8988672, + "step": 15500 + }, + { + "epoch": 2.3093535895144472, + "grad_norm": 5.912455081939697, + "learning_rate": 4.996360419178147e-05, + "loss": 0.4989, + "num_input_tokens_seen": 8991424, + "step": 15505 + }, + { + "epoch": 2.3100983020554064, + "grad_norm": 10.936056137084961, + "learning_rate": 4.9963428706470405e-05, + "loss": 0.6786, + "num_input_tokens_seen": 8994400, + "step": 15510 + }, + { + "epoch": 2.3108430145963657, + "grad_norm": 6.641228199005127, + "learning_rate": 4.9963252799427594e-05, + "loss": 0.4211, + "num_input_tokens_seen": 8997152, + "step": 15515 + }, + { + "epoch": 2.311587727137325, + "grad_norm": 3.1281471252441406, + "learning_rate": 4.9963076470655995e-05, + "loss": 0.5459, + "num_input_tokens_seen": 8999840, + "step": 15520 + }, + { + "epoch": 2.312332439678284, + "grad_norm": 7.387338161468506, + "learning_rate": 4.996289972015859e-05, + "loss": 0.5054, + "num_input_tokens_seen": 9002720, + "step": 15525 + }, + { + "epoch": 2.3130771522192433, + "grad_norm": 9.973636627197266, + "learning_rate": 4.9962722547938365e-05, + "loss": 0.5186, + "num_input_tokens_seen": 9005856, + "step": 15530 + }, + { + "epoch": 2.3138218647602025, + "grad_norm": 8.453790664672852, + "learning_rate": 4.9962544953998316e-05, + "loss": 0.5913, + "num_input_tokens_seen": 9008928, + "step": 15535 + }, + { + "epoch": 2.3145665773011617, + "grad_norm": 15.916231155395508, + "learning_rate": 4.996236693834144e-05, + "loss": 0.4867, + "num_input_tokens_seen": 9011680, + "step": 15540 + }, + { + "epoch": 2.315311289842121, + "grad_norm": 4.791933059692383, + "learning_rate": 4.996218850097075e-05, + "loss": 0.2726, + "num_input_tokens_seen": 9014752, + "step": 15545 + }, + { + "epoch": 2.31605600238308, + "grad_norm": 5.405089855194092, + "learning_rate": 4.996200964188925e-05, + "loss": 0.4485, + "num_input_tokens_seen": 9017984, + "step": 15550 + }, + { + "epoch": 2.3168007149240393, + "grad_norm": 5.550866603851318, + "learning_rate": 4.996183036109997e-05, + "loss": 0.5012, + "num_input_tokens_seen": 9020992, + "step": 15555 + }, + { + "epoch": 2.3175454274649985, + "grad_norm": 9.44770336151123, + "learning_rate": 4.996165065860594e-05, + "loss": 0.8583, + "num_input_tokens_seen": 9023936, + "step": 15560 + }, + { + "epoch": 2.3182901400059577, + "grad_norm": 10.878296852111816, + "learning_rate": 4.996147053441018e-05, + "loss": 0.4693, + "num_input_tokens_seen": 9026784, + "step": 15565 + }, + { + "epoch": 2.319034852546917, + "grad_norm": 6.709068775177002, + "learning_rate": 4.996128998851575e-05, + "loss": 0.563, + "num_input_tokens_seen": 9029696, + "step": 15570 + }, + { + "epoch": 2.319779565087876, + "grad_norm": 6.047092437744141, + "learning_rate": 4.99611090209257e-05, + "loss": 0.5541, + "num_input_tokens_seen": 9032736, + "step": 15575 + }, + { + "epoch": 2.3205242776288353, + "grad_norm": 7.038660049438477, + "learning_rate": 4.9960927631643086e-05, + "loss": 0.488, + "num_input_tokens_seen": 9035680, + "step": 15580 + }, + { + "epoch": 2.3212689901697945, + "grad_norm": 3.286249876022339, + "learning_rate": 4.996074582067096e-05, + "loss": 0.4588, + "num_input_tokens_seen": 9038464, + "step": 15585 + }, + { + "epoch": 2.3220137027107537, + "grad_norm": 4.300003528594971, + "learning_rate": 4.9960563588012396e-05, + "loss": 0.4713, + "num_input_tokens_seen": 9041312, + "step": 15590 + }, + { + "epoch": 2.322758415251713, + "grad_norm": 5.00648832321167, + "learning_rate": 4.9960380933670495e-05, + "loss": 0.4358, + "num_input_tokens_seen": 9043968, + "step": 15595 + }, + { + "epoch": 2.323503127792672, + "grad_norm": 15.774779319763184, + "learning_rate": 4.996019785764832e-05, + "loss": 0.6648, + "num_input_tokens_seen": 9046784, + "step": 15600 + }, + { + "epoch": 2.3242478403336313, + "grad_norm": 9.113486289978027, + "learning_rate": 4.996001435994897e-05, + "loss": 0.5194, + "num_input_tokens_seen": 9049504, + "step": 15605 + }, + { + "epoch": 2.3249925528745905, + "grad_norm": 10.4730806350708, + "learning_rate": 4.995983044057554e-05, + "loss": 0.3555, + "num_input_tokens_seen": 9052544, + "step": 15610 + }, + { + "epoch": 2.3257372654155497, + "grad_norm": 8.351147651672363, + "learning_rate": 4.9959646099531156e-05, + "loss": 0.5545, + "num_input_tokens_seen": 9055424, + "step": 15615 + }, + { + "epoch": 2.326481977956509, + "grad_norm": 6.138033866882324, + "learning_rate": 4.99594613368189e-05, + "loss": 0.5386, + "num_input_tokens_seen": 9058528, + "step": 15620 + }, + { + "epoch": 2.327226690497468, + "grad_norm": 9.701777458190918, + "learning_rate": 4.995927615244193e-05, + "loss": 0.5904, + "num_input_tokens_seen": 9061280, + "step": 15625 + }, + { + "epoch": 2.3279714030384273, + "grad_norm": 19.788358688354492, + "learning_rate": 4.9959090546403356e-05, + "loss": 0.5585, + "num_input_tokens_seen": 9064160, + "step": 15630 + }, + { + "epoch": 2.3287161155793865, + "grad_norm": 8.946006774902344, + "learning_rate": 4.9958904518706305e-05, + "loss": 0.685, + "num_input_tokens_seen": 9067328, + "step": 15635 + }, + { + "epoch": 2.3294608281203457, + "grad_norm": 17.62689971923828, + "learning_rate": 4.9958718069353935e-05, + "loss": 0.5715, + "num_input_tokens_seen": 9070112, + "step": 15640 + }, + { + "epoch": 2.330205540661305, + "grad_norm": 10.630353927612305, + "learning_rate": 4.9958531198349384e-05, + "loss": 0.507, + "num_input_tokens_seen": 9072800, + "step": 15645 + }, + { + "epoch": 2.330950253202264, + "grad_norm": 6.413597583770752, + "learning_rate": 4.9958343905695823e-05, + "loss": 0.5524, + "num_input_tokens_seen": 9075776, + "step": 15650 + }, + { + "epoch": 2.331694965743223, + "grad_norm": 20.52243995666504, + "learning_rate": 4.99581561913964e-05, + "loss": 0.5892, + "num_input_tokens_seen": 9078944, + "step": 15655 + }, + { + "epoch": 2.3324396782841825, + "grad_norm": 6.582764625549316, + "learning_rate": 4.99579680554543e-05, + "loss": 0.5801, + "num_input_tokens_seen": 9081984, + "step": 15660 + }, + { + "epoch": 2.3331843908251413, + "grad_norm": 8.020626068115234, + "learning_rate": 4.99577794978727e-05, + "loss": 0.495, + "num_input_tokens_seen": 9084992, + "step": 15665 + }, + { + "epoch": 2.333929103366101, + "grad_norm": 3.9126665592193604, + "learning_rate": 4.995759051865477e-05, + "loss": 0.4954, + "num_input_tokens_seen": 9088032, + "step": 15670 + }, + { + "epoch": 2.3346738159070597, + "grad_norm": 5.324757099151611, + "learning_rate": 4.995740111780372e-05, + "loss": 0.5049, + "num_input_tokens_seen": 9090912, + "step": 15675 + }, + { + "epoch": 2.335418528448019, + "grad_norm": 2.7942254543304443, + "learning_rate": 4.995721129532275e-05, + "loss": 0.5893, + "num_input_tokens_seen": 9093760, + "step": 15680 + }, + { + "epoch": 2.336163240988978, + "grad_norm": 3.950309991836548, + "learning_rate": 4.9957021051215055e-05, + "loss": 0.4879, + "num_input_tokens_seen": 9096640, + "step": 15685 + }, + { + "epoch": 2.3369079535299373, + "grad_norm": 7.0915045738220215, + "learning_rate": 4.995683038548385e-05, + "loss": 0.5035, + "num_input_tokens_seen": 9099648, + "step": 15690 + }, + { + "epoch": 2.3376526660708965, + "grad_norm": 7.992865562438965, + "learning_rate": 4.995663929813237e-05, + "loss": 0.5971, + "num_input_tokens_seen": 9102432, + "step": 15695 + }, + { + "epoch": 2.3383973786118557, + "grad_norm": 15.78685474395752, + "learning_rate": 4.995644778916383e-05, + "loss": 0.5647, + "num_input_tokens_seen": 9105088, + "step": 15700 + }, + { + "epoch": 2.339142091152815, + "grad_norm": 11.694342613220215, + "learning_rate": 4.995625585858146e-05, + "loss": 0.5818, + "num_input_tokens_seen": 9107776, + "step": 15705 + }, + { + "epoch": 2.339886803693774, + "grad_norm": 11.606334686279297, + "learning_rate": 4.9956063506388524e-05, + "loss": 0.5715, + "num_input_tokens_seen": 9110656, + "step": 15710 + }, + { + "epoch": 2.3406315162347333, + "grad_norm": 7.787665843963623, + "learning_rate": 4.995587073258825e-05, + "loss": 0.4724, + "num_input_tokens_seen": 9113536, + "step": 15715 + }, + { + "epoch": 2.3413762287756925, + "grad_norm": 6.95513916015625, + "learning_rate": 4.995567753718391e-05, + "loss": 0.6183, + "num_input_tokens_seen": 9116576, + "step": 15720 + }, + { + "epoch": 2.3421209413166517, + "grad_norm": 6.263240814208984, + "learning_rate": 4.995548392017876e-05, + "loss": 0.4398, + "num_input_tokens_seen": 9119808, + "step": 15725 + }, + { + "epoch": 2.342865653857611, + "grad_norm": 11.21480655670166, + "learning_rate": 4.995528988157608e-05, + "loss": 0.5457, + "num_input_tokens_seen": 9122816, + "step": 15730 + }, + { + "epoch": 2.34361036639857, + "grad_norm": 10.465643882751465, + "learning_rate": 4.995509542137913e-05, + "loss": 0.444, + "num_input_tokens_seen": 9125408, + "step": 15735 + }, + { + "epoch": 2.3443550789395293, + "grad_norm": 13.061259269714355, + "learning_rate": 4.995490053959121e-05, + "loss": 0.6599, + "num_input_tokens_seen": 9128384, + "step": 15740 + }, + { + "epoch": 2.3450997914804885, + "grad_norm": 4.060069561004639, + "learning_rate": 4.995470523621561e-05, + "loss": 0.5388, + "num_input_tokens_seen": 9131104, + "step": 15745 + }, + { + "epoch": 2.3458445040214477, + "grad_norm": 9.068321228027344, + "learning_rate": 4.9954509511255625e-05, + "loss": 0.6158, + "num_input_tokens_seen": 9133792, + "step": 15750 + }, + { + "epoch": 2.346589216562407, + "grad_norm": 4.150508403778076, + "learning_rate": 4.9954313364714565e-05, + "loss": 0.5185, + "num_input_tokens_seen": 9136800, + "step": 15755 + }, + { + "epoch": 2.347333929103366, + "grad_norm": 8.694531440734863, + "learning_rate": 4.9954116796595754e-05, + "loss": 0.5598, + "num_input_tokens_seen": 9139840, + "step": 15760 + }, + { + "epoch": 2.3480786416443253, + "grad_norm": 2.579299211502075, + "learning_rate": 4.9953919806902486e-05, + "loss": 0.4564, + "num_input_tokens_seen": 9142656, + "step": 15765 + }, + { + "epoch": 2.3488233541852845, + "grad_norm": 16.111013412475586, + "learning_rate": 4.9953722395638115e-05, + "loss": 0.7776, + "num_input_tokens_seen": 9145600, + "step": 15770 + }, + { + "epoch": 2.3495680667262437, + "grad_norm": 5.786339282989502, + "learning_rate": 4.995352456280596e-05, + "loss": 0.6392, + "num_input_tokens_seen": 9148672, + "step": 15775 + }, + { + "epoch": 2.350312779267203, + "grad_norm": 3.413203239440918, + "learning_rate": 4.9953326308409364e-05, + "loss": 0.4491, + "num_input_tokens_seen": 9151680, + "step": 15780 + }, + { + "epoch": 2.351057491808162, + "grad_norm": 2.751650333404541, + "learning_rate": 4.9953127632451694e-05, + "loss": 0.3994, + "num_input_tokens_seen": 9154848, + "step": 15785 + }, + { + "epoch": 2.3518022043491214, + "grad_norm": 8.623340606689453, + "learning_rate": 4.995292853493629e-05, + "loss": 0.3544, + "num_input_tokens_seen": 9157536, + "step": 15790 + }, + { + "epoch": 2.3525469168900806, + "grad_norm": 3.6518781185150146, + "learning_rate": 4.995272901586652e-05, + "loss": 0.8325, + "num_input_tokens_seen": 9160576, + "step": 15795 + }, + { + "epoch": 2.3532916294310398, + "grad_norm": 6.700688362121582, + "learning_rate": 4.9952529075245744e-05, + "loss": 0.6157, + "num_input_tokens_seen": 9163680, + "step": 15800 + }, + { + "epoch": 2.354036341971999, + "grad_norm": 7.99701452255249, + "learning_rate": 4.995232871307736e-05, + "loss": 0.601, + "num_input_tokens_seen": 9166912, + "step": 15805 + }, + { + "epoch": 2.354781054512958, + "grad_norm": 8.529891014099121, + "learning_rate": 4.9952127929364746e-05, + "loss": 0.5616, + "num_input_tokens_seen": 9170048, + "step": 15810 + }, + { + "epoch": 2.3555257670539174, + "grad_norm": 8.171086311340332, + "learning_rate": 4.995192672411128e-05, + "loss": 0.6194, + "num_input_tokens_seen": 9173152, + "step": 15815 + }, + { + "epoch": 2.3562704795948766, + "grad_norm": 18.773773193359375, + "learning_rate": 4.995172509732038e-05, + "loss": 0.6963, + "num_input_tokens_seen": 9176064, + "step": 15820 + }, + { + "epoch": 2.3570151921358358, + "grad_norm": 9.769401550292969, + "learning_rate": 4.995152304899544e-05, + "loss": 0.6286, + "num_input_tokens_seen": 9178656, + "step": 15825 + }, + { + "epoch": 2.3577599046767945, + "grad_norm": 6.796291828155518, + "learning_rate": 4.9951320579139884e-05, + "loss": 0.4417, + "num_input_tokens_seen": 9181536, + "step": 15830 + }, + { + "epoch": 2.358504617217754, + "grad_norm": 8.895793914794922, + "learning_rate": 4.995111768775712e-05, + "loss": 0.5784, + "num_input_tokens_seen": 9184768, + "step": 15835 + }, + { + "epoch": 2.359249329758713, + "grad_norm": 6.070554256439209, + "learning_rate": 4.995091437485058e-05, + "loss": 0.3999, + "num_input_tokens_seen": 9187616, + "step": 15840 + }, + { + "epoch": 2.359994042299672, + "grad_norm": 2.7703468799591064, + "learning_rate": 4.9950710640423705e-05, + "loss": 0.449, + "num_input_tokens_seen": 9190560, + "step": 15845 + }, + { + "epoch": 2.3607387548406313, + "grad_norm": 3.752728223800659, + "learning_rate": 4.9950506484479934e-05, + "loss": 0.4059, + "num_input_tokens_seen": 9193248, + "step": 15850 + }, + { + "epoch": 2.3614834673815905, + "grad_norm": 6.2288079261779785, + "learning_rate": 4.995030190702271e-05, + "loss": 0.5979, + "num_input_tokens_seen": 9195936, + "step": 15855 + }, + { + "epoch": 2.3622281799225497, + "grad_norm": 9.66140079498291, + "learning_rate": 4.99500969080555e-05, + "loss": 0.4421, + "num_input_tokens_seen": 9198560, + "step": 15860 + }, + { + "epoch": 2.362972892463509, + "grad_norm": 10.75510025024414, + "learning_rate": 4.994989148758176e-05, + "loss": 0.4989, + "num_input_tokens_seen": 9201344, + "step": 15865 + }, + { + "epoch": 2.363717605004468, + "grad_norm": 6.48328971862793, + "learning_rate": 4.994968564560495e-05, + "loss": 0.6544, + "num_input_tokens_seen": 9204448, + "step": 15870 + }, + { + "epoch": 2.3644623175454274, + "grad_norm": 12.475186347961426, + "learning_rate": 4.994947938212857e-05, + "loss": 0.4277, + "num_input_tokens_seen": 9207360, + "step": 15875 + }, + { + "epoch": 2.3652070300863866, + "grad_norm": 7.595383644104004, + "learning_rate": 4.994927269715609e-05, + "loss": 0.361, + "num_input_tokens_seen": 9210176, + "step": 15880 + }, + { + "epoch": 2.3659517426273458, + "grad_norm": 15.34066390991211, + "learning_rate": 4.9949065590691e-05, + "loss": 0.6356, + "num_input_tokens_seen": 9213376, + "step": 15885 + }, + { + "epoch": 2.366696455168305, + "grad_norm": 3.962968111038208, + "learning_rate": 4.9948858062736814e-05, + "loss": 0.4644, + "num_input_tokens_seen": 9216416, + "step": 15890 + }, + { + "epoch": 2.367441167709264, + "grad_norm": 9.384471893310547, + "learning_rate": 4.994865011329702e-05, + "loss": 0.416, + "num_input_tokens_seen": 9219424, + "step": 15895 + }, + { + "epoch": 2.3681858802502234, + "grad_norm": 5.569833278656006, + "learning_rate": 4.994844174237514e-05, + "loss": 0.5281, + "num_input_tokens_seen": 9222208, + "step": 15900 + }, + { + "epoch": 2.3689305927911826, + "grad_norm": 7.373844146728516, + "learning_rate": 4.99482329499747e-05, + "loss": 0.5112, + "num_input_tokens_seen": 9224960, + "step": 15905 + }, + { + "epoch": 2.3696753053321418, + "grad_norm": 9.74526596069336, + "learning_rate": 4.994802373609922e-05, + "loss": 0.3867, + "num_input_tokens_seen": 9227776, + "step": 15910 + }, + { + "epoch": 2.370420017873101, + "grad_norm": 3.8649399280548096, + "learning_rate": 4.9947814100752226e-05, + "loss": 0.536, + "num_input_tokens_seen": 9230432, + "step": 15915 + }, + { + "epoch": 2.37116473041406, + "grad_norm": 5.80338191986084, + "learning_rate": 4.994760404393727e-05, + "loss": 0.5673, + "num_input_tokens_seen": 9233440, + "step": 15920 + }, + { + "epoch": 2.3719094429550194, + "grad_norm": 5.506563663482666, + "learning_rate": 4.994739356565791e-05, + "loss": 0.3738, + "num_input_tokens_seen": 9236288, + "step": 15925 + }, + { + "epoch": 2.3726541554959786, + "grad_norm": 7.911239147186279, + "learning_rate": 4.994718266591768e-05, + "loss": 0.4095, + "num_input_tokens_seen": 9238976, + "step": 15930 + }, + { + "epoch": 2.373398868036938, + "grad_norm": 6.794709205627441, + "learning_rate": 4.994697134472016e-05, + "loss": 0.5871, + "num_input_tokens_seen": 9241888, + "step": 15935 + }, + { + "epoch": 2.374143580577897, + "grad_norm": 18.756513595581055, + "learning_rate": 4.994675960206891e-05, + "loss": 0.3244, + "num_input_tokens_seen": 9244736, + "step": 15940 + }, + { + "epoch": 2.374888293118856, + "grad_norm": 9.623908996582031, + "learning_rate": 4.9946547437967515e-05, + "loss": 0.6605, + "num_input_tokens_seen": 9247712, + "step": 15945 + }, + { + "epoch": 2.3756330056598154, + "grad_norm": 5.256226062774658, + "learning_rate": 4.9946334852419555e-05, + "loss": 0.4584, + "num_input_tokens_seen": 9250816, + "step": 15950 + }, + { + "epoch": 2.3763777182007746, + "grad_norm": 4.576859474182129, + "learning_rate": 4.9946121845428616e-05, + "loss": 0.4579, + "num_input_tokens_seen": 9253536, + "step": 15955 + }, + { + "epoch": 2.377122430741734, + "grad_norm": 10.311059951782227, + "learning_rate": 4.994590841699831e-05, + "loss": 0.4985, + "num_input_tokens_seen": 9256544, + "step": 15960 + }, + { + "epoch": 2.377867143282693, + "grad_norm": 10.313130378723145, + "learning_rate": 4.9945694567132227e-05, + "loss": 0.694, + "num_input_tokens_seen": 9259360, + "step": 15965 + }, + { + "epoch": 2.378611855823652, + "grad_norm": 13.77441120147705, + "learning_rate": 4.9945480295834e-05, + "loss": 0.6872, + "num_input_tokens_seen": 9262208, + "step": 15970 + }, + { + "epoch": 2.3793565683646114, + "grad_norm": 9.44268798828125, + "learning_rate": 4.994526560310723e-05, + "loss": 0.4288, + "num_input_tokens_seen": 9265280, + "step": 15975 + }, + { + "epoch": 2.3801012809055706, + "grad_norm": 5.47321891784668, + "learning_rate": 4.994505048895555e-05, + "loss": 0.399, + "num_input_tokens_seen": 9268000, + "step": 15980 + }, + { + "epoch": 2.38084599344653, + "grad_norm": 13.081036567687988, + "learning_rate": 4.99448349533826e-05, + "loss": 0.5137, + "num_input_tokens_seen": 9271008, + "step": 15985 + }, + { + "epoch": 2.381590705987489, + "grad_norm": 4.258434295654297, + "learning_rate": 4.9944618996392014e-05, + "loss": 0.6027, + "num_input_tokens_seen": 9274368, + "step": 15990 + }, + { + "epoch": 2.382335418528448, + "grad_norm": 12.849864959716797, + "learning_rate": 4.994440261798743e-05, + "loss": 0.4921, + "num_input_tokens_seen": 9277344, + "step": 15995 + }, + { + "epoch": 2.3830801310694074, + "grad_norm": 18.576412200927734, + "learning_rate": 4.994418581817254e-05, + "loss": 0.6446, + "num_input_tokens_seen": 9280160, + "step": 16000 + }, + { + "epoch": 2.383824843610366, + "grad_norm": 11.613269805908203, + "learning_rate": 4.994396859695096e-05, + "loss": 0.6009, + "num_input_tokens_seen": 9282912, + "step": 16005 + }, + { + "epoch": 2.384569556151326, + "grad_norm": 11.253809928894043, + "learning_rate": 4.99437509543264e-05, + "loss": 0.6513, + "num_input_tokens_seen": 9285600, + "step": 16010 + }, + { + "epoch": 2.3853142686922846, + "grad_norm": 7.886337757110596, + "learning_rate": 4.994353289030251e-05, + "loss": 0.6066, + "num_input_tokens_seen": 9288576, + "step": 16015 + }, + { + "epoch": 2.386058981233244, + "grad_norm": 4.711582660675049, + "learning_rate": 4.994331440488298e-05, + "loss": 0.4542, + "num_input_tokens_seen": 9291552, + "step": 16020 + }, + { + "epoch": 2.386803693774203, + "grad_norm": 6.658456325531006, + "learning_rate": 4.994309549807151e-05, + "loss": 0.6267, + "num_input_tokens_seen": 9294528, + "step": 16025 + }, + { + "epoch": 2.387548406315162, + "grad_norm": 7.328498840332031, + "learning_rate": 4.9942876169871794e-05, + "loss": 0.4993, + "num_input_tokens_seen": 9297440, + "step": 16030 + }, + { + "epoch": 2.3882931188561214, + "grad_norm": 8.803662300109863, + "learning_rate": 4.9942656420287535e-05, + "loss": 0.5563, + "num_input_tokens_seen": 9300256, + "step": 16035 + }, + { + "epoch": 2.3890378313970806, + "grad_norm": 9.15688419342041, + "learning_rate": 4.9942436249322444e-05, + "loss": 0.4873, + "num_input_tokens_seen": 9303392, + "step": 16040 + }, + { + "epoch": 2.38978254393804, + "grad_norm": 8.541722297668457, + "learning_rate": 4.994221565698025e-05, + "loss": 0.6561, + "num_input_tokens_seen": 9306432, + "step": 16045 + }, + { + "epoch": 2.390527256478999, + "grad_norm": 3.385509490966797, + "learning_rate": 4.9941994643264665e-05, + "loss": 0.3992, + "num_input_tokens_seen": 9309312, + "step": 16050 + }, + { + "epoch": 2.391271969019958, + "grad_norm": 7.005033493041992, + "learning_rate": 4.994177320817943e-05, + "loss": 0.4886, + "num_input_tokens_seen": 9312544, + "step": 16055 + }, + { + "epoch": 2.3920166815609174, + "grad_norm": 2.5769448280334473, + "learning_rate": 4.9941551351728286e-05, + "loss": 0.3117, + "num_input_tokens_seen": 9315520, + "step": 16060 + }, + { + "epoch": 2.3927613941018766, + "grad_norm": 5.350158214569092, + "learning_rate": 4.994132907391499e-05, + "loss": 0.5934, + "num_input_tokens_seen": 9318336, + "step": 16065 + }, + { + "epoch": 2.393506106642836, + "grad_norm": 8.4320650100708, + "learning_rate": 4.994110637474327e-05, + "loss": 0.8251, + "num_input_tokens_seen": 9321600, + "step": 16070 + }, + { + "epoch": 2.394250819183795, + "grad_norm": 5.712413311004639, + "learning_rate": 4.994088325421693e-05, + "loss": 0.5187, + "num_input_tokens_seen": 9324384, + "step": 16075 + }, + { + "epoch": 2.394995531724754, + "grad_norm": 9.18840217590332, + "learning_rate": 4.99406597123397e-05, + "loss": 0.4901, + "num_input_tokens_seen": 9327744, + "step": 16080 + }, + { + "epoch": 2.3957402442657134, + "grad_norm": 3.616295576095581, + "learning_rate": 4.994043574911538e-05, + "loss": 0.2568, + "num_input_tokens_seen": 9330560, + "step": 16085 + }, + { + "epoch": 2.3964849568066726, + "grad_norm": 12.042734146118164, + "learning_rate": 4.9940211364547744e-05, + "loss": 0.5396, + "num_input_tokens_seen": 9333952, + "step": 16090 + }, + { + "epoch": 2.397229669347632, + "grad_norm": 3.242896318435669, + "learning_rate": 4.9939986558640585e-05, + "loss": 0.3063, + "num_input_tokens_seen": 9336992, + "step": 16095 + }, + { + "epoch": 2.397974381888591, + "grad_norm": 12.599398612976074, + "learning_rate": 4.99397613313977e-05, + "loss": 0.8592, + "num_input_tokens_seen": 9340096, + "step": 16100 + }, + { + "epoch": 2.3987190944295502, + "grad_norm": 16.622020721435547, + "learning_rate": 4.99395356828229e-05, + "loss": 0.4086, + "num_input_tokens_seen": 9343104, + "step": 16105 + }, + { + "epoch": 2.3994638069705094, + "grad_norm": 5.22086763381958, + "learning_rate": 4.993930961291999e-05, + "loss": 0.6175, + "num_input_tokens_seen": 9346240, + "step": 16110 + }, + { + "epoch": 2.4002085195114686, + "grad_norm": 12.418217658996582, + "learning_rate": 4.993908312169279e-05, + "loss": 0.673, + "num_input_tokens_seen": 9349056, + "step": 16115 + }, + { + "epoch": 2.400953232052428, + "grad_norm": 8.26197624206543, + "learning_rate": 4.9938856209145123e-05, + "loss": 0.5348, + "num_input_tokens_seen": 9352000, + "step": 16120 + }, + { + "epoch": 2.401697944593387, + "grad_norm": 9.65678596496582, + "learning_rate": 4.993862887528083e-05, + "loss": 0.5152, + "num_input_tokens_seen": 9354880, + "step": 16125 + }, + { + "epoch": 2.4024426571343462, + "grad_norm": 2.5696771144866943, + "learning_rate": 4.9938401120103755e-05, + "loss": 0.3747, + "num_input_tokens_seen": 9357792, + "step": 16130 + }, + { + "epoch": 2.4031873696753054, + "grad_norm": 9.45675277709961, + "learning_rate": 4.9938172943617735e-05, + "loss": 0.215, + "num_input_tokens_seen": 9360608, + "step": 16135 + }, + { + "epoch": 2.4039320822162646, + "grad_norm": 5.8023810386657715, + "learning_rate": 4.993794434582663e-05, + "loss": 0.3455, + "num_input_tokens_seen": 9363744, + "step": 16140 + }, + { + "epoch": 2.404676794757224, + "grad_norm": 4.709512233734131, + "learning_rate": 4.99377153267343e-05, + "loss": 0.5675, + "num_input_tokens_seen": 9366912, + "step": 16145 + }, + { + "epoch": 2.405421507298183, + "grad_norm": 11.220211029052734, + "learning_rate": 4.9937485886344614e-05, + "loss": 0.4121, + "num_input_tokens_seen": 9369856, + "step": 16150 + }, + { + "epoch": 2.4061662198391423, + "grad_norm": 8.757868766784668, + "learning_rate": 4.993725602466145e-05, + "loss": 0.4137, + "num_input_tokens_seen": 9372640, + "step": 16155 + }, + { + "epoch": 2.4069109323801015, + "grad_norm": 4.340311050415039, + "learning_rate": 4.9937025741688694e-05, + "loss": 0.4682, + "num_input_tokens_seen": 9375648, + "step": 16160 + }, + { + "epoch": 2.4076556449210607, + "grad_norm": 17.67807960510254, + "learning_rate": 4.993679503743023e-05, + "loss": 0.4886, + "num_input_tokens_seen": 9378720, + "step": 16165 + }, + { + "epoch": 2.4084003574620194, + "grad_norm": 10.188387870788574, + "learning_rate": 4.993656391188995e-05, + "loss": 0.3164, + "num_input_tokens_seen": 9381632, + "step": 16170 + }, + { + "epoch": 2.409145070002979, + "grad_norm": 7.746921539306641, + "learning_rate": 4.993633236507178e-05, + "loss": 0.6597, + "num_input_tokens_seen": 9384192, + "step": 16175 + }, + { + "epoch": 2.409889782543938, + "grad_norm": 3.6039388179779053, + "learning_rate": 4.9936100396979614e-05, + "loss": 0.3507, + "num_input_tokens_seen": 9387200, + "step": 16180 + }, + { + "epoch": 2.4106344950848975, + "grad_norm": 8.841841697692871, + "learning_rate": 4.993586800761738e-05, + "loss": 0.4572, + "num_input_tokens_seen": 9389888, + "step": 16185 + }, + { + "epoch": 2.4113792076258562, + "grad_norm": 11.154200553894043, + "learning_rate": 4.9935635196989e-05, + "loss": 0.3354, + "num_input_tokens_seen": 9392992, + "step": 16190 + }, + { + "epoch": 2.4121239201668154, + "grad_norm": 5.52909517288208, + "learning_rate": 4.9935401965098395e-05, + "loss": 0.4822, + "num_input_tokens_seen": 9395744, + "step": 16195 + }, + { + "epoch": 2.4128686327077746, + "grad_norm": 7.673958778381348, + "learning_rate": 4.9935168311949524e-05, + "loss": 0.6663, + "num_input_tokens_seen": 9398560, + "step": 16200 + }, + { + "epoch": 2.413613345248734, + "grad_norm": 5.880404949188232, + "learning_rate": 4.9934934237546326e-05, + "loss": 0.452, + "num_input_tokens_seen": 9401568, + "step": 16205 + }, + { + "epoch": 2.414358057789693, + "grad_norm": 3.181967258453369, + "learning_rate": 4.993469974189275e-05, + "loss": 0.6208, + "num_input_tokens_seen": 9404640, + "step": 16210 + }, + { + "epoch": 2.4151027703306522, + "grad_norm": 15.924302101135254, + "learning_rate": 4.993446482499278e-05, + "loss": 0.5467, + "num_input_tokens_seen": 9407776, + "step": 16215 + }, + { + "epoch": 2.4158474828716114, + "grad_norm": 9.445512771606445, + "learning_rate": 4.993422948685036e-05, + "loss": 0.7057, + "num_input_tokens_seen": 9410432, + "step": 16220 + }, + { + "epoch": 2.4165921954125706, + "grad_norm": 8.80227279663086, + "learning_rate": 4.993399372746948e-05, + "loss": 0.7018, + "num_input_tokens_seen": 9413280, + "step": 16225 + }, + { + "epoch": 2.41733690795353, + "grad_norm": 10.468714714050293, + "learning_rate": 4.9933757546854115e-05, + "loss": 0.4497, + "num_input_tokens_seen": 9416032, + "step": 16230 + }, + { + "epoch": 2.418081620494489, + "grad_norm": 7.002691268920898, + "learning_rate": 4.993352094500825e-05, + "loss": 0.4144, + "num_input_tokens_seen": 9419392, + "step": 16235 + }, + { + "epoch": 2.4188263330354483, + "grad_norm": 4.737176418304443, + "learning_rate": 4.99332839219359e-05, + "loss": 0.3751, + "num_input_tokens_seen": 9422080, + "step": 16240 + }, + { + "epoch": 2.4195710455764075, + "grad_norm": 4.817221164703369, + "learning_rate": 4.993304647764106e-05, + "loss": 0.4706, + "num_input_tokens_seen": 9424896, + "step": 16245 + }, + { + "epoch": 2.4203157581173667, + "grad_norm": 10.661848068237305, + "learning_rate": 4.993280861212773e-05, + "loss": 0.4372, + "num_input_tokens_seen": 9427904, + "step": 16250 + }, + { + "epoch": 2.421060470658326, + "grad_norm": 8.93978214263916, + "learning_rate": 4.993257032539995e-05, + "loss": 0.6526, + "num_input_tokens_seen": 9430944, + "step": 16255 + }, + { + "epoch": 2.421805183199285, + "grad_norm": 8.200128555297852, + "learning_rate": 4.993233161746174e-05, + "loss": 0.5656, + "num_input_tokens_seen": 9433952, + "step": 16260 + }, + { + "epoch": 2.4225498957402443, + "grad_norm": 6.141461372375488, + "learning_rate": 4.993209248831711e-05, + "loss": 0.4754, + "num_input_tokens_seen": 9436960, + "step": 16265 + }, + { + "epoch": 2.4232946082812035, + "grad_norm": 6.4933319091796875, + "learning_rate": 4.9931852937970124e-05, + "loss": 0.6706, + "num_input_tokens_seen": 9439712, + "step": 16270 + }, + { + "epoch": 2.4240393208221627, + "grad_norm": 8.142471313476562, + "learning_rate": 4.9931612966424824e-05, + "loss": 0.5526, + "num_input_tokens_seen": 9442656, + "step": 16275 + }, + { + "epoch": 2.424784033363122, + "grad_norm": 6.365461349487305, + "learning_rate": 4.993137257368526e-05, + "loss": 0.4618, + "num_input_tokens_seen": 9445632, + "step": 16280 + }, + { + "epoch": 2.425528745904081, + "grad_norm": 11.93380069732666, + "learning_rate": 4.99311317597555e-05, + "loss": 0.7284, + "num_input_tokens_seen": 9448672, + "step": 16285 + }, + { + "epoch": 2.4262734584450403, + "grad_norm": 7.533616542816162, + "learning_rate": 4.993089052463961e-05, + "loss": 0.5325, + "num_input_tokens_seen": 9451520, + "step": 16290 + }, + { + "epoch": 2.4270181709859995, + "grad_norm": 8.200165748596191, + "learning_rate": 4.993064886834166e-05, + "loss": 0.655, + "num_input_tokens_seen": 9454240, + "step": 16295 + }, + { + "epoch": 2.4277628835269587, + "grad_norm": 5.492110252380371, + "learning_rate": 4.993040679086573e-05, + "loss": 0.5466, + "num_input_tokens_seen": 9457216, + "step": 16300 + }, + { + "epoch": 2.428507596067918, + "grad_norm": 10.641921997070312, + "learning_rate": 4.993016429221592e-05, + "loss": 0.4636, + "num_input_tokens_seen": 9460096, + "step": 16305 + }, + { + "epoch": 2.429252308608877, + "grad_norm": 10.272025108337402, + "learning_rate": 4.992992137239632e-05, + "loss": 0.4378, + "num_input_tokens_seen": 9462976, + "step": 16310 + }, + { + "epoch": 2.4299970211498363, + "grad_norm": 9.303936958312988, + "learning_rate": 4.992967803141104e-05, + "loss": 0.5983, + "num_input_tokens_seen": 9466016, + "step": 16315 + }, + { + "epoch": 2.4307417336907955, + "grad_norm": 7.134891033172607, + "learning_rate": 4.992943426926419e-05, + "loss": 0.5991, + "num_input_tokens_seen": 9469248, + "step": 16320 + }, + { + "epoch": 2.4314864462317547, + "grad_norm": 4.1972336769104, + "learning_rate": 4.9929190085959874e-05, + "loss": 0.4264, + "num_input_tokens_seen": 9472224, + "step": 16325 + }, + { + "epoch": 2.432231158772714, + "grad_norm": 5.7477850914001465, + "learning_rate": 4.9928945481502234e-05, + "loss": 0.4863, + "num_input_tokens_seen": 9474816, + "step": 16330 + }, + { + "epoch": 2.432975871313673, + "grad_norm": 12.881675720214844, + "learning_rate": 4.9928700455895394e-05, + "loss": 0.5649, + "num_input_tokens_seen": 9477504, + "step": 16335 + }, + { + "epoch": 2.4337205838546323, + "grad_norm": 9.548267364501953, + "learning_rate": 4.99284550091435e-05, + "loss": 0.5771, + "num_input_tokens_seen": 9480288, + "step": 16340 + }, + { + "epoch": 2.434465296395591, + "grad_norm": 12.876684188842773, + "learning_rate": 4.992820914125069e-05, + "loss": 0.6126, + "num_input_tokens_seen": 9483392, + "step": 16345 + }, + { + "epoch": 2.4352100089365507, + "grad_norm": 4.261139869689941, + "learning_rate": 4.9927962852221136e-05, + "loss": 0.3674, + "num_input_tokens_seen": 9486208, + "step": 16350 + }, + { + "epoch": 2.4359547214775095, + "grad_norm": 7.16661262512207, + "learning_rate": 4.9927716142058976e-05, + "loss": 0.5314, + "num_input_tokens_seen": 9488960, + "step": 16355 + }, + { + "epoch": 2.436699434018469, + "grad_norm": 6.26410436630249, + "learning_rate": 4.992746901076838e-05, + "loss": 0.8227, + "num_input_tokens_seen": 9491936, + "step": 16360 + }, + { + "epoch": 2.437444146559428, + "grad_norm": 6.255679130554199, + "learning_rate": 4.992722145835354e-05, + "loss": 0.5396, + "num_input_tokens_seen": 9494944, + "step": 16365 + }, + { + "epoch": 2.438188859100387, + "grad_norm": 3.8211863040924072, + "learning_rate": 4.992697348481863e-05, + "loss": 0.6084, + "num_input_tokens_seen": 9497824, + "step": 16370 + }, + { + "epoch": 2.4389335716413463, + "grad_norm": 4.461001873016357, + "learning_rate": 4.992672509016782e-05, + "loss": 0.4799, + "num_input_tokens_seen": 9500672, + "step": 16375 + }, + { + "epoch": 2.4396782841823055, + "grad_norm": 6.441083908081055, + "learning_rate": 4.992647627440534e-05, + "loss": 0.6306, + "num_input_tokens_seen": 9503936, + "step": 16380 + }, + { + "epoch": 2.4404229967232647, + "grad_norm": 4.838130950927734, + "learning_rate": 4.992622703753538e-05, + "loss": 0.4178, + "num_input_tokens_seen": 9506720, + "step": 16385 + }, + { + "epoch": 2.441167709264224, + "grad_norm": 7.3229475021362305, + "learning_rate": 4.9925977379562145e-05, + "loss": 0.4363, + "num_input_tokens_seen": 9509568, + "step": 16390 + }, + { + "epoch": 2.441912421805183, + "grad_norm": 15.103792190551758, + "learning_rate": 4.9925727300489853e-05, + "loss": 0.9632, + "num_input_tokens_seen": 9512512, + "step": 16395 + }, + { + "epoch": 2.4426571343461423, + "grad_norm": 8.262227058410645, + "learning_rate": 4.9925476800322735e-05, + "loss": 0.4538, + "num_input_tokens_seen": 9515680, + "step": 16400 + }, + { + "epoch": 2.4434018468871015, + "grad_norm": 8.70838451385498, + "learning_rate": 4.992522587906501e-05, + "loss": 0.6727, + "num_input_tokens_seen": 9518816, + "step": 16405 + }, + { + "epoch": 2.4441465594280607, + "grad_norm": 6.248902797698975, + "learning_rate": 4.992497453672094e-05, + "loss": 0.6403, + "num_input_tokens_seen": 9521632, + "step": 16410 + }, + { + "epoch": 2.44489127196902, + "grad_norm": 8.698843955993652, + "learning_rate": 4.9924722773294745e-05, + "loss": 0.5957, + "num_input_tokens_seen": 9524512, + "step": 16415 + }, + { + "epoch": 2.445635984509979, + "grad_norm": 7.521799564361572, + "learning_rate": 4.99244705887907e-05, + "loss": 0.5136, + "num_input_tokens_seen": 9527200, + "step": 16420 + }, + { + "epoch": 2.4463806970509383, + "grad_norm": 3.657895803451538, + "learning_rate": 4.992421798321305e-05, + "loss": 0.4338, + "num_input_tokens_seen": 9529984, + "step": 16425 + }, + { + "epoch": 2.4471254095918975, + "grad_norm": 7.795616149902344, + "learning_rate": 4.992396495656608e-05, + "loss": 0.6072, + "num_input_tokens_seen": 9532864, + "step": 16430 + }, + { + "epoch": 2.4478701221328567, + "grad_norm": 9.199615478515625, + "learning_rate": 4.992371150885404e-05, + "loss": 0.464, + "num_input_tokens_seen": 9535680, + "step": 16435 + }, + { + "epoch": 2.448614834673816, + "grad_norm": 8.715947151184082, + "learning_rate": 4.9923457640081236e-05, + "loss": 0.3456, + "num_input_tokens_seen": 9538752, + "step": 16440 + }, + { + "epoch": 2.449359547214775, + "grad_norm": 12.051628112792969, + "learning_rate": 4.992320335025194e-05, + "loss": 0.4922, + "num_input_tokens_seen": 9541856, + "step": 16445 + }, + { + "epoch": 2.4501042597557343, + "grad_norm": 27.558298110961914, + "learning_rate": 4.992294863937046e-05, + "loss": 0.5611, + "num_input_tokens_seen": 9544672, + "step": 16450 + }, + { + "epoch": 2.4508489722966935, + "grad_norm": 8.312311172485352, + "learning_rate": 4.9922693507441084e-05, + "loss": 0.5987, + "num_input_tokens_seen": 9547328, + "step": 16455 + }, + { + "epoch": 2.4515936848376527, + "grad_norm": 8.324525833129883, + "learning_rate": 4.9922437954468136e-05, + "loss": 0.5403, + "num_input_tokens_seen": 9550528, + "step": 16460 + }, + { + "epoch": 2.452338397378612, + "grad_norm": 0.8053598403930664, + "learning_rate": 4.992218198045593e-05, + "loss": 0.2944, + "num_input_tokens_seen": 9553440, + "step": 16465 + }, + { + "epoch": 2.453083109919571, + "grad_norm": 5.028735637664795, + "learning_rate": 4.992192558540879e-05, + "loss": 0.5951, + "num_input_tokens_seen": 9556288, + "step": 16470 + }, + { + "epoch": 2.4538278224605303, + "grad_norm": 12.704538345336914, + "learning_rate": 4.992166876933105e-05, + "loss": 0.599, + "num_input_tokens_seen": 9559232, + "step": 16475 + }, + { + "epoch": 2.4545725350014895, + "grad_norm": 4.889638900756836, + "learning_rate": 4.9921411532227036e-05, + "loss": 0.4644, + "num_input_tokens_seen": 9562464, + "step": 16480 + }, + { + "epoch": 2.4553172475424487, + "grad_norm": 11.321023941040039, + "learning_rate": 4.992115387410111e-05, + "loss": 0.5049, + "num_input_tokens_seen": 9565472, + "step": 16485 + }, + { + "epoch": 2.456061960083408, + "grad_norm": 7.483890056610107, + "learning_rate": 4.992089579495762e-05, + "loss": 0.5293, + "num_input_tokens_seen": 9568448, + "step": 16490 + }, + { + "epoch": 2.456806672624367, + "grad_norm": 18.241926193237305, + "learning_rate": 4.992063729480092e-05, + "loss": 0.5933, + "num_input_tokens_seen": 9571360, + "step": 16495 + }, + { + "epoch": 2.4575513851653263, + "grad_norm": 10.087448120117188, + "learning_rate": 4.992037837363538e-05, + "loss": 0.4498, + "num_input_tokens_seen": 9574176, + "step": 16500 + }, + { + "epoch": 2.4582960977062855, + "grad_norm": 4.653641700744629, + "learning_rate": 4.992011903146537e-05, + "loss": 0.4211, + "num_input_tokens_seen": 9577056, + "step": 16505 + }, + { + "epoch": 2.4590408102472447, + "grad_norm": 9.452094078063965, + "learning_rate": 4.991985926829529e-05, + "loss": 0.7396, + "num_input_tokens_seen": 9579968, + "step": 16510 + }, + { + "epoch": 2.459785522788204, + "grad_norm": 5.652906894683838, + "learning_rate": 4.991959908412951e-05, + "loss": 0.4875, + "num_input_tokens_seen": 9582720, + "step": 16515 + }, + { + "epoch": 2.4605302353291627, + "grad_norm": 8.037702560424805, + "learning_rate": 4.9919338478972424e-05, + "loss": 0.4985, + "num_input_tokens_seen": 9585600, + "step": 16520 + }, + { + "epoch": 2.4612749478701224, + "grad_norm": 5.610611915588379, + "learning_rate": 4.991907745282845e-05, + "loss": 0.3273, + "num_input_tokens_seen": 9588416, + "step": 16525 + }, + { + "epoch": 2.462019660411081, + "grad_norm": 2.1179656982421875, + "learning_rate": 4.9918816005701986e-05, + "loss": 0.426, + "num_input_tokens_seen": 9591424, + "step": 16530 + }, + { + "epoch": 2.4627643729520408, + "grad_norm": 4.579212188720703, + "learning_rate": 4.9918554137597454e-05, + "loss": 0.6393, + "num_input_tokens_seen": 9594560, + "step": 16535 + }, + { + "epoch": 2.4635090854929995, + "grad_norm": 7.938742160797119, + "learning_rate": 4.9918291848519275e-05, + "loss": 0.6077, + "num_input_tokens_seen": 9597504, + "step": 16540 + }, + { + "epoch": 2.4642537980339587, + "grad_norm": 8.110983848571777, + "learning_rate": 4.991802913847188e-05, + "loss": 0.535, + "num_input_tokens_seen": 9600480, + "step": 16545 + }, + { + "epoch": 2.464998510574918, + "grad_norm": 15.408477783203125, + "learning_rate": 4.9917766007459696e-05, + "loss": 0.6079, + "num_input_tokens_seen": 9603232, + "step": 16550 + }, + { + "epoch": 2.465743223115877, + "grad_norm": 5.376555442810059, + "learning_rate": 4.99175024554872e-05, + "loss": 0.5731, + "num_input_tokens_seen": 9606464, + "step": 16555 + }, + { + "epoch": 2.4664879356568363, + "grad_norm": 3.220148801803589, + "learning_rate": 4.991723848255881e-05, + "loss": 0.4299, + "num_input_tokens_seen": 9609504, + "step": 16560 + }, + { + "epoch": 2.4672326481977955, + "grad_norm": 9.309980392456055, + "learning_rate": 4.9916974088679015e-05, + "loss": 0.4575, + "num_input_tokens_seen": 9612256, + "step": 16565 + }, + { + "epoch": 2.4679773607387547, + "grad_norm": 7.3904008865356445, + "learning_rate": 4.991670927385226e-05, + "loss": 0.4559, + "num_input_tokens_seen": 9615264, + "step": 16570 + }, + { + "epoch": 2.468722073279714, + "grad_norm": 6.21350622177124, + "learning_rate": 4.9916444038083024e-05, + "loss": 0.6687, + "num_input_tokens_seen": 9618112, + "step": 16575 + }, + { + "epoch": 2.469466785820673, + "grad_norm": 12.346955299377441, + "learning_rate": 4.991617838137579e-05, + "loss": 0.4805, + "num_input_tokens_seen": 9621152, + "step": 16580 + }, + { + "epoch": 2.4702114983616323, + "grad_norm": 8.350966453552246, + "learning_rate": 4.991591230373505e-05, + "loss": 0.4927, + "num_input_tokens_seen": 9623968, + "step": 16585 + }, + { + "epoch": 2.4709562109025915, + "grad_norm": 10.750513076782227, + "learning_rate": 4.99156458051653e-05, + "loss": 0.4835, + "num_input_tokens_seen": 9626720, + "step": 16590 + }, + { + "epoch": 2.4717009234435507, + "grad_norm": 14.829179763793945, + "learning_rate": 4.9915378885671026e-05, + "loss": 0.536, + "num_input_tokens_seen": 9629664, + "step": 16595 + }, + { + "epoch": 2.47244563598451, + "grad_norm": 14.756640434265137, + "learning_rate": 4.9915111545256754e-05, + "loss": 0.679, + "num_input_tokens_seen": 9632320, + "step": 16600 + }, + { + "epoch": 2.473190348525469, + "grad_norm": 14.345931053161621, + "learning_rate": 4.991484378392699e-05, + "loss": 0.7191, + "num_input_tokens_seen": 9635168, + "step": 16605 + }, + { + "epoch": 2.4739350610664284, + "grad_norm": 6.86167049407959, + "learning_rate": 4.9914575601686266e-05, + "loss": 0.3653, + "num_input_tokens_seen": 9638432, + "step": 16610 + }, + { + "epoch": 2.4746797736073876, + "grad_norm": 8.374226570129395, + "learning_rate": 4.9914306998539115e-05, + "loss": 0.508, + "num_input_tokens_seen": 9641632, + "step": 16615 + }, + { + "epoch": 2.4754244861483468, + "grad_norm": 5.8691229820251465, + "learning_rate": 4.991403797449006e-05, + "loss": 0.572, + "num_input_tokens_seen": 9644672, + "step": 16620 + }, + { + "epoch": 2.476169198689306, + "grad_norm": 12.757109642028809, + "learning_rate": 4.9913768529543656e-05, + "loss": 0.6757, + "num_input_tokens_seen": 9647360, + "step": 16625 + }, + { + "epoch": 2.476913911230265, + "grad_norm": 11.840375900268555, + "learning_rate": 4.991349866370446e-05, + "loss": 0.5802, + "num_input_tokens_seen": 9650464, + "step": 16630 + }, + { + "epoch": 2.4776586237712244, + "grad_norm": 6.205803394317627, + "learning_rate": 4.9913228376977026e-05, + "loss": 0.5862, + "num_input_tokens_seen": 9653056, + "step": 16635 + }, + { + "epoch": 2.4784033363121836, + "grad_norm": 5.225337982177734, + "learning_rate": 4.991295766936591e-05, + "loss": 0.477, + "num_input_tokens_seen": 9655808, + "step": 16640 + }, + { + "epoch": 2.4791480488531428, + "grad_norm": 15.225621223449707, + "learning_rate": 4.9912686540875696e-05, + "loss": 0.4992, + "num_input_tokens_seen": 9658592, + "step": 16645 + }, + { + "epoch": 2.479892761394102, + "grad_norm": 8.532256126403809, + "learning_rate": 4.991241499151097e-05, + "loss": 0.5552, + "num_input_tokens_seen": 9661600, + "step": 16650 + }, + { + "epoch": 2.480637473935061, + "grad_norm": 10.607525825500488, + "learning_rate": 4.9912143021276306e-05, + "loss": 0.4853, + "num_input_tokens_seen": 9664576, + "step": 16655 + }, + { + "epoch": 2.4813821864760204, + "grad_norm": 11.878466606140137, + "learning_rate": 4.991187063017631e-05, + "loss": 0.5818, + "num_input_tokens_seen": 9667776, + "step": 16660 + }, + { + "epoch": 2.4821268990169796, + "grad_norm": 9.163976669311523, + "learning_rate": 4.9911597818215575e-05, + "loss": 0.4652, + "num_input_tokens_seen": 9670720, + "step": 16665 + }, + { + "epoch": 2.482871611557939, + "grad_norm": 5.401299476623535, + "learning_rate": 4.9911324585398724e-05, + "loss": 0.5056, + "num_input_tokens_seen": 9673856, + "step": 16670 + }, + { + "epoch": 2.483616324098898, + "grad_norm": 8.407876014709473, + "learning_rate": 4.9911050931730354e-05, + "loss": 0.5714, + "num_input_tokens_seen": 9676576, + "step": 16675 + }, + { + "epoch": 2.484361036639857, + "grad_norm": 5.717964172363281, + "learning_rate": 4.9910776857215094e-05, + "loss": 0.5592, + "num_input_tokens_seen": 9679872, + "step": 16680 + }, + { + "epoch": 2.4851057491808164, + "grad_norm": 8.144309043884277, + "learning_rate": 4.991050236185758e-05, + "loss": 0.3233, + "num_input_tokens_seen": 9682624, + "step": 16685 + }, + { + "epoch": 2.4858504617217756, + "grad_norm": 6.064314842224121, + "learning_rate": 4.991022744566245e-05, + "loss": 0.5498, + "num_input_tokens_seen": 9685504, + "step": 16690 + }, + { + "epoch": 2.4865951742627344, + "grad_norm": 3.5232086181640625, + "learning_rate": 4.990995210863434e-05, + "loss": 0.5318, + "num_input_tokens_seen": 9688416, + "step": 16695 + }, + { + "epoch": 2.487339886803694, + "grad_norm": 6.1675801277160645, + "learning_rate": 4.9909676350777914e-05, + "loss": 0.5216, + "num_input_tokens_seen": 9691360, + "step": 16700 + }, + { + "epoch": 2.4880845993446528, + "grad_norm": 13.205432891845703, + "learning_rate": 4.990940017209782e-05, + "loss": 0.8931, + "num_input_tokens_seen": 9694400, + "step": 16705 + }, + { + "epoch": 2.488829311885612, + "grad_norm": 4.370105743408203, + "learning_rate": 4.990912357259872e-05, + "loss": 0.4278, + "num_input_tokens_seen": 9697504, + "step": 16710 + }, + { + "epoch": 2.489574024426571, + "grad_norm": 6.18510627746582, + "learning_rate": 4.99088465522853e-05, + "loss": 0.5769, + "num_input_tokens_seen": 9700480, + "step": 16715 + }, + { + "epoch": 2.4903187369675304, + "grad_norm": 5.0007429122924805, + "learning_rate": 4.9908569111162226e-05, + "loss": 0.422, + "num_input_tokens_seen": 9703296, + "step": 16720 + }, + { + "epoch": 2.4910634495084896, + "grad_norm": 9.106595039367676, + "learning_rate": 4.9908291249234207e-05, + "loss": 0.5043, + "num_input_tokens_seen": 9706464, + "step": 16725 + }, + { + "epoch": 2.4918081620494488, + "grad_norm": 5.554112434387207, + "learning_rate": 4.990801296650591e-05, + "loss": 0.3511, + "num_input_tokens_seen": 9709120, + "step": 16730 + }, + { + "epoch": 2.492552874590408, + "grad_norm": 9.232634544372559, + "learning_rate": 4.9907734262982056e-05, + "loss": 0.4359, + "num_input_tokens_seen": 9712192, + "step": 16735 + }, + { + "epoch": 2.493297587131367, + "grad_norm": 14.427287101745605, + "learning_rate": 4.990745513866735e-05, + "loss": 0.559, + "num_input_tokens_seen": 9714848, + "step": 16740 + }, + { + "epoch": 2.4940422996723264, + "grad_norm": 14.783685684204102, + "learning_rate": 4.99071755935665e-05, + "loss": 0.4587, + "num_input_tokens_seen": 9717664, + "step": 16745 + }, + { + "epoch": 2.4947870122132856, + "grad_norm": 4.178961277008057, + "learning_rate": 4.990689562768423e-05, + "loss": 0.583, + "num_input_tokens_seen": 9720960, + "step": 16750 + }, + { + "epoch": 2.495531724754245, + "grad_norm": 10.602678298950195, + "learning_rate": 4.990661524102528e-05, + "loss": 0.6661, + "num_input_tokens_seen": 9723744, + "step": 16755 + }, + { + "epoch": 2.496276437295204, + "grad_norm": 7.836419582366943, + "learning_rate": 4.990633443359437e-05, + "loss": 0.5599, + "num_input_tokens_seen": 9726624, + "step": 16760 + }, + { + "epoch": 2.497021149836163, + "grad_norm": 2.450775623321533, + "learning_rate": 4.990605320539626e-05, + "loss": 0.418, + "num_input_tokens_seen": 9729504, + "step": 16765 + }, + { + "epoch": 2.4977658623771224, + "grad_norm": 10.459741592407227, + "learning_rate": 4.990577155643569e-05, + "loss": 0.4742, + "num_input_tokens_seen": 9732384, + "step": 16770 + }, + { + "epoch": 2.4985105749180816, + "grad_norm": 9.323142051696777, + "learning_rate": 4.9905489486717425e-05, + "loss": 0.4063, + "num_input_tokens_seen": 9735296, + "step": 16775 + }, + { + "epoch": 2.499255287459041, + "grad_norm": 14.87271785736084, + "learning_rate": 4.990520699624623e-05, + "loss": 0.6347, + "num_input_tokens_seen": 9738144, + "step": 16780 + }, + { + "epoch": 2.5, + "grad_norm": 5.727286338806152, + "learning_rate": 4.990492408502687e-05, + "loss": 0.4772, + "num_input_tokens_seen": 9740864, + "step": 16785 + }, + { + "epoch": 2.500744712540959, + "grad_norm": 6.598982810974121, + "learning_rate": 4.9904640753064135e-05, + "loss": 0.5283, + "num_input_tokens_seen": 9743584, + "step": 16790 + }, + { + "epoch": 2.5014894250819184, + "grad_norm": 8.883103370666504, + "learning_rate": 4.99043570003628e-05, + "loss": 0.6965, + "num_input_tokens_seen": 9746304, + "step": 16795 + }, + { + "epoch": 2.5022341376228776, + "grad_norm": 7.478137493133545, + "learning_rate": 4.990407282692767e-05, + "loss": 0.5526, + "num_input_tokens_seen": 9749248, + "step": 16800 + }, + { + "epoch": 2.502978850163837, + "grad_norm": 8.463947296142578, + "learning_rate": 4.9903788232763545e-05, + "loss": 0.4094, + "num_input_tokens_seen": 9752096, + "step": 16805 + }, + { + "epoch": 2.503723562704796, + "grad_norm": 5.350451469421387, + "learning_rate": 4.9903503217875227e-05, + "loss": 0.6539, + "num_input_tokens_seen": 9755456, + "step": 16810 + }, + { + "epoch": 2.504468275245755, + "grad_norm": 7.164551734924316, + "learning_rate": 4.990321778226753e-05, + "loss": 0.5748, + "num_input_tokens_seen": 9758944, + "step": 16815 + }, + { + "epoch": 2.5052129877867144, + "grad_norm": 4.100590229034424, + "learning_rate": 4.990293192594527e-05, + "loss": 0.6091, + "num_input_tokens_seen": 9761664, + "step": 16820 + }, + { + "epoch": 2.5059577003276736, + "grad_norm": 4.391511917114258, + "learning_rate": 4.9902645648913305e-05, + "loss": 0.4714, + "num_input_tokens_seen": 9764384, + "step": 16825 + }, + { + "epoch": 2.506702412868633, + "grad_norm": 12.902721405029297, + "learning_rate": 4.990235895117644e-05, + "loss": 0.5751, + "num_input_tokens_seen": 9767488, + "step": 16830 + }, + { + "epoch": 2.507447125409592, + "grad_norm": 6.867465972900391, + "learning_rate": 4.990207183273953e-05, + "loss": 0.5027, + "num_input_tokens_seen": 9770432, + "step": 16835 + }, + { + "epoch": 2.5081918379505512, + "grad_norm": 6.350703239440918, + "learning_rate": 4.9901784293607424e-05, + "loss": 0.4544, + "num_input_tokens_seen": 9773760, + "step": 16840 + }, + { + "epoch": 2.5089365504915104, + "grad_norm": 4.306856632232666, + "learning_rate": 4.990149633378499e-05, + "loss": 0.6318, + "num_input_tokens_seen": 9776416, + "step": 16845 + }, + { + "epoch": 2.509681263032469, + "grad_norm": 5.342305660247803, + "learning_rate": 4.990120795327707e-05, + "loss": 0.7112, + "num_input_tokens_seen": 9779328, + "step": 16850 + }, + { + "epoch": 2.510425975573429, + "grad_norm": 2.689020872116089, + "learning_rate": 4.990091915208857e-05, + "loss": 0.4042, + "num_input_tokens_seen": 9782080, + "step": 16855 + }, + { + "epoch": 2.5111706881143876, + "grad_norm": 4.386643409729004, + "learning_rate": 4.990062993022433e-05, + "loss": 0.536, + "num_input_tokens_seen": 9784992, + "step": 16860 + }, + { + "epoch": 2.5119154006553472, + "grad_norm": 4.476302623748779, + "learning_rate": 4.990034028768927e-05, + "loss": 0.6431, + "num_input_tokens_seen": 9788160, + "step": 16865 + }, + { + "epoch": 2.512660113196306, + "grad_norm": 13.062644958496094, + "learning_rate": 4.990005022448826e-05, + "loss": 0.5899, + "num_input_tokens_seen": 9790944, + "step": 16870 + }, + { + "epoch": 2.5134048257372656, + "grad_norm": 3.4064974784851074, + "learning_rate": 4.989975974062621e-05, + "loss": 0.4731, + "num_input_tokens_seen": 9793952, + "step": 16875 + }, + { + "epoch": 2.5141495382782244, + "grad_norm": 5.293520450592041, + "learning_rate": 4.989946883610803e-05, + "loss": 0.5757, + "num_input_tokens_seen": 9796928, + "step": 16880 + }, + { + "epoch": 2.514894250819184, + "grad_norm": 5.892383575439453, + "learning_rate": 4.9899177510938624e-05, + "loss": 0.4382, + "num_input_tokens_seen": 9799872, + "step": 16885 + }, + { + "epoch": 2.515638963360143, + "grad_norm": 9.235124588012695, + "learning_rate": 4.9898885765122927e-05, + "loss": 0.4148, + "num_input_tokens_seen": 9802976, + "step": 16890 + }, + { + "epoch": 2.516383675901102, + "grad_norm": 18.50078773498535, + "learning_rate": 4.9898593598665856e-05, + "loss": 0.6696, + "num_input_tokens_seen": 9805952, + "step": 16895 + }, + { + "epoch": 2.517128388442061, + "grad_norm": 6.800465106964111, + "learning_rate": 4.989830101157235e-05, + "loss": 0.4635, + "num_input_tokens_seen": 9808992, + "step": 16900 + }, + { + "epoch": 2.5178731009830204, + "grad_norm": 7.6479291915893555, + "learning_rate": 4.9898008003847363e-05, + "loss": 0.5291, + "num_input_tokens_seen": 9811904, + "step": 16905 + }, + { + "epoch": 2.5186178135239796, + "grad_norm": 17.740718841552734, + "learning_rate": 4.989771457549582e-05, + "loss": 0.5414, + "num_input_tokens_seen": 9814688, + "step": 16910 + }, + { + "epoch": 2.519362526064939, + "grad_norm": 24.272756576538086, + "learning_rate": 4.989742072652271e-05, + "loss": 0.5598, + "num_input_tokens_seen": 9817824, + "step": 16915 + }, + { + "epoch": 2.520107238605898, + "grad_norm": 6.985859394073486, + "learning_rate": 4.989712645693297e-05, + "loss": 0.6421, + "num_input_tokens_seen": 9821152, + "step": 16920 + }, + { + "epoch": 2.5208519511468572, + "grad_norm": 3.616828680038452, + "learning_rate": 4.98968317667316e-05, + "loss": 0.4296, + "num_input_tokens_seen": 9823904, + "step": 16925 + }, + { + "epoch": 2.5215966636878164, + "grad_norm": 4.637538433074951, + "learning_rate": 4.989653665592355e-05, + "loss": 0.6522, + "num_input_tokens_seen": 9826656, + "step": 16930 + }, + { + "epoch": 2.5223413762287756, + "grad_norm": 6.6416120529174805, + "learning_rate": 4.989624112451381e-05, + "loss": 0.6766, + "num_input_tokens_seen": 9829376, + "step": 16935 + }, + { + "epoch": 2.523086088769735, + "grad_norm": 4.5874128341674805, + "learning_rate": 4.989594517250739e-05, + "loss": 0.5038, + "num_input_tokens_seen": 9832448, + "step": 16940 + }, + { + "epoch": 2.523830801310694, + "grad_norm": 2.644378662109375, + "learning_rate": 4.989564879990928e-05, + "loss": 0.4631, + "num_input_tokens_seen": 9835392, + "step": 16945 + }, + { + "epoch": 2.5245755138516532, + "grad_norm": 5.436452388763428, + "learning_rate": 4.9895352006724485e-05, + "loss": 0.4716, + "num_input_tokens_seen": 9838176, + "step": 16950 + }, + { + "epoch": 2.5253202263926124, + "grad_norm": 7.5675811767578125, + "learning_rate": 4.9895054792958015e-05, + "loss": 0.5405, + "num_input_tokens_seen": 9841024, + "step": 16955 + }, + { + "epoch": 2.5260649389335716, + "grad_norm": 4.9473161697387695, + "learning_rate": 4.98947571586149e-05, + "loss": 0.4886, + "num_input_tokens_seen": 9843744, + "step": 16960 + }, + { + "epoch": 2.526809651474531, + "grad_norm": 6.3959150314331055, + "learning_rate": 4.9894459103700166e-05, + "loss": 0.5908, + "num_input_tokens_seen": 9846496, + "step": 16965 + }, + { + "epoch": 2.52755436401549, + "grad_norm": 10.425103187561035, + "learning_rate": 4.989416062821884e-05, + "loss": 0.4573, + "num_input_tokens_seen": 9849120, + "step": 16970 + }, + { + "epoch": 2.5282990765564493, + "grad_norm": 18.13463592529297, + "learning_rate": 4.989386173217598e-05, + "loss": 0.4785, + "num_input_tokens_seen": 9852320, + "step": 16975 + }, + { + "epoch": 2.5290437890974085, + "grad_norm": 33.974910736083984, + "learning_rate": 4.989356241557662e-05, + "loss": 0.7002, + "num_input_tokens_seen": 9855136, + "step": 16980 + }, + { + "epoch": 2.5297885016383677, + "grad_norm": 3.618272066116333, + "learning_rate": 4.989326267842583e-05, + "loss": 0.6106, + "num_input_tokens_seen": 9857856, + "step": 16985 + }, + { + "epoch": 2.530533214179327, + "grad_norm": 8.048412322998047, + "learning_rate": 4.9892962520728664e-05, + "loss": 0.3621, + "num_input_tokens_seen": 9860768, + "step": 16990 + }, + { + "epoch": 2.531277926720286, + "grad_norm": 5.279350757598877, + "learning_rate": 4.989266194249019e-05, + "loss": 0.3562, + "num_input_tokens_seen": 9863584, + "step": 16995 + }, + { + "epoch": 2.5320226392612453, + "grad_norm": 8.56458854675293, + "learning_rate": 4.989236094371551e-05, + "loss": 0.5896, + "num_input_tokens_seen": 9866752, + "step": 17000 + }, + { + "epoch": 2.5327673518022045, + "grad_norm": 3.6308703422546387, + "learning_rate": 4.9892059524409676e-05, + "loss": 0.4923, + "num_input_tokens_seen": 9869760, + "step": 17005 + }, + { + "epoch": 2.5335120643431637, + "grad_norm": 2.5684709548950195, + "learning_rate": 4.98917576845778e-05, + "loss": 0.426, + "num_input_tokens_seen": 9872800, + "step": 17010 + }, + { + "epoch": 2.534256776884123, + "grad_norm": 2.5650365352630615, + "learning_rate": 4.989145542422498e-05, + "loss": 0.4441, + "num_input_tokens_seen": 9875840, + "step": 17015 + }, + { + "epoch": 2.535001489425082, + "grad_norm": 12.187102317810059, + "learning_rate": 4.989115274335632e-05, + "loss": 0.6666, + "num_input_tokens_seen": 9878816, + "step": 17020 + }, + { + "epoch": 2.535746201966041, + "grad_norm": 3.2648775577545166, + "learning_rate": 4.9890849641976924e-05, + "loss": 0.3025, + "num_input_tokens_seen": 9881504, + "step": 17025 + }, + { + "epoch": 2.5364909145070005, + "grad_norm": 15.330765724182129, + "learning_rate": 4.989054612009192e-05, + "loss": 0.7926, + "num_input_tokens_seen": 9884992, + "step": 17030 + }, + { + "epoch": 2.5372356270479592, + "grad_norm": 12.012451171875, + "learning_rate": 4.989024217770645e-05, + "loss": 0.4826, + "num_input_tokens_seen": 9887616, + "step": 17035 + }, + { + "epoch": 2.537980339588919, + "grad_norm": 17.95915985107422, + "learning_rate": 4.988993781482563e-05, + "loss": 0.455, + "num_input_tokens_seen": 9890240, + "step": 17040 + }, + { + "epoch": 2.5387250521298776, + "grad_norm": 9.789167404174805, + "learning_rate": 4.9889633031454604e-05, + "loss": 0.5325, + "num_input_tokens_seen": 9893184, + "step": 17045 + }, + { + "epoch": 2.5394697646708373, + "grad_norm": 3.346251964569092, + "learning_rate": 4.9889327827598526e-05, + "loss": 0.3516, + "num_input_tokens_seen": 9896064, + "step": 17050 + }, + { + "epoch": 2.540214477211796, + "grad_norm": 11.304669380187988, + "learning_rate": 4.988902220326255e-05, + "loss": 0.5286, + "num_input_tokens_seen": 9898752, + "step": 17055 + }, + { + "epoch": 2.5409591897527557, + "grad_norm": 4.27647590637207, + "learning_rate": 4.9888716158451844e-05, + "loss": 0.4698, + "num_input_tokens_seen": 9901632, + "step": 17060 + }, + { + "epoch": 2.5417039022937145, + "grad_norm": 4.137424468994141, + "learning_rate": 4.988840969317157e-05, + "loss": 0.4168, + "num_input_tokens_seen": 9904608, + "step": 17065 + }, + { + "epoch": 2.5424486148346737, + "grad_norm": 5.980827808380127, + "learning_rate": 4.988810280742691e-05, + "loss": 0.7288, + "num_input_tokens_seen": 9907296, + "step": 17070 + }, + { + "epoch": 2.543193327375633, + "grad_norm": 10.985311508178711, + "learning_rate": 4.988779550122305e-05, + "loss": 0.4644, + "num_input_tokens_seen": 9910240, + "step": 17075 + }, + { + "epoch": 2.543938039916592, + "grad_norm": 10.353503227233887, + "learning_rate": 4.9887487774565176e-05, + "loss": 0.7207, + "num_input_tokens_seen": 9913280, + "step": 17080 + }, + { + "epoch": 2.5446827524575513, + "grad_norm": 5.675861358642578, + "learning_rate": 4.98871796274585e-05, + "loss": 0.6523, + "num_input_tokens_seen": 9916160, + "step": 17085 + }, + { + "epoch": 2.5454274649985105, + "grad_norm": 10.60794734954834, + "learning_rate": 4.9886871059908213e-05, + "loss": 0.6656, + "num_input_tokens_seen": 9919072, + "step": 17090 + }, + { + "epoch": 2.5461721775394697, + "grad_norm": 15.613737106323242, + "learning_rate": 4.988656207191953e-05, + "loss": 0.5979, + "num_input_tokens_seen": 9922272, + "step": 17095 + }, + { + "epoch": 2.546916890080429, + "grad_norm": 10.013479232788086, + "learning_rate": 4.988625266349768e-05, + "loss": 0.5143, + "num_input_tokens_seen": 9925024, + "step": 17100 + }, + { + "epoch": 2.547661602621388, + "grad_norm": 7.883745193481445, + "learning_rate": 4.988594283464788e-05, + "loss": 0.6126, + "num_input_tokens_seen": 9928160, + "step": 17105 + }, + { + "epoch": 2.5484063151623473, + "grad_norm": 4.482454776763916, + "learning_rate": 4.988563258537537e-05, + "loss": 0.6198, + "num_input_tokens_seen": 9930688, + "step": 17110 + }, + { + "epoch": 2.5491510277033065, + "grad_norm": 4.749344825744629, + "learning_rate": 4.988532191568539e-05, + "loss": 0.6385, + "num_input_tokens_seen": 9933568, + "step": 17115 + }, + { + "epoch": 2.5498957402442657, + "grad_norm": 6.350439548492432, + "learning_rate": 4.988501082558319e-05, + "loss": 0.4646, + "num_input_tokens_seen": 9936128, + "step": 17120 + }, + { + "epoch": 2.550640452785225, + "grad_norm": 10.483531951904297, + "learning_rate": 4.988469931507402e-05, + "loss": 0.4807, + "num_input_tokens_seen": 9939296, + "step": 17125 + }, + { + "epoch": 2.551385165326184, + "grad_norm": 3.2898683547973633, + "learning_rate": 4.988438738416316e-05, + "loss": 0.6331, + "num_input_tokens_seen": 9942400, + "step": 17130 + }, + { + "epoch": 2.5521298778671433, + "grad_norm": 6.305202960968018, + "learning_rate": 4.988407503285585e-05, + "loss": 0.4634, + "num_input_tokens_seen": 9945056, + "step": 17135 + }, + { + "epoch": 2.5528745904081025, + "grad_norm": 9.11904525756836, + "learning_rate": 4.988376226115739e-05, + "loss": 0.3939, + "num_input_tokens_seen": 9947744, + "step": 17140 + }, + { + "epoch": 2.5536193029490617, + "grad_norm": 6.848167419433594, + "learning_rate": 4.9883449069073055e-05, + "loss": 0.3452, + "num_input_tokens_seen": 9950464, + "step": 17145 + }, + { + "epoch": 2.554364015490021, + "grad_norm": 10.237719535827637, + "learning_rate": 4.9883135456608146e-05, + "loss": 0.6564, + "num_input_tokens_seen": 9953632, + "step": 17150 + }, + { + "epoch": 2.55510872803098, + "grad_norm": 10.301454544067383, + "learning_rate": 4.988282142376795e-05, + "loss": 0.5036, + "num_input_tokens_seen": 9956736, + "step": 17155 + }, + { + "epoch": 2.5558534405719393, + "grad_norm": 12.663594245910645, + "learning_rate": 4.9882506970557766e-05, + "loss": 0.5585, + "num_input_tokens_seen": 9959424, + "step": 17160 + }, + { + "epoch": 2.5565981531128985, + "grad_norm": 12.941266059875488, + "learning_rate": 4.988219209698293e-05, + "loss": 0.5184, + "num_input_tokens_seen": 9962112, + "step": 17165 + }, + { + "epoch": 2.5573428656538577, + "grad_norm": 10.306009292602539, + "learning_rate": 4.988187680304874e-05, + "loss": 0.738, + "num_input_tokens_seen": 9964768, + "step": 17170 + }, + { + "epoch": 2.558087578194817, + "grad_norm": 5.547285556793213, + "learning_rate": 4.988156108876053e-05, + "loss": 0.5791, + "num_input_tokens_seen": 9967680, + "step": 17175 + }, + { + "epoch": 2.558832290735776, + "grad_norm": 5.428895473480225, + "learning_rate": 4.988124495412364e-05, + "loss": 0.5298, + "num_input_tokens_seen": 9970496, + "step": 17180 + }, + { + "epoch": 2.5595770032767353, + "grad_norm": 6.565059661865234, + "learning_rate": 4.98809283991434e-05, + "loss": 0.5559, + "num_input_tokens_seen": 9973152, + "step": 17185 + }, + { + "epoch": 2.5603217158176945, + "grad_norm": 17.510826110839844, + "learning_rate": 4.988061142382516e-05, + "loss": 0.6422, + "num_input_tokens_seen": 9976000, + "step": 17190 + }, + { + "epoch": 2.5610664283586537, + "grad_norm": 8.540008544921875, + "learning_rate": 4.988029402817428e-05, + "loss": 0.4708, + "num_input_tokens_seen": 9978656, + "step": 17195 + }, + { + "epoch": 2.5618111408996125, + "grad_norm": 2.513296127319336, + "learning_rate": 4.9879976212196124e-05, + "loss": 0.5774, + "num_input_tokens_seen": 9981696, + "step": 17200 + }, + { + "epoch": 2.562555853440572, + "grad_norm": 3.850874423980713, + "learning_rate": 4.987965797589605e-05, + "loss": 0.3599, + "num_input_tokens_seen": 9984608, + "step": 17205 + }, + { + "epoch": 2.563300565981531, + "grad_norm": 5.263801097869873, + "learning_rate": 4.987933931927944e-05, + "loss": 0.6985, + "num_input_tokens_seen": 9987616, + "step": 17210 + }, + { + "epoch": 2.5640452785224905, + "grad_norm": 13.965652465820312, + "learning_rate": 4.987902024235169e-05, + "loss": 0.7203, + "num_input_tokens_seen": 9990784, + "step": 17215 + }, + { + "epoch": 2.5647899910634493, + "grad_norm": 13.382460594177246, + "learning_rate": 4.987870074511817e-05, + "loss": 0.7214, + "num_input_tokens_seen": 9993696, + "step": 17220 + }, + { + "epoch": 2.565534703604409, + "grad_norm": 4.490419387817383, + "learning_rate": 4.987838082758428e-05, + "loss": 0.4543, + "num_input_tokens_seen": 9996416, + "step": 17225 + }, + { + "epoch": 2.5662794161453677, + "grad_norm": 6.7348175048828125, + "learning_rate": 4.987806048975544e-05, + "loss": 0.4291, + "num_input_tokens_seen": 9999424, + "step": 17230 + }, + { + "epoch": 2.5670241286863273, + "grad_norm": 7.144815921783447, + "learning_rate": 4.987773973163706e-05, + "loss": 0.5048, + "num_input_tokens_seen": 10002720, + "step": 17235 + }, + { + "epoch": 2.567768841227286, + "grad_norm": 5.628639221191406, + "learning_rate": 4.987741855323454e-05, + "loss": 0.5329, + "num_input_tokens_seen": 10005632, + "step": 17240 + }, + { + "epoch": 2.5685135537682453, + "grad_norm": 12.266886711120605, + "learning_rate": 4.9877096954553324e-05, + "loss": 0.6099, + "num_input_tokens_seen": 10008576, + "step": 17245 + }, + { + "epoch": 2.5692582663092045, + "grad_norm": 14.598294258117676, + "learning_rate": 4.987677493559883e-05, + "loss": 0.7025, + "num_input_tokens_seen": 10011456, + "step": 17250 + }, + { + "epoch": 2.5700029788501637, + "grad_norm": 10.408489227294922, + "learning_rate": 4.987645249637652e-05, + "loss": 0.5705, + "num_input_tokens_seen": 10014496, + "step": 17255 + }, + { + "epoch": 2.570747691391123, + "grad_norm": 6.493668556213379, + "learning_rate": 4.987612963689182e-05, + "loss": 0.6099, + "num_input_tokens_seen": 10017664, + "step": 17260 + }, + { + "epoch": 2.571492403932082, + "grad_norm": 7.287785053253174, + "learning_rate": 4.987580635715019e-05, + "loss": 0.5228, + "num_input_tokens_seen": 10020384, + "step": 17265 + }, + { + "epoch": 2.5722371164730413, + "grad_norm": 4.531379222869873, + "learning_rate": 4.98754826571571e-05, + "loss": 0.5515, + "num_input_tokens_seen": 10023360, + "step": 17270 + }, + { + "epoch": 2.5729818290140005, + "grad_norm": 6.158554553985596, + "learning_rate": 4.9875158536918015e-05, + "loss": 0.3287, + "num_input_tokens_seen": 10026208, + "step": 17275 + }, + { + "epoch": 2.5737265415549597, + "grad_norm": 5.779934883117676, + "learning_rate": 4.987483399643841e-05, + "loss": 0.4209, + "num_input_tokens_seen": 10029056, + "step": 17280 + }, + { + "epoch": 2.574471254095919, + "grad_norm": 10.87652587890625, + "learning_rate": 4.987450903572376e-05, + "loss": 0.5277, + "num_input_tokens_seen": 10031936, + "step": 17285 + }, + { + "epoch": 2.575215966636878, + "grad_norm": 5.638761043548584, + "learning_rate": 4.987418365477956e-05, + "loss": 0.3333, + "num_input_tokens_seen": 10034816, + "step": 17290 + }, + { + "epoch": 2.5759606791778373, + "grad_norm": 7.304035663604736, + "learning_rate": 4.987385785361131e-05, + "loss": 0.7029, + "num_input_tokens_seen": 10037856, + "step": 17295 + }, + { + "epoch": 2.5767053917187965, + "grad_norm": 12.970317840576172, + "learning_rate": 4.987353163222451e-05, + "loss": 0.572, + "num_input_tokens_seen": 10040576, + "step": 17300 + }, + { + "epoch": 2.5774501042597557, + "grad_norm": 9.05759334564209, + "learning_rate": 4.9873204990624676e-05, + "loss": 0.7065, + "num_input_tokens_seen": 10043392, + "step": 17305 + }, + { + "epoch": 2.578194816800715, + "grad_norm": 9.575845718383789, + "learning_rate": 4.987287792881733e-05, + "loss": 0.503, + "num_input_tokens_seen": 10045888, + "step": 17310 + }, + { + "epoch": 2.578939529341674, + "grad_norm": 2.868536949157715, + "learning_rate": 4.9872550446807986e-05, + "loss": 0.574, + "num_input_tokens_seen": 10049056, + "step": 17315 + }, + { + "epoch": 2.5796842418826333, + "grad_norm": 8.018128395080566, + "learning_rate": 4.987222254460218e-05, + "loss": 0.3888, + "num_input_tokens_seen": 10052032, + "step": 17320 + }, + { + "epoch": 2.5804289544235925, + "grad_norm": 10.183355331420898, + "learning_rate": 4.987189422220545e-05, + "loss": 0.5267, + "num_input_tokens_seen": 10054880, + "step": 17325 + }, + { + "epoch": 2.5811736669645517, + "grad_norm": 4.650959491729736, + "learning_rate": 4.987156547962335e-05, + "loss": 0.457, + "num_input_tokens_seen": 10057280, + "step": 17330 + }, + { + "epoch": 2.581918379505511, + "grad_norm": 3.0608062744140625, + "learning_rate": 4.987123631686143e-05, + "loss": 0.3293, + "num_input_tokens_seen": 10060288, + "step": 17335 + }, + { + "epoch": 2.58266309204647, + "grad_norm": 5.341984272003174, + "learning_rate": 4.987090673392525e-05, + "loss": 0.5312, + "num_input_tokens_seen": 10063072, + "step": 17340 + }, + { + "epoch": 2.5834078045874294, + "grad_norm": 3.9058189392089844, + "learning_rate": 4.987057673082038e-05, + "loss": 0.5202, + "num_input_tokens_seen": 10066208, + "step": 17345 + }, + { + "epoch": 2.5841525171283886, + "grad_norm": 11.183914184570312, + "learning_rate": 4.987024630755239e-05, + "loss": 0.6252, + "num_input_tokens_seen": 10069120, + "step": 17350 + }, + { + "epoch": 2.5848972296693478, + "grad_norm": 19.911008834838867, + "learning_rate": 4.986991546412687e-05, + "loss": 0.5052, + "num_input_tokens_seen": 10072000, + "step": 17355 + }, + { + "epoch": 2.585641942210307, + "grad_norm": 8.88884449005127, + "learning_rate": 4.98695842005494e-05, + "loss": 0.5103, + "num_input_tokens_seen": 10074720, + "step": 17360 + }, + { + "epoch": 2.586386654751266, + "grad_norm": 9.476363182067871, + "learning_rate": 4.9869252516825585e-05, + "loss": 0.5194, + "num_input_tokens_seen": 10077568, + "step": 17365 + }, + { + "epoch": 2.5871313672922254, + "grad_norm": 7.660337924957275, + "learning_rate": 4.986892041296102e-05, + "loss": 0.3793, + "num_input_tokens_seen": 10080288, + "step": 17370 + }, + { + "epoch": 2.587876079833184, + "grad_norm": 10.499715805053711, + "learning_rate": 4.9868587888961325e-05, + "loss": 0.3709, + "num_input_tokens_seen": 10083104, + "step": 17375 + }, + { + "epoch": 2.5886207923741438, + "grad_norm": 6.596726417541504, + "learning_rate": 4.986825494483211e-05, + "loss": 0.5158, + "num_input_tokens_seen": 10086080, + "step": 17380 + }, + { + "epoch": 2.5893655049151025, + "grad_norm": 8.694958686828613, + "learning_rate": 4.9867921580579e-05, + "loss": 0.4885, + "num_input_tokens_seen": 10089056, + "step": 17385 + }, + { + "epoch": 2.590110217456062, + "grad_norm": 7.799798965454102, + "learning_rate": 4.986758779620764e-05, + "loss": 0.5331, + "num_input_tokens_seen": 10091904, + "step": 17390 + }, + { + "epoch": 2.590854929997021, + "grad_norm": 14.893830299377441, + "learning_rate": 4.986725359172365e-05, + "loss": 0.4958, + "num_input_tokens_seen": 10094656, + "step": 17395 + }, + { + "epoch": 2.5915996425379806, + "grad_norm": 9.707938194274902, + "learning_rate": 4.986691896713269e-05, + "loss": 0.6633, + "num_input_tokens_seen": 10097408, + "step": 17400 + }, + { + "epoch": 2.5923443550789393, + "grad_norm": 14.566295623779297, + "learning_rate": 4.98665839224404e-05, + "loss": 0.7417, + "num_input_tokens_seen": 10100640, + "step": 17405 + }, + { + "epoch": 2.593089067619899, + "grad_norm": 6.909084320068359, + "learning_rate": 4.9866248457652455e-05, + "loss": 0.3673, + "num_input_tokens_seen": 10103328, + "step": 17410 + }, + { + "epoch": 2.5938337801608577, + "grad_norm": 4.713314056396484, + "learning_rate": 4.986591257277451e-05, + "loss": 0.5761, + "num_input_tokens_seen": 10106144, + "step": 17415 + }, + { + "epoch": 2.594578492701817, + "grad_norm": 3.4806509017944336, + "learning_rate": 4.986557626781224e-05, + "loss": 0.5649, + "num_input_tokens_seen": 10109216, + "step": 17420 + }, + { + "epoch": 2.595323205242776, + "grad_norm": 11.910219192504883, + "learning_rate": 4.9865239542771345e-05, + "loss": 0.5743, + "num_input_tokens_seen": 10112096, + "step": 17425 + }, + { + "epoch": 2.5960679177837354, + "grad_norm": 3.5956037044525146, + "learning_rate": 4.986490239765749e-05, + "loss": 0.5698, + "num_input_tokens_seen": 10114880, + "step": 17430 + }, + { + "epoch": 2.5968126303246946, + "grad_norm": 10.740285873413086, + "learning_rate": 4.9864564832476386e-05, + "loss": 0.372, + "num_input_tokens_seen": 10117824, + "step": 17435 + }, + { + "epoch": 2.5975573428656538, + "grad_norm": 9.555791854858398, + "learning_rate": 4.986422684723373e-05, + "loss": 0.933, + "num_input_tokens_seen": 10120736, + "step": 17440 + }, + { + "epoch": 2.598302055406613, + "grad_norm": 7.58923864364624, + "learning_rate": 4.986388844193523e-05, + "loss": 0.5556, + "num_input_tokens_seen": 10123456, + "step": 17445 + }, + { + "epoch": 2.599046767947572, + "grad_norm": 5.3797430992126465, + "learning_rate": 4.9863549616586604e-05, + "loss": 0.5314, + "num_input_tokens_seen": 10126464, + "step": 17450 + }, + { + "epoch": 2.5997914804885314, + "grad_norm": 6.474735736846924, + "learning_rate": 4.986321037119358e-05, + "loss": 0.6973, + "num_input_tokens_seen": 10129408, + "step": 17455 + }, + { + "epoch": 2.6005361930294906, + "grad_norm": 4.681642532348633, + "learning_rate": 4.986287070576188e-05, + "loss": 0.5924, + "num_input_tokens_seen": 10132160, + "step": 17460 + }, + { + "epoch": 2.6012809055704498, + "grad_norm": 6.964779853820801, + "learning_rate": 4.986253062029725e-05, + "loss": 0.6982, + "num_input_tokens_seen": 10135200, + "step": 17465 + }, + { + "epoch": 2.602025618111409, + "grad_norm": 6.620599746704102, + "learning_rate": 4.986219011480544e-05, + "loss": 0.5249, + "num_input_tokens_seen": 10138112, + "step": 17470 + }, + { + "epoch": 2.602770330652368, + "grad_norm": 3.9475433826446533, + "learning_rate": 4.98618491892922e-05, + "loss": 0.4592, + "num_input_tokens_seen": 10141184, + "step": 17475 + }, + { + "epoch": 2.6035150431933274, + "grad_norm": 5.505148887634277, + "learning_rate": 4.986150784376328e-05, + "loss": 0.5061, + "num_input_tokens_seen": 10143840, + "step": 17480 + }, + { + "epoch": 2.6042597557342866, + "grad_norm": 6.293529033660889, + "learning_rate": 4.986116607822445e-05, + "loss": 0.4012, + "num_input_tokens_seen": 10146848, + "step": 17485 + }, + { + "epoch": 2.605004468275246, + "grad_norm": 11.205371856689453, + "learning_rate": 4.9860823892681496e-05, + "loss": 0.6673, + "num_input_tokens_seen": 10149952, + "step": 17490 + }, + { + "epoch": 2.605749180816205, + "grad_norm": 4.122776508331299, + "learning_rate": 4.986048128714019e-05, + "loss": 0.5752, + "num_input_tokens_seen": 10152608, + "step": 17495 + }, + { + "epoch": 2.606493893357164, + "grad_norm": 8.502262115478516, + "learning_rate": 4.986013826160631e-05, + "loss": 0.627, + "num_input_tokens_seen": 10156384, + "step": 17500 + }, + { + "epoch": 2.6072386058981234, + "grad_norm": 7.181716442108154, + "learning_rate": 4.985979481608567e-05, + "loss": 0.5599, + "num_input_tokens_seen": 10159552, + "step": 17505 + }, + { + "epoch": 2.6079833184390826, + "grad_norm": 15.769152641296387, + "learning_rate": 4.9859450950584056e-05, + "loss": 0.6637, + "num_input_tokens_seen": 10162336, + "step": 17510 + }, + { + "epoch": 2.608728030980042, + "grad_norm": 8.564674377441406, + "learning_rate": 4.9859106665107294e-05, + "loss": 0.3956, + "num_input_tokens_seen": 10165504, + "step": 17515 + }, + { + "epoch": 2.609472743521001, + "grad_norm": 11.237431526184082, + "learning_rate": 4.985876195966118e-05, + "loss": 0.7607, + "num_input_tokens_seen": 10168288, + "step": 17520 + }, + { + "epoch": 2.61021745606196, + "grad_norm": 3.8668153285980225, + "learning_rate": 4.985841683425155e-05, + "loss": 0.2993, + "num_input_tokens_seen": 10170880, + "step": 17525 + }, + { + "epoch": 2.6109621686029194, + "grad_norm": 4.531137943267822, + "learning_rate": 4.9858071288884236e-05, + "loss": 0.8082, + "num_input_tokens_seen": 10173984, + "step": 17530 + }, + { + "epoch": 2.6117068811438786, + "grad_norm": 13.333683013916016, + "learning_rate": 4.985772532356507e-05, + "loss": 0.6631, + "num_input_tokens_seen": 10176800, + "step": 17535 + }, + { + "epoch": 2.612451593684838, + "grad_norm": 5.983775615692139, + "learning_rate": 4.9857378938299895e-05, + "loss": 0.5075, + "num_input_tokens_seen": 10179872, + "step": 17540 + }, + { + "epoch": 2.613196306225797, + "grad_norm": 5.841287612915039, + "learning_rate": 4.985703213309457e-05, + "loss": 0.6363, + "num_input_tokens_seen": 10182816, + "step": 17545 + }, + { + "epoch": 2.6139410187667558, + "grad_norm": 5.549128532409668, + "learning_rate": 4.9856684907954955e-05, + "loss": 0.6521, + "num_input_tokens_seen": 10185600, + "step": 17550 + }, + { + "epoch": 2.6146857313077154, + "grad_norm": 10.239112854003906, + "learning_rate": 4.985633726288691e-05, + "loss": 0.4515, + "num_input_tokens_seen": 10188384, + "step": 17555 + }, + { + "epoch": 2.615430443848674, + "grad_norm": 7.61749267578125, + "learning_rate": 4.985598919789631e-05, + "loss": 0.4081, + "num_input_tokens_seen": 10191488, + "step": 17560 + }, + { + "epoch": 2.616175156389634, + "grad_norm": 5.535088062286377, + "learning_rate": 4.9855640712989035e-05, + "loss": 0.5676, + "num_input_tokens_seen": 10194304, + "step": 17565 + }, + { + "epoch": 2.6169198689305926, + "grad_norm": 4.390429496765137, + "learning_rate": 4.9855291808170966e-05, + "loss": 0.5569, + "num_input_tokens_seen": 10196960, + "step": 17570 + }, + { + "epoch": 2.6176645814715522, + "grad_norm": 9.123336791992188, + "learning_rate": 4.985494248344801e-05, + "loss": 0.5823, + "num_input_tokens_seen": 10199936, + "step": 17575 + }, + { + "epoch": 2.618409294012511, + "grad_norm": 7.516604900360107, + "learning_rate": 4.9854592738826054e-05, + "loss": 0.5568, + "num_input_tokens_seen": 10202912, + "step": 17580 + }, + { + "epoch": 2.6191540065534706, + "grad_norm": 10.930060386657715, + "learning_rate": 4.985424257431103e-05, + "loss": 0.6127, + "num_input_tokens_seen": 10205888, + "step": 17585 + }, + { + "epoch": 2.6198987190944294, + "grad_norm": 5.230993270874023, + "learning_rate": 4.985389198990883e-05, + "loss": 0.4999, + "num_input_tokens_seen": 10209888, + "step": 17590 + }, + { + "epoch": 2.6206434316353886, + "grad_norm": 2.4646618366241455, + "learning_rate": 4.985354098562538e-05, + "loss": 0.2846, + "num_input_tokens_seen": 10212800, + "step": 17595 + }, + { + "epoch": 2.621388144176348, + "grad_norm": 13.56930923461914, + "learning_rate": 4.985318956146662e-05, + "loss": 0.4414, + "num_input_tokens_seen": 10215584, + "step": 17600 + }, + { + "epoch": 2.622132856717307, + "grad_norm": 3.837998867034912, + "learning_rate": 4.9852837717438485e-05, + "loss": 0.4813, + "num_input_tokens_seen": 10218656, + "step": 17605 + }, + { + "epoch": 2.622877569258266, + "grad_norm": 16.893949508666992, + "learning_rate": 4.985248545354692e-05, + "loss": 0.6484, + "num_input_tokens_seen": 10221632, + "step": 17610 + }, + { + "epoch": 2.6236222817992254, + "grad_norm": 14.824777603149414, + "learning_rate": 4.985213276979785e-05, + "loss": 0.8457, + "num_input_tokens_seen": 10224384, + "step": 17615 + }, + { + "epoch": 2.6243669943401846, + "grad_norm": 7.07521390914917, + "learning_rate": 4.985177966619727e-05, + "loss": 0.4911, + "num_input_tokens_seen": 10227392, + "step": 17620 + }, + { + "epoch": 2.625111706881144, + "grad_norm": 6.703554153442383, + "learning_rate": 4.985142614275114e-05, + "loss": 0.5836, + "num_input_tokens_seen": 10230080, + "step": 17625 + }, + { + "epoch": 2.625856419422103, + "grad_norm": 12.20709228515625, + "learning_rate": 4.985107219946541e-05, + "loss": 0.4247, + "num_input_tokens_seen": 10232928, + "step": 17630 + }, + { + "epoch": 2.626601131963062, + "grad_norm": 11.84780502319336, + "learning_rate": 4.985071783634608e-05, + "loss": 0.7797, + "num_input_tokens_seen": 10236000, + "step": 17635 + }, + { + "epoch": 2.6273458445040214, + "grad_norm": 7.415883541107178, + "learning_rate": 4.985036305339913e-05, + "loss": 0.643, + "num_input_tokens_seen": 10239008, + "step": 17640 + }, + { + "epoch": 2.6280905570449806, + "grad_norm": 16.969982147216797, + "learning_rate": 4.9850007850630545e-05, + "loss": 0.3975, + "num_input_tokens_seen": 10241888, + "step": 17645 + }, + { + "epoch": 2.62883526958594, + "grad_norm": 9.412151336669922, + "learning_rate": 4.984965222804634e-05, + "loss": 0.5571, + "num_input_tokens_seen": 10244736, + "step": 17650 + }, + { + "epoch": 2.629579982126899, + "grad_norm": 5.901156902313232, + "learning_rate": 4.984929618565252e-05, + "loss": 0.6433, + "num_input_tokens_seen": 10247424, + "step": 17655 + }, + { + "epoch": 2.6303246946678582, + "grad_norm": 14.83957576751709, + "learning_rate": 4.9848939723455085e-05, + "loss": 0.5503, + "num_input_tokens_seen": 10250144, + "step": 17660 + }, + { + "epoch": 2.6310694072088174, + "grad_norm": 3.5826528072357178, + "learning_rate": 4.984858284146008e-05, + "loss": 0.6477, + "num_input_tokens_seen": 10253184, + "step": 17665 + }, + { + "epoch": 2.6318141197497766, + "grad_norm": 6.3964619636535645, + "learning_rate": 4.9848225539673513e-05, + "loss": 0.4763, + "num_input_tokens_seen": 10256160, + "step": 17670 + }, + { + "epoch": 2.632558832290736, + "grad_norm": 9.436077117919922, + "learning_rate": 4.9847867818101436e-05, + "loss": 0.5744, + "num_input_tokens_seen": 10259072, + "step": 17675 + }, + { + "epoch": 2.633303544831695, + "grad_norm": 8.097932815551758, + "learning_rate": 4.984750967674989e-05, + "loss": 0.5041, + "num_input_tokens_seen": 10262144, + "step": 17680 + }, + { + "epoch": 2.6340482573726542, + "grad_norm": 5.632266044616699, + "learning_rate": 4.9847151115624916e-05, + "loss": 0.5422, + "num_input_tokens_seen": 10265024, + "step": 17685 + }, + { + "epoch": 2.6347929699136134, + "grad_norm": 10.253355979919434, + "learning_rate": 4.984679213473258e-05, + "loss": 0.6262, + "num_input_tokens_seen": 10267968, + "step": 17690 + }, + { + "epoch": 2.6355376824545726, + "grad_norm": 2.8749802112579346, + "learning_rate": 4.984643273407894e-05, + "loss": 0.3294, + "num_input_tokens_seen": 10270688, + "step": 17695 + }, + { + "epoch": 2.636282394995532, + "grad_norm": 5.422008514404297, + "learning_rate": 4.984607291367007e-05, + "loss": 0.51, + "num_input_tokens_seen": 10273568, + "step": 17700 + }, + { + "epoch": 2.637027107536491, + "grad_norm": 9.217559814453125, + "learning_rate": 4.984571267351206e-05, + "loss": 0.5822, + "num_input_tokens_seen": 10276416, + "step": 17705 + }, + { + "epoch": 2.6377718200774503, + "grad_norm": 12.355748176574707, + "learning_rate": 4.984535201361098e-05, + "loss": 0.5266, + "num_input_tokens_seen": 10279104, + "step": 17710 + }, + { + "epoch": 2.6385165326184095, + "grad_norm": 2.261732339859009, + "learning_rate": 4.984499093397294e-05, + "loss": 0.3289, + "num_input_tokens_seen": 10282112, + "step": 17715 + }, + { + "epoch": 2.6392612451593687, + "grad_norm": 11.143941879272461, + "learning_rate": 4.984462943460402e-05, + "loss": 0.4689, + "num_input_tokens_seen": 10285216, + "step": 17720 + }, + { + "epoch": 2.6400059577003274, + "grad_norm": 8.950729370117188, + "learning_rate": 4.984426751551033e-05, + "loss": 0.5823, + "num_input_tokens_seen": 10287904, + "step": 17725 + }, + { + "epoch": 2.640750670241287, + "grad_norm": 5.165647983551025, + "learning_rate": 4.9843905176698004e-05, + "loss": 0.576, + "num_input_tokens_seen": 10290912, + "step": 17730 + }, + { + "epoch": 2.641495382782246, + "grad_norm": 10.271660804748535, + "learning_rate": 4.984354241817314e-05, + "loss": 0.6343, + "num_input_tokens_seen": 10294048, + "step": 17735 + }, + { + "epoch": 2.6422400953232055, + "grad_norm": 13.31067180633545, + "learning_rate": 4.984317923994188e-05, + "loss": 0.53, + "num_input_tokens_seen": 10297056, + "step": 17740 + }, + { + "epoch": 2.6429848078641642, + "grad_norm": 5.06058931350708, + "learning_rate": 4.984281564201036e-05, + "loss": 0.5504, + "num_input_tokens_seen": 10299776, + "step": 17745 + }, + { + "epoch": 2.643729520405124, + "grad_norm": 3.0822596549987793, + "learning_rate": 4.9842451624384715e-05, + "loss": 0.4634, + "num_input_tokens_seen": 10302720, + "step": 17750 + }, + { + "epoch": 2.6444742329460826, + "grad_norm": 9.533676147460938, + "learning_rate": 4.98420871870711e-05, + "loss": 0.7402, + "num_input_tokens_seen": 10305472, + "step": 17755 + }, + { + "epoch": 2.645218945487042, + "grad_norm": 7.011179447174072, + "learning_rate": 4.984172233007567e-05, + "loss": 0.739, + "num_input_tokens_seen": 10308192, + "step": 17760 + }, + { + "epoch": 2.645963658028001, + "grad_norm": 8.010783195495605, + "learning_rate": 4.984135705340459e-05, + "loss": 0.5641, + "num_input_tokens_seen": 10311296, + "step": 17765 + }, + { + "epoch": 2.6467083705689602, + "grad_norm": 7.679373741149902, + "learning_rate": 4.984099135706402e-05, + "loss": 0.4908, + "num_input_tokens_seen": 10313984, + "step": 17770 + }, + { + "epoch": 2.6474530831099194, + "grad_norm": 6.043315887451172, + "learning_rate": 4.984062524106017e-05, + "loss": 0.542, + "num_input_tokens_seen": 10316768, + "step": 17775 + }, + { + "epoch": 2.6481977956508786, + "grad_norm": 19.653562545776367, + "learning_rate": 4.984025870539919e-05, + "loss": 0.5558, + "num_input_tokens_seen": 10319776, + "step": 17780 + }, + { + "epoch": 2.648942508191838, + "grad_norm": 8.3969144821167, + "learning_rate": 4.983989175008729e-05, + "loss": 0.5592, + "num_input_tokens_seen": 10322368, + "step": 17785 + }, + { + "epoch": 2.649687220732797, + "grad_norm": 3.2459144592285156, + "learning_rate": 4.983952437513066e-05, + "loss": 0.5099, + "num_input_tokens_seen": 10325600, + "step": 17790 + }, + { + "epoch": 2.6504319332737563, + "grad_norm": 6.637663841247559, + "learning_rate": 4.983915658053551e-05, + "loss": 0.6155, + "num_input_tokens_seen": 10328608, + "step": 17795 + }, + { + "epoch": 2.6511766458147155, + "grad_norm": 7.21382474899292, + "learning_rate": 4.983878836630806e-05, + "loss": 0.3742, + "num_input_tokens_seen": 10331456, + "step": 17800 + }, + { + "epoch": 2.6519213583556747, + "grad_norm": 5.613070011138916, + "learning_rate": 4.983841973245452e-05, + "loss": 0.5011, + "num_input_tokens_seen": 10334336, + "step": 17805 + }, + { + "epoch": 2.652666070896634, + "grad_norm": 6.397650241851807, + "learning_rate": 4.983805067898113e-05, + "loss": 0.6551, + "num_input_tokens_seen": 10337152, + "step": 17810 + }, + { + "epoch": 2.653410783437593, + "grad_norm": 3.2329297065734863, + "learning_rate": 4.983768120589411e-05, + "loss": 0.5321, + "num_input_tokens_seen": 10340000, + "step": 17815 + }, + { + "epoch": 2.6541554959785523, + "grad_norm": 5.3939528465271, + "learning_rate": 4.983731131319972e-05, + "loss": 0.6221, + "num_input_tokens_seen": 10342880, + "step": 17820 + }, + { + "epoch": 2.6549002085195115, + "grad_norm": 3.8631632328033447, + "learning_rate": 4.98369410009042e-05, + "loss": 0.3451, + "num_input_tokens_seen": 10345824, + "step": 17825 + }, + { + "epoch": 2.6556449210604707, + "grad_norm": 18.558637619018555, + "learning_rate": 4.9836570269013796e-05, + "loss": 0.5508, + "num_input_tokens_seen": 10348608, + "step": 17830 + }, + { + "epoch": 2.65638963360143, + "grad_norm": 4.738819599151611, + "learning_rate": 4.983619911753478e-05, + "loss": 0.3385, + "num_input_tokens_seen": 10351648, + "step": 17835 + }, + { + "epoch": 2.657134346142389, + "grad_norm": 7.061539173126221, + "learning_rate": 4.983582754647343e-05, + "loss": 0.4725, + "num_input_tokens_seen": 10354752, + "step": 17840 + }, + { + "epoch": 2.6578790586833483, + "grad_norm": 4.144767761230469, + "learning_rate": 4.983545555583601e-05, + "loss": 0.768, + "num_input_tokens_seen": 10357632, + "step": 17845 + }, + { + "epoch": 2.6586237712243075, + "grad_norm": 7.100921630859375, + "learning_rate": 4.9835083145628816e-05, + "loss": 0.8194, + "num_input_tokens_seen": 10360736, + "step": 17850 + }, + { + "epoch": 2.6593684837652667, + "grad_norm": 6.406223297119141, + "learning_rate": 4.9834710315858125e-05, + "loss": 0.4649, + "num_input_tokens_seen": 10363520, + "step": 17855 + }, + { + "epoch": 2.660113196306226, + "grad_norm": 8.300481796264648, + "learning_rate": 4.983433706653024e-05, + "loss": 0.3039, + "num_input_tokens_seen": 10366400, + "step": 17860 + }, + { + "epoch": 2.660857908847185, + "grad_norm": 7.873573303222656, + "learning_rate": 4.9833963397651485e-05, + "loss": 0.6477, + "num_input_tokens_seen": 10369248, + "step": 17865 + }, + { + "epoch": 2.6616026213881443, + "grad_norm": 4.863770008087158, + "learning_rate": 4.9833589309228154e-05, + "loss": 0.488, + "num_input_tokens_seen": 10372160, + "step": 17870 + }, + { + "epoch": 2.6623473339291035, + "grad_norm": 6.934057235717773, + "learning_rate": 4.9833214801266565e-05, + "loss": 0.5387, + "num_input_tokens_seen": 10374848, + "step": 17875 + }, + { + "epoch": 2.6630920464700627, + "grad_norm": 10.391139030456543, + "learning_rate": 4.9832839873773054e-05, + "loss": 0.6766, + "num_input_tokens_seen": 10377600, + "step": 17880 + }, + { + "epoch": 2.663836759011022, + "grad_norm": 6.790217876434326, + "learning_rate": 4.983246452675395e-05, + "loss": 0.4388, + "num_input_tokens_seen": 10380736, + "step": 17885 + }, + { + "epoch": 2.6645814715519807, + "grad_norm": 4.027559280395508, + "learning_rate": 4.983208876021561e-05, + "loss": 0.6311, + "num_input_tokens_seen": 10383808, + "step": 17890 + }, + { + "epoch": 2.6653261840929403, + "grad_norm": 7.743094444274902, + "learning_rate": 4.983171257416436e-05, + "loss": 0.4951, + "num_input_tokens_seen": 10386720, + "step": 17895 + }, + { + "epoch": 2.666070896633899, + "grad_norm": 6.708654403686523, + "learning_rate": 4.983133596860656e-05, + "loss": 0.4408, + "num_input_tokens_seen": 10389696, + "step": 17900 + }, + { + "epoch": 2.6668156091748587, + "grad_norm": 6.270368576049805, + "learning_rate": 4.983095894354858e-05, + "loss": 0.4949, + "num_input_tokens_seen": 10392544, + "step": 17905 + }, + { + "epoch": 2.6675603217158175, + "grad_norm": 10.489998817443848, + "learning_rate": 4.9830581498996784e-05, + "loss": 0.6, + "num_input_tokens_seen": 10395296, + "step": 17910 + }, + { + "epoch": 2.668305034256777, + "grad_norm": 8.034064292907715, + "learning_rate": 4.983020363495755e-05, + "loss": 0.3473, + "num_input_tokens_seen": 10398304, + "step": 17915 + }, + { + "epoch": 2.669049746797736, + "grad_norm": 12.404934883117676, + "learning_rate": 4.982982535143727e-05, + "loss": 0.7958, + "num_input_tokens_seen": 10401088, + "step": 17920 + }, + { + "epoch": 2.6697944593386955, + "grad_norm": 7.92365837097168, + "learning_rate": 4.982944664844231e-05, + "loss": 0.3889, + "num_input_tokens_seen": 10403808, + "step": 17925 + }, + { + "epoch": 2.6705391718796543, + "grad_norm": 10.442118644714355, + "learning_rate": 4.98290675259791e-05, + "loss": 0.4552, + "num_input_tokens_seen": 10406752, + "step": 17930 + }, + { + "epoch": 2.6712838844206135, + "grad_norm": 7.677522659301758, + "learning_rate": 4.9828687984054015e-05, + "loss": 0.4964, + "num_input_tokens_seen": 10409440, + "step": 17935 + }, + { + "epoch": 2.6720285969615727, + "grad_norm": 7.265384197235107, + "learning_rate": 4.9828308022673494e-05, + "loss": 0.5813, + "num_input_tokens_seen": 10412128, + "step": 17940 + }, + { + "epoch": 2.672773309502532, + "grad_norm": 15.261503219604492, + "learning_rate": 4.9827927641843944e-05, + "loss": 0.5248, + "num_input_tokens_seen": 10414752, + "step": 17945 + }, + { + "epoch": 2.673518022043491, + "grad_norm": 3.7306578159332275, + "learning_rate": 4.982754684157178e-05, + "loss": 0.3515, + "num_input_tokens_seen": 10417312, + "step": 17950 + }, + { + "epoch": 2.6742627345844503, + "grad_norm": 11.23680305480957, + "learning_rate": 4.982716562186345e-05, + "loss": 0.4176, + "num_input_tokens_seen": 10420288, + "step": 17955 + }, + { + "epoch": 2.6750074471254095, + "grad_norm": 9.713407516479492, + "learning_rate": 4.982678398272539e-05, + "loss": 0.6286, + "num_input_tokens_seen": 10423264, + "step": 17960 + }, + { + "epoch": 2.6757521596663687, + "grad_norm": 9.174901962280273, + "learning_rate": 4.982640192416404e-05, + "loss": 0.3874, + "num_input_tokens_seen": 10426336, + "step": 17965 + }, + { + "epoch": 2.676496872207328, + "grad_norm": 7.419769763946533, + "learning_rate": 4.982601944618588e-05, + "loss": 0.4513, + "num_input_tokens_seen": 10429312, + "step": 17970 + }, + { + "epoch": 2.677241584748287, + "grad_norm": 6.423212051391602, + "learning_rate": 4.982563654879734e-05, + "loss": 0.4446, + "num_input_tokens_seen": 10432256, + "step": 17975 + }, + { + "epoch": 2.6779862972892463, + "grad_norm": 17.353824615478516, + "learning_rate": 4.982525323200491e-05, + "loss": 0.8142, + "num_input_tokens_seen": 10434880, + "step": 17980 + }, + { + "epoch": 2.6787310098302055, + "grad_norm": 12.214193344116211, + "learning_rate": 4.982486949581505e-05, + "loss": 0.931, + "num_input_tokens_seen": 10437920, + "step": 17985 + }, + { + "epoch": 2.6794757223711647, + "grad_norm": 5.372147083282471, + "learning_rate": 4.982448534023426e-05, + "loss": 0.6713, + "num_input_tokens_seen": 10440928, + "step": 17990 + }, + { + "epoch": 2.680220434912124, + "grad_norm": 5.943185329437256, + "learning_rate": 4.982410076526901e-05, + "loss": 0.549, + "num_input_tokens_seen": 10443520, + "step": 17995 + }, + { + "epoch": 2.680965147453083, + "grad_norm": 4.796084880828857, + "learning_rate": 4.9823715770925814e-05, + "loss": 0.4077, + "num_input_tokens_seen": 10446464, + "step": 18000 + }, + { + "epoch": 2.6817098599940423, + "grad_norm": 11.854656219482422, + "learning_rate": 4.982333035721117e-05, + "loss": 0.5717, + "num_input_tokens_seen": 10449152, + "step": 18005 + }, + { + "epoch": 2.6824545725350015, + "grad_norm": 6.02214241027832, + "learning_rate": 4.982294452413159e-05, + "loss": 0.5123, + "num_input_tokens_seen": 10451776, + "step": 18010 + }, + { + "epoch": 2.6831992850759607, + "grad_norm": 9.239291191101074, + "learning_rate": 4.982255827169359e-05, + "loss": 0.6135, + "num_input_tokens_seen": 10454432, + "step": 18015 + }, + { + "epoch": 2.68394399761692, + "grad_norm": 3.7157142162323, + "learning_rate": 4.982217159990369e-05, + "loss": 0.5032, + "num_input_tokens_seen": 10457184, + "step": 18020 + }, + { + "epoch": 2.684688710157879, + "grad_norm": 21.200632095336914, + "learning_rate": 4.982178450876843e-05, + "loss": 0.6298, + "num_input_tokens_seen": 10460064, + "step": 18025 + }, + { + "epoch": 2.6854334226988383, + "grad_norm": 6.230946063995361, + "learning_rate": 4.9821396998294356e-05, + "loss": 0.4984, + "num_input_tokens_seen": 10462848, + "step": 18030 + }, + { + "epoch": 2.6861781352397975, + "grad_norm": 8.758615493774414, + "learning_rate": 4.982100906848801e-05, + "loss": 0.3846, + "num_input_tokens_seen": 10465472, + "step": 18035 + }, + { + "epoch": 2.6869228477807567, + "grad_norm": 5.05958366394043, + "learning_rate": 4.9820620719355934e-05, + "loss": 0.5227, + "num_input_tokens_seen": 10468384, + "step": 18040 + }, + { + "epoch": 2.687667560321716, + "grad_norm": 5.3555755615234375, + "learning_rate": 4.982023195090469e-05, + "loss": 0.5508, + "num_input_tokens_seen": 10471424, + "step": 18045 + }, + { + "epoch": 2.688412272862675, + "grad_norm": 2.9634809494018555, + "learning_rate": 4.981984276314087e-05, + "loss": 0.5591, + "num_input_tokens_seen": 10474272, + "step": 18050 + }, + { + "epoch": 2.6891569854036343, + "grad_norm": 7.321560859680176, + "learning_rate": 4.981945315607103e-05, + "loss": 0.5739, + "num_input_tokens_seen": 10477312, + "step": 18055 + }, + { + "epoch": 2.6899016979445936, + "grad_norm": 6.344597816467285, + "learning_rate": 4.981906312970175e-05, + "loss": 0.4881, + "num_input_tokens_seen": 10480128, + "step": 18060 + }, + { + "epoch": 2.6906464104855523, + "grad_norm": 16.541894912719727, + "learning_rate": 4.981867268403962e-05, + "loss": 0.5795, + "num_input_tokens_seen": 10483264, + "step": 18065 + }, + { + "epoch": 2.691391123026512, + "grad_norm": 14.750670433044434, + "learning_rate": 4.981828181909124e-05, + "loss": 0.6271, + "num_input_tokens_seen": 10486272, + "step": 18070 + }, + { + "epoch": 2.6921358355674707, + "grad_norm": 8.02044677734375, + "learning_rate": 4.981789053486322e-05, + "loss": 0.4371, + "num_input_tokens_seen": 10489344, + "step": 18075 + }, + { + "epoch": 2.6928805481084304, + "grad_norm": 13.270606994628906, + "learning_rate": 4.981749883136215e-05, + "loss": 0.5761, + "num_input_tokens_seen": 10492128, + "step": 18080 + }, + { + "epoch": 2.693625260649389, + "grad_norm": 7.301458835601807, + "learning_rate": 4.981710670859467e-05, + "loss": 0.5176, + "num_input_tokens_seen": 10495104, + "step": 18085 + }, + { + "epoch": 2.6943699731903488, + "grad_norm": 5.926111221313477, + "learning_rate": 4.98167141665674e-05, + "loss": 0.5887, + "num_input_tokens_seen": 10497888, + "step": 18090 + }, + { + "epoch": 2.6951146857313075, + "grad_norm": 9.491632461547852, + "learning_rate": 4.981632120528696e-05, + "loss": 0.5614, + "num_input_tokens_seen": 10500864, + "step": 18095 + }, + { + "epoch": 2.695859398272267, + "grad_norm": 7.271105766296387, + "learning_rate": 4.981592782476e-05, + "loss": 0.5724, + "num_input_tokens_seen": 10503456, + "step": 18100 + }, + { + "epoch": 2.696604110813226, + "grad_norm": 16.49163055419922, + "learning_rate": 4.981553402499316e-05, + "loss": 0.5015, + "num_input_tokens_seen": 10506784, + "step": 18105 + }, + { + "epoch": 2.697348823354185, + "grad_norm": 5.1339287757873535, + "learning_rate": 4.9815139805993086e-05, + "loss": 0.781, + "num_input_tokens_seen": 10509728, + "step": 18110 + }, + { + "epoch": 2.6980935358951443, + "grad_norm": 9.853870391845703, + "learning_rate": 4.9814745167766455e-05, + "loss": 0.5073, + "num_input_tokens_seen": 10512512, + "step": 18115 + }, + { + "epoch": 2.6988382484361035, + "grad_norm": 4.314340591430664, + "learning_rate": 4.981435011031992e-05, + "loss": 0.6831, + "num_input_tokens_seen": 10515488, + "step": 18120 + }, + { + "epoch": 2.6995829609770627, + "grad_norm": 12.044425964355469, + "learning_rate": 4.9813954633660166e-05, + "loss": 0.6365, + "num_input_tokens_seen": 10518336, + "step": 18125 + }, + { + "epoch": 2.700327673518022, + "grad_norm": 3.275686740875244, + "learning_rate": 4.9813558737793865e-05, + "loss": 0.4524, + "num_input_tokens_seen": 10521248, + "step": 18130 + }, + { + "epoch": 2.701072386058981, + "grad_norm": 4.43001651763916, + "learning_rate": 4.9813162422727705e-05, + "loss": 0.5867, + "num_input_tokens_seen": 10523968, + "step": 18135 + }, + { + "epoch": 2.7018170985999403, + "grad_norm": 4.251560211181641, + "learning_rate": 4.981276568846839e-05, + "loss": 0.4268, + "num_input_tokens_seen": 10526624, + "step": 18140 + }, + { + "epoch": 2.7025618111408996, + "grad_norm": 6.380756855010986, + "learning_rate": 4.981236853502261e-05, + "loss": 0.4971, + "num_input_tokens_seen": 10529344, + "step": 18145 + }, + { + "epoch": 2.7033065236818588, + "grad_norm": 4.662021160125732, + "learning_rate": 4.9811970962397095e-05, + "loss": 0.625, + "num_input_tokens_seen": 10532224, + "step": 18150 + }, + { + "epoch": 2.704051236222818, + "grad_norm": 4.710285663604736, + "learning_rate": 4.981157297059853e-05, + "loss": 0.4527, + "num_input_tokens_seen": 10535040, + "step": 18155 + }, + { + "epoch": 2.704795948763777, + "grad_norm": 9.265342712402344, + "learning_rate": 4.981117455963367e-05, + "loss": 0.6705, + "num_input_tokens_seen": 10537888, + "step": 18160 + }, + { + "epoch": 2.7055406613047364, + "grad_norm": 4.55898380279541, + "learning_rate": 4.981077572950923e-05, + "loss": 0.6297, + "num_input_tokens_seen": 10540640, + "step": 18165 + }, + { + "epoch": 2.7062853738456956, + "grad_norm": 10.47840404510498, + "learning_rate": 4.9810376480231944e-05, + "loss": 0.5276, + "num_input_tokens_seen": 10543552, + "step": 18170 + }, + { + "epoch": 2.7070300863866548, + "grad_norm": 6.71350622177124, + "learning_rate": 4.980997681180858e-05, + "loss": 0.512, + "num_input_tokens_seen": 10546432, + "step": 18175 + }, + { + "epoch": 2.707774798927614, + "grad_norm": 8.202059745788574, + "learning_rate": 4.980957672424586e-05, + "loss": 0.6153, + "num_input_tokens_seen": 10549408, + "step": 18180 + }, + { + "epoch": 2.708519511468573, + "grad_norm": 9.461752891540527, + "learning_rate": 4.980917621755056e-05, + "loss": 0.7565, + "num_input_tokens_seen": 10552224, + "step": 18185 + }, + { + "epoch": 2.7092642240095324, + "grad_norm": 5.2112603187561035, + "learning_rate": 4.9808775291729445e-05, + "loss": 0.5176, + "num_input_tokens_seen": 10554848, + "step": 18190 + }, + { + "epoch": 2.7100089365504916, + "grad_norm": 5.3252339363098145, + "learning_rate": 4.980837394678928e-05, + "loss": 0.5485, + "num_input_tokens_seen": 10557760, + "step": 18195 + }, + { + "epoch": 2.710753649091451, + "grad_norm": 8.992074966430664, + "learning_rate": 4.980797218273685e-05, + "loss": 0.4583, + "num_input_tokens_seen": 10560992, + "step": 18200 + }, + { + "epoch": 2.71149836163241, + "grad_norm": 18.96157455444336, + "learning_rate": 4.980756999957895e-05, + "loss": 0.5004, + "num_input_tokens_seen": 10563488, + "step": 18205 + }, + { + "epoch": 2.712243074173369, + "grad_norm": 6.96356725692749, + "learning_rate": 4.9807167397322376e-05, + "loss": 0.6231, + "num_input_tokens_seen": 10566304, + "step": 18210 + }, + { + "epoch": 2.7129877867143284, + "grad_norm": 8.889486312866211, + "learning_rate": 4.980676437597391e-05, + "loss": 0.6863, + "num_input_tokens_seen": 10568832, + "step": 18215 + }, + { + "epoch": 2.7137324992552876, + "grad_norm": 5.628002643585205, + "learning_rate": 4.980636093554038e-05, + "loss": 0.4635, + "num_input_tokens_seen": 10571808, + "step": 18220 + }, + { + "epoch": 2.714477211796247, + "grad_norm": 2.324946403503418, + "learning_rate": 4.980595707602858e-05, + "loss": 0.2908, + "num_input_tokens_seen": 10574624, + "step": 18225 + }, + { + "epoch": 2.715221924337206, + "grad_norm": 5.885067462921143, + "learning_rate": 4.980555279744535e-05, + "loss": 0.7051, + "num_input_tokens_seen": 10577632, + "step": 18230 + }, + { + "epoch": 2.715966636878165, + "grad_norm": 8.76275634765625, + "learning_rate": 4.980514809979753e-05, + "loss": 0.7448, + "num_input_tokens_seen": 10580352, + "step": 18235 + }, + { + "epoch": 2.716711349419124, + "grad_norm": 5.866318225860596, + "learning_rate": 4.9804742983091934e-05, + "loss": 0.6123, + "num_input_tokens_seen": 10583328, + "step": 18240 + }, + { + "epoch": 2.7174560619600836, + "grad_norm": 5.924191474914551, + "learning_rate": 4.9804337447335414e-05, + "loss": 0.6268, + "num_input_tokens_seen": 10586144, + "step": 18245 + }, + { + "epoch": 2.7182007745010424, + "grad_norm": 10.193225860595703, + "learning_rate": 4.980393149253483e-05, + "loss": 0.5161, + "num_input_tokens_seen": 10589408, + "step": 18250 + }, + { + "epoch": 2.718945487042002, + "grad_norm": 6.669487476348877, + "learning_rate": 4.980352511869703e-05, + "loss": 0.4149, + "num_input_tokens_seen": 10592480, + "step": 18255 + }, + { + "epoch": 2.7196901995829608, + "grad_norm": 22.760486602783203, + "learning_rate": 4.980311832582888e-05, + "loss": 0.6771, + "num_input_tokens_seen": 10595488, + "step": 18260 + }, + { + "epoch": 2.7204349121239204, + "grad_norm": 20.374279022216797, + "learning_rate": 4.980271111393726e-05, + "loss": 0.7762, + "num_input_tokens_seen": 10598880, + "step": 18265 + }, + { + "epoch": 2.721179624664879, + "grad_norm": 8.047669410705566, + "learning_rate": 4.980230348302904e-05, + "loss": 0.5125, + "num_input_tokens_seen": 10601632, + "step": 18270 + }, + { + "epoch": 2.721924337205839, + "grad_norm": 6.028156757354736, + "learning_rate": 4.9801895433111115e-05, + "loss": 0.7229, + "num_input_tokens_seen": 10604544, + "step": 18275 + }, + { + "epoch": 2.7226690497467976, + "grad_norm": 13.34218978881836, + "learning_rate": 4.9801486964190366e-05, + "loss": 0.6086, + "num_input_tokens_seen": 10607904, + "step": 18280 + }, + { + "epoch": 2.723413762287757, + "grad_norm": 7.51617431640625, + "learning_rate": 4.9801078076273704e-05, + "loss": 0.5488, + "num_input_tokens_seen": 10610752, + "step": 18285 + }, + { + "epoch": 2.724158474828716, + "grad_norm": 6.403872966766357, + "learning_rate": 4.980066876936804e-05, + "loss": 0.4698, + "num_input_tokens_seen": 10613376, + "step": 18290 + }, + { + "epoch": 2.724903187369675, + "grad_norm": 7.7394700050354, + "learning_rate": 4.980025904348028e-05, + "loss": 0.7665, + "num_input_tokens_seen": 10616064, + "step": 18295 + }, + { + "epoch": 2.7256478999106344, + "grad_norm": 9.904979705810547, + "learning_rate": 4.979984889861735e-05, + "loss": 0.5533, + "num_input_tokens_seen": 10618848, + "step": 18300 + }, + { + "epoch": 2.7263926124515936, + "grad_norm": 4.838565349578857, + "learning_rate": 4.9799438334786174e-05, + "loss": 0.4024, + "num_input_tokens_seen": 10621568, + "step": 18305 + }, + { + "epoch": 2.727137324992553, + "grad_norm": 7.025132656097412, + "learning_rate": 4.97990273519937e-05, + "loss": 0.387, + "num_input_tokens_seen": 10624448, + "step": 18310 + }, + { + "epoch": 2.727882037533512, + "grad_norm": 6.422702789306641, + "learning_rate": 4.9798615950246855e-05, + "loss": 0.6441, + "num_input_tokens_seen": 10627648, + "step": 18315 + }, + { + "epoch": 2.728626750074471, + "grad_norm": 10.608174324035645, + "learning_rate": 4.97982041295526e-05, + "loss": 0.5984, + "num_input_tokens_seen": 10630400, + "step": 18320 + }, + { + "epoch": 2.7293714626154304, + "grad_norm": 8.79152774810791, + "learning_rate": 4.97977918899179e-05, + "loss": 0.6222, + "num_input_tokens_seen": 10633248, + "step": 18325 + }, + { + "epoch": 2.7301161751563896, + "grad_norm": 10.105561256408691, + "learning_rate": 4.97973792313497e-05, + "loss": 0.581, + "num_input_tokens_seen": 10636448, + "step": 18330 + }, + { + "epoch": 2.730860887697349, + "grad_norm": 7.200457572937012, + "learning_rate": 4.979696615385499e-05, + "loss": 0.5516, + "num_input_tokens_seen": 10639136, + "step": 18335 + }, + { + "epoch": 2.731605600238308, + "grad_norm": 7.415533065795898, + "learning_rate": 4.979655265744072e-05, + "loss": 0.5567, + "num_input_tokens_seen": 10641952, + "step": 18340 + }, + { + "epoch": 2.732350312779267, + "grad_norm": 12.9534912109375, + "learning_rate": 4.979613874211391e-05, + "loss": 0.695, + "num_input_tokens_seen": 10644480, + "step": 18345 + }, + { + "epoch": 2.7330950253202264, + "grad_norm": 4.747674465179443, + "learning_rate": 4.979572440788154e-05, + "loss": 0.4868, + "num_input_tokens_seen": 10647488, + "step": 18350 + }, + { + "epoch": 2.7338397378611856, + "grad_norm": 7.324033260345459, + "learning_rate": 4.97953096547506e-05, + "loss": 0.6478, + "num_input_tokens_seen": 10650368, + "step": 18355 + }, + { + "epoch": 2.734584450402145, + "grad_norm": 7.9534430503845215, + "learning_rate": 4.9794894482728105e-05, + "loss": 0.5994, + "num_input_tokens_seen": 10653344, + "step": 18360 + }, + { + "epoch": 2.735329162943104, + "grad_norm": 4.540970802307129, + "learning_rate": 4.979447889182107e-05, + "loss": 0.6004, + "num_input_tokens_seen": 10656352, + "step": 18365 + }, + { + "epoch": 2.7360738754840632, + "grad_norm": 9.597318649291992, + "learning_rate": 4.979406288203651e-05, + "loss": 0.6003, + "num_input_tokens_seen": 10659264, + "step": 18370 + }, + { + "epoch": 2.7368185880250224, + "grad_norm": 6.191229343414307, + "learning_rate": 4.979364645338146e-05, + "loss": 0.3898, + "num_input_tokens_seen": 10662176, + "step": 18375 + }, + { + "epoch": 2.7375633005659816, + "grad_norm": 5.090923309326172, + "learning_rate": 4.979322960586296e-05, + "loss": 0.4375, + "num_input_tokens_seen": 10665024, + "step": 18380 + }, + { + "epoch": 2.738308013106941, + "grad_norm": 14.028031349182129, + "learning_rate": 4.979281233948803e-05, + "loss": 0.5494, + "num_input_tokens_seen": 10667904, + "step": 18385 + }, + { + "epoch": 2.7390527256479, + "grad_norm": 4.863871097564697, + "learning_rate": 4.9792394654263744e-05, + "loss": 0.5534, + "num_input_tokens_seen": 10670656, + "step": 18390 + }, + { + "epoch": 2.7397974381888592, + "grad_norm": 7.156909465789795, + "learning_rate": 4.9791976550197144e-05, + "loss": 0.6399, + "num_input_tokens_seen": 10673408, + "step": 18395 + }, + { + "epoch": 2.7405421507298184, + "grad_norm": 10.962543487548828, + "learning_rate": 4.9791558027295296e-05, + "loss": 0.6101, + "num_input_tokens_seen": 10676128, + "step": 18400 + }, + { + "epoch": 2.7412868632707776, + "grad_norm": 14.625473976135254, + "learning_rate": 4.9791139085565274e-05, + "loss": 0.7239, + "num_input_tokens_seen": 10679072, + "step": 18405 + }, + { + "epoch": 2.742031575811737, + "grad_norm": 10.981926918029785, + "learning_rate": 4.9790719725014154e-05, + "loss": 0.5951, + "num_input_tokens_seen": 10682080, + "step": 18410 + }, + { + "epoch": 2.7427762883526956, + "grad_norm": 3.763282299041748, + "learning_rate": 4.979029994564902e-05, + "loss": 0.5371, + "num_input_tokens_seen": 10684864, + "step": 18415 + }, + { + "epoch": 2.7435210008936552, + "grad_norm": 14.914880752563477, + "learning_rate": 4.978987974747697e-05, + "loss": 0.628, + "num_input_tokens_seen": 10687488, + "step": 18420 + }, + { + "epoch": 2.744265713434614, + "grad_norm": 6.872268199920654, + "learning_rate": 4.9789459130505086e-05, + "loss": 0.3772, + "num_input_tokens_seen": 10690560, + "step": 18425 + }, + { + "epoch": 2.7450104259755737, + "grad_norm": 7.992705345153809, + "learning_rate": 4.97890380947405e-05, + "loss": 0.3974, + "num_input_tokens_seen": 10693632, + "step": 18430 + }, + { + "epoch": 2.7457551385165324, + "grad_norm": 10.839364051818848, + "learning_rate": 4.97886166401903e-05, + "loss": 0.5054, + "num_input_tokens_seen": 10696640, + "step": 18435 + }, + { + "epoch": 2.746499851057492, + "grad_norm": 13.003988265991211, + "learning_rate": 4.978819476686162e-05, + "loss": 0.7395, + "num_input_tokens_seen": 10699456, + "step": 18440 + }, + { + "epoch": 2.747244563598451, + "grad_norm": 3.5736396312713623, + "learning_rate": 4.9787772474761575e-05, + "loss": 0.4899, + "num_input_tokens_seen": 10702176, + "step": 18445 + }, + { + "epoch": 2.7479892761394105, + "grad_norm": 7.048482418060303, + "learning_rate": 4.978734976389732e-05, + "loss": 0.4401, + "num_input_tokens_seen": 10705184, + "step": 18450 + }, + { + "epoch": 2.7487339886803692, + "grad_norm": 10.062297821044922, + "learning_rate": 4.9786926634275964e-05, + "loss": 0.6618, + "num_input_tokens_seen": 10708352, + "step": 18455 + }, + { + "epoch": 2.7494787012213284, + "grad_norm": 9.9910249710083, + "learning_rate": 4.978650308590469e-05, + "loss": 0.6353, + "num_input_tokens_seen": 10711136, + "step": 18460 + }, + { + "epoch": 2.7502234137622876, + "grad_norm": 4.925377368927002, + "learning_rate": 4.9786079118790635e-05, + "loss": 0.4731, + "num_input_tokens_seen": 10714016, + "step": 18465 + }, + { + "epoch": 2.750968126303247, + "grad_norm": 9.763704299926758, + "learning_rate": 4.9785654732940964e-05, + "loss": 0.4875, + "num_input_tokens_seen": 10716896, + "step": 18470 + }, + { + "epoch": 2.751712838844206, + "grad_norm": 12.471611976623535, + "learning_rate": 4.9785229928362854e-05, + "loss": 0.6284, + "num_input_tokens_seen": 10719744, + "step": 18475 + }, + { + "epoch": 2.7524575513851652, + "grad_norm": 7.258773326873779, + "learning_rate": 4.9784804705063465e-05, + "loss": 0.4835, + "num_input_tokens_seen": 10722752, + "step": 18480 + }, + { + "epoch": 2.7532022639261244, + "grad_norm": 11.10689640045166, + "learning_rate": 4.978437906304999e-05, + "loss": 0.4062, + "num_input_tokens_seen": 10725696, + "step": 18485 + }, + { + "epoch": 2.7539469764670836, + "grad_norm": 12.007857322692871, + "learning_rate": 4.978395300232963e-05, + "loss": 0.6947, + "num_input_tokens_seen": 10728384, + "step": 18490 + }, + { + "epoch": 2.754691689008043, + "grad_norm": 7.883004188537598, + "learning_rate": 4.978352652290956e-05, + "loss": 0.5342, + "num_input_tokens_seen": 10731520, + "step": 18495 + }, + { + "epoch": 2.755436401549002, + "grad_norm": 9.065021514892578, + "learning_rate": 4.978309962479701e-05, + "loss": 0.7218, + "num_input_tokens_seen": 10734688, + "step": 18500 + }, + { + "epoch": 2.7561811140899612, + "grad_norm": 6.648543834686279, + "learning_rate": 4.978267230799918e-05, + "loss": 0.7283, + "num_input_tokens_seen": 10737728, + "step": 18505 + }, + { + "epoch": 2.7569258266309205, + "grad_norm": 5.324173927307129, + "learning_rate": 4.9782244572523284e-05, + "loss": 0.7196, + "num_input_tokens_seen": 10740416, + "step": 18510 + }, + { + "epoch": 2.7576705391718797, + "grad_norm": 3.5620462894439697, + "learning_rate": 4.978181641837656e-05, + "loss": 0.5288, + "num_input_tokens_seen": 10743168, + "step": 18515 + }, + { + "epoch": 2.758415251712839, + "grad_norm": 6.0060930252075195, + "learning_rate": 4.978138784556623e-05, + "loss": 0.7543, + "num_input_tokens_seen": 10746240, + "step": 18520 + }, + { + "epoch": 2.759159964253798, + "grad_norm": 7.278413772583008, + "learning_rate": 4.9780958854099535e-05, + "loss": 0.7416, + "num_input_tokens_seen": 10749152, + "step": 18525 + }, + { + "epoch": 2.7599046767947573, + "grad_norm": 6.172918319702148, + "learning_rate": 4.978052944398373e-05, + "loss": 0.4558, + "num_input_tokens_seen": 10752288, + "step": 18530 + }, + { + "epoch": 2.7606493893357165, + "grad_norm": 7.57103157043457, + "learning_rate": 4.978009961522607e-05, + "loss": 0.6409, + "num_input_tokens_seen": 10755168, + "step": 18535 + }, + { + "epoch": 2.7613941018766757, + "grad_norm": 7.115747928619385, + "learning_rate": 4.9779669367833804e-05, + "loss": 0.4633, + "num_input_tokens_seen": 10758080, + "step": 18540 + }, + { + "epoch": 2.762138814417635, + "grad_norm": 8.017455101013184, + "learning_rate": 4.9779238701814214e-05, + "loss": 0.4235, + "num_input_tokens_seen": 10761120, + "step": 18545 + }, + { + "epoch": 2.762883526958594, + "grad_norm": 4.96358585357666, + "learning_rate": 4.977880761717457e-05, + "loss": 0.6073, + "num_input_tokens_seen": 10764064, + "step": 18550 + }, + { + "epoch": 2.7636282394995533, + "grad_norm": 6.52899694442749, + "learning_rate": 4.977837611392216e-05, + "loss": 0.4589, + "num_input_tokens_seen": 10766784, + "step": 18555 + }, + { + "epoch": 2.7643729520405125, + "grad_norm": 4.657859802246094, + "learning_rate": 4.9777944192064264e-05, + "loss": 0.8117, + "num_input_tokens_seen": 10769664, + "step": 18560 + }, + { + "epoch": 2.7651176645814717, + "grad_norm": 4.179421424865723, + "learning_rate": 4.9777511851608185e-05, + "loss": 0.429, + "num_input_tokens_seen": 10772544, + "step": 18565 + }, + { + "epoch": 2.765862377122431, + "grad_norm": 7.278858661651611, + "learning_rate": 4.9777079092561224e-05, + "loss": 0.5182, + "num_input_tokens_seen": 10775648, + "step": 18570 + }, + { + "epoch": 2.76660708966339, + "grad_norm": 6.271049976348877, + "learning_rate": 4.97766459149307e-05, + "loss": 0.4135, + "num_input_tokens_seen": 10778368, + "step": 18575 + }, + { + "epoch": 2.7673518022043493, + "grad_norm": 10.653107643127441, + "learning_rate": 4.977621231872392e-05, + "loss": 0.5678, + "num_input_tokens_seen": 10781216, + "step": 18580 + }, + { + "epoch": 2.7680965147453085, + "grad_norm": 8.27678394317627, + "learning_rate": 4.977577830394822e-05, + "loss": 0.5597, + "num_input_tokens_seen": 10783904, + "step": 18585 + }, + { + "epoch": 2.7688412272862672, + "grad_norm": 4.0806193351745605, + "learning_rate": 4.977534387061091e-05, + "loss": 0.5736, + "num_input_tokens_seen": 10786752, + "step": 18590 + }, + { + "epoch": 2.769585939827227, + "grad_norm": 10.819046974182129, + "learning_rate": 4.977490901871936e-05, + "loss": 0.5481, + "num_input_tokens_seen": 10789760, + "step": 18595 + }, + { + "epoch": 2.7703306523681857, + "grad_norm": 12.25411319732666, + "learning_rate": 4.97744737482809e-05, + "loss": 0.4372, + "num_input_tokens_seen": 10792640, + "step": 18600 + }, + { + "epoch": 2.7710753649091453, + "grad_norm": 12.331649780273438, + "learning_rate": 4.977403805930288e-05, + "loss": 0.7075, + "num_input_tokens_seen": 10795392, + "step": 18605 + }, + { + "epoch": 2.771820077450104, + "grad_norm": 8.712668418884277, + "learning_rate": 4.977360195179268e-05, + "loss": 0.497, + "num_input_tokens_seen": 10798368, + "step": 18610 + }, + { + "epoch": 2.7725647899910637, + "grad_norm": 7.9518842697143555, + "learning_rate": 4.9773165425757646e-05, + "loss": 0.5405, + "num_input_tokens_seen": 10801248, + "step": 18615 + }, + { + "epoch": 2.7733095025320225, + "grad_norm": 7.112497329711914, + "learning_rate": 4.977272848120516e-05, + "loss": 0.5704, + "num_input_tokens_seen": 10803936, + "step": 18620 + }, + { + "epoch": 2.7740542150729817, + "grad_norm": 7.765688896179199, + "learning_rate": 4.9772291118142604e-05, + "loss": 0.3579, + "num_input_tokens_seen": 10807008, + "step": 18625 + }, + { + "epoch": 2.774798927613941, + "grad_norm": 5.678055763244629, + "learning_rate": 4.9771853336577366e-05, + "loss": 0.4661, + "num_input_tokens_seen": 10809952, + "step": 18630 + }, + { + "epoch": 2.7755436401549, + "grad_norm": 11.108316421508789, + "learning_rate": 4.9771415136516846e-05, + "loss": 0.5666, + "num_input_tokens_seen": 10813120, + "step": 18635 + }, + { + "epoch": 2.7762883526958593, + "grad_norm": 9.368971824645996, + "learning_rate": 4.977097651796844e-05, + "loss": 0.4815, + "num_input_tokens_seen": 10816000, + "step": 18640 + }, + { + "epoch": 2.7770330652368185, + "grad_norm": 4.599431991577148, + "learning_rate": 4.977053748093956e-05, + "loss": 0.3287, + "num_input_tokens_seen": 10818592, + "step": 18645 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 8.109158515930176, + "learning_rate": 4.9770098025437634e-05, + "loss": 0.6609, + "num_input_tokens_seen": 10821312, + "step": 18650 + }, + { + "epoch": 2.778522490318737, + "grad_norm": 15.95436954498291, + "learning_rate": 4.9769658151470075e-05, + "loss": 0.5519, + "num_input_tokens_seen": 10824128, + "step": 18655 + }, + { + "epoch": 2.779267202859696, + "grad_norm": 14.845227241516113, + "learning_rate": 4.976921785904431e-05, + "loss": 0.7538, + "num_input_tokens_seen": 10827104, + "step": 18660 + }, + { + "epoch": 2.7800119154006553, + "grad_norm": 4.919472694396973, + "learning_rate": 4.976877714816779e-05, + "loss": 0.3844, + "num_input_tokens_seen": 10829888, + "step": 18665 + }, + { + "epoch": 2.7807566279416145, + "grad_norm": 13.519367218017578, + "learning_rate": 4.976833601884795e-05, + "loss": 0.5896, + "num_input_tokens_seen": 10832896, + "step": 18670 + }, + { + "epoch": 2.7815013404825737, + "grad_norm": 11.73331356048584, + "learning_rate": 4.9767894471092246e-05, + "loss": 0.4544, + "num_input_tokens_seen": 10835776, + "step": 18675 + }, + { + "epoch": 2.782246053023533, + "grad_norm": 3.7773537635803223, + "learning_rate": 4.9767452504908143e-05, + "loss": 0.4656, + "num_input_tokens_seen": 10838784, + "step": 18680 + }, + { + "epoch": 2.782990765564492, + "grad_norm": 12.281787872314453, + "learning_rate": 4.9767010120303094e-05, + "loss": 0.3597, + "num_input_tokens_seen": 10841440, + "step": 18685 + }, + { + "epoch": 2.7837354781054513, + "grad_norm": 7.987802982330322, + "learning_rate": 4.9766567317284585e-05, + "loss": 0.533, + "num_input_tokens_seen": 10844480, + "step": 18690 + }, + { + "epoch": 2.7844801906464105, + "grad_norm": 14.071751594543457, + "learning_rate": 4.976612409586009e-05, + "loss": 0.6457, + "num_input_tokens_seen": 10847488, + "step": 18695 + }, + { + "epoch": 2.7852249031873697, + "grad_norm": 3.5013787746429443, + "learning_rate": 4.9765680456037106e-05, + "loss": 0.4687, + "num_input_tokens_seen": 10850304, + "step": 18700 + }, + { + "epoch": 2.785969615728329, + "grad_norm": 9.010144233703613, + "learning_rate": 4.976523639782312e-05, + "loss": 0.8256, + "num_input_tokens_seen": 10852896, + "step": 18705 + }, + { + "epoch": 2.786714328269288, + "grad_norm": 5.625434398651123, + "learning_rate": 4.976479192122563e-05, + "loss": 0.2833, + "num_input_tokens_seen": 10855584, + "step": 18710 + }, + { + "epoch": 2.7874590408102473, + "grad_norm": 9.628339767456055, + "learning_rate": 4.9764347026252156e-05, + "loss": 0.4074, + "num_input_tokens_seen": 10858688, + "step": 18715 + }, + { + "epoch": 2.7882037533512065, + "grad_norm": 4.9228596687316895, + "learning_rate": 4.97639017129102e-05, + "loss": 0.4757, + "num_input_tokens_seen": 10861728, + "step": 18720 + }, + { + "epoch": 2.7889484658921657, + "grad_norm": 7.328104496002197, + "learning_rate": 4.9763455981207305e-05, + "loss": 0.6437, + "num_input_tokens_seen": 10864576, + "step": 18725 + }, + { + "epoch": 2.789693178433125, + "grad_norm": 13.72642993927002, + "learning_rate": 4.976300983115099e-05, + "loss": 0.5131, + "num_input_tokens_seen": 10867296, + "step": 18730 + }, + { + "epoch": 2.790437890974084, + "grad_norm": 7.859494209289551, + "learning_rate": 4.976256326274878e-05, + "loss": 0.5343, + "num_input_tokens_seen": 10870336, + "step": 18735 + }, + { + "epoch": 2.7911826035150433, + "grad_norm": 7.375358581542969, + "learning_rate": 4.976211627600823e-05, + "loss": 0.3482, + "num_input_tokens_seen": 10873184, + "step": 18740 + }, + { + "epoch": 2.7919273160560025, + "grad_norm": 10.407639503479004, + "learning_rate": 4.976166887093691e-05, + "loss": 0.4954, + "num_input_tokens_seen": 10876128, + "step": 18745 + }, + { + "epoch": 2.7926720285969617, + "grad_norm": 13.021106719970703, + "learning_rate": 4.976122104754235e-05, + "loss": 0.5795, + "num_input_tokens_seen": 10879104, + "step": 18750 + }, + { + "epoch": 2.7934167411379205, + "grad_norm": 13.267992973327637, + "learning_rate": 4.976077280583212e-05, + "loss": 0.8076, + "num_input_tokens_seen": 10881920, + "step": 18755 + }, + { + "epoch": 2.79416145367888, + "grad_norm": 5.345602512359619, + "learning_rate": 4.9760324145813806e-05, + "loss": 0.6285, + "num_input_tokens_seen": 10884960, + "step": 18760 + }, + { + "epoch": 2.794906166219839, + "grad_norm": 14.891082763671875, + "learning_rate": 4.975987506749499e-05, + "loss": 0.4184, + "num_input_tokens_seen": 10887936, + "step": 18765 + }, + { + "epoch": 2.7956508787607985, + "grad_norm": 5.036073207855225, + "learning_rate": 4.975942557088324e-05, + "loss": 0.6723, + "num_input_tokens_seen": 10890976, + "step": 18770 + }, + { + "epoch": 2.7963955913017573, + "grad_norm": 10.977106094360352, + "learning_rate": 4.9758975655986164e-05, + "loss": 0.7899, + "num_input_tokens_seen": 10893920, + "step": 18775 + }, + { + "epoch": 2.797140303842717, + "grad_norm": 3.918970823287964, + "learning_rate": 4.975852532281135e-05, + "loss": 0.4976, + "num_input_tokens_seen": 10896672, + "step": 18780 + }, + { + "epoch": 2.7978850163836757, + "grad_norm": 7.410213947296143, + "learning_rate": 4.975807457136642e-05, + "loss": 0.4647, + "num_input_tokens_seen": 10900160, + "step": 18785 + }, + { + "epoch": 2.7986297289246354, + "grad_norm": 10.628881454467773, + "learning_rate": 4.975762340165898e-05, + "loss": 0.4343, + "num_input_tokens_seen": 10903008, + "step": 18790 + }, + { + "epoch": 2.799374441465594, + "grad_norm": 4.028867721557617, + "learning_rate": 4.975717181369666e-05, + "loss": 0.6006, + "num_input_tokens_seen": 10905760, + "step": 18795 + }, + { + "epoch": 2.8001191540065533, + "grad_norm": 5.40278434753418, + "learning_rate": 4.9756719807487076e-05, + "loss": 0.4571, + "num_input_tokens_seen": 10908608, + "step": 18800 + }, + { + "epoch": 2.8008638665475125, + "grad_norm": 7.720542907714844, + "learning_rate": 4.975626738303788e-05, + "loss": 0.3378, + "num_input_tokens_seen": 10911584, + "step": 18805 + }, + { + "epoch": 2.8016085790884717, + "grad_norm": 15.063467025756836, + "learning_rate": 4.975581454035671e-05, + "loss": 0.4003, + "num_input_tokens_seen": 10914432, + "step": 18810 + }, + { + "epoch": 2.802353291629431, + "grad_norm": 9.561576843261719, + "learning_rate": 4.975536127945121e-05, + "loss": 0.7365, + "num_input_tokens_seen": 10917344, + "step": 18815 + }, + { + "epoch": 2.80309800417039, + "grad_norm": 11.003007888793945, + "learning_rate": 4.975490760032904e-05, + "loss": 0.6448, + "num_input_tokens_seen": 10920256, + "step": 18820 + }, + { + "epoch": 2.8038427167113493, + "grad_norm": 8.727051734924316, + "learning_rate": 4.975445350299787e-05, + "loss": 0.4473, + "num_input_tokens_seen": 10923040, + "step": 18825 + }, + { + "epoch": 2.8045874292523085, + "grad_norm": 5.636202812194824, + "learning_rate": 4.975399898746536e-05, + "loss": 0.4239, + "num_input_tokens_seen": 10926144, + "step": 18830 + }, + { + "epoch": 2.8053321417932677, + "grad_norm": 4.86838960647583, + "learning_rate": 4.9753544053739197e-05, + "loss": 0.3341, + "num_input_tokens_seen": 10928992, + "step": 18835 + }, + { + "epoch": 2.806076854334227, + "grad_norm": 5.220808982849121, + "learning_rate": 4.975308870182707e-05, + "loss": 0.6267, + "num_input_tokens_seen": 10931872, + "step": 18840 + }, + { + "epoch": 2.806821566875186, + "grad_norm": 8.974771499633789, + "learning_rate": 4.9752632931736665e-05, + "loss": 0.6949, + "num_input_tokens_seen": 10934848, + "step": 18845 + }, + { + "epoch": 2.8075662794161453, + "grad_norm": 6.210578441619873, + "learning_rate": 4.9752176743475684e-05, + "loss": 0.6, + "num_input_tokens_seen": 10937792, + "step": 18850 + }, + { + "epoch": 2.8083109919571045, + "grad_norm": 4.206488609313965, + "learning_rate": 4.9751720137051836e-05, + "loss": 0.3959, + "num_input_tokens_seen": 10940960, + "step": 18855 + }, + { + "epoch": 2.8090557044980637, + "grad_norm": 1.9644484519958496, + "learning_rate": 4.9751263112472834e-05, + "loss": 0.3405, + "num_input_tokens_seen": 10943616, + "step": 18860 + }, + { + "epoch": 2.809800417039023, + "grad_norm": 6.830678462982178, + "learning_rate": 4.9750805669746395e-05, + "loss": 0.6584, + "num_input_tokens_seen": 10946720, + "step": 18865 + }, + { + "epoch": 2.810545129579982, + "grad_norm": 19.285953521728516, + "learning_rate": 4.975034780888025e-05, + "loss": 0.4886, + "num_input_tokens_seen": 10949376, + "step": 18870 + }, + { + "epoch": 2.8112898421209414, + "grad_norm": 6.981959819793701, + "learning_rate": 4.9749889529882134e-05, + "loss": 0.3787, + "num_input_tokens_seen": 10952128, + "step": 18875 + }, + { + "epoch": 2.8120345546619006, + "grad_norm": 8.709568977355957, + "learning_rate": 4.974943083275979e-05, + "loss": 0.5644, + "num_input_tokens_seen": 10954880, + "step": 18880 + }, + { + "epoch": 2.8127792672028598, + "grad_norm": 7.976146221160889, + "learning_rate": 4.974897171752097e-05, + "loss": 0.5682, + "num_input_tokens_seen": 10957696, + "step": 18885 + }, + { + "epoch": 2.813523979743819, + "grad_norm": 8.130324363708496, + "learning_rate": 4.9748512184173416e-05, + "loss": 0.5114, + "num_input_tokens_seen": 10960736, + "step": 18890 + }, + { + "epoch": 2.814268692284778, + "grad_norm": 5.346437931060791, + "learning_rate": 4.9748052232724905e-05, + "loss": 0.6877, + "num_input_tokens_seen": 10963424, + "step": 18895 + }, + { + "epoch": 2.8150134048257374, + "grad_norm": 5.235092639923096, + "learning_rate": 4.974759186318321e-05, + "loss": 0.5183, + "num_input_tokens_seen": 10966272, + "step": 18900 + }, + { + "epoch": 2.8157581173666966, + "grad_norm": 8.026276588439941, + "learning_rate": 4.97471310755561e-05, + "loss": 0.4, + "num_input_tokens_seen": 10969504, + "step": 18905 + }, + { + "epoch": 2.8165028299076558, + "grad_norm": 7.8419270515441895, + "learning_rate": 4.974666986985136e-05, + "loss": 0.5237, + "num_input_tokens_seen": 10972704, + "step": 18910 + }, + { + "epoch": 2.817247542448615, + "grad_norm": 14.326380729675293, + "learning_rate": 4.974620824607679e-05, + "loss": 0.674, + "num_input_tokens_seen": 10975776, + "step": 18915 + }, + { + "epoch": 2.817992254989574, + "grad_norm": 8.844779014587402, + "learning_rate": 4.9745746204240175e-05, + "loss": 0.3213, + "num_input_tokens_seen": 10978432, + "step": 18920 + }, + { + "epoch": 2.8187369675305334, + "grad_norm": 8.155999183654785, + "learning_rate": 4.974528374434934e-05, + "loss": 0.7046, + "num_input_tokens_seen": 10981568, + "step": 18925 + }, + { + "epoch": 2.819481680071492, + "grad_norm": 5.543934345245361, + "learning_rate": 4.974482086641207e-05, + "loss": 0.5364, + "num_input_tokens_seen": 10984352, + "step": 18930 + }, + { + "epoch": 2.820226392612452, + "grad_norm": 8.177021026611328, + "learning_rate": 4.974435757043621e-05, + "loss": 0.4134, + "num_input_tokens_seen": 10987168, + "step": 18935 + }, + { + "epoch": 2.8209711051534105, + "grad_norm": 4.898314952850342, + "learning_rate": 4.974389385642958e-05, + "loss": 0.4204, + "num_input_tokens_seen": 10990144, + "step": 18940 + }, + { + "epoch": 2.82171581769437, + "grad_norm": 10.62987232208252, + "learning_rate": 4.9743429724400007e-05, + "loss": 0.4816, + "num_input_tokens_seen": 10993056, + "step": 18945 + }, + { + "epoch": 2.822460530235329, + "grad_norm": 2.7902607917785645, + "learning_rate": 4.974296517435534e-05, + "loss": 0.4121, + "num_input_tokens_seen": 10996064, + "step": 18950 + }, + { + "epoch": 2.8232052427762886, + "grad_norm": 11.397307395935059, + "learning_rate": 4.974250020630342e-05, + "loss": 0.7313, + "num_input_tokens_seen": 10999008, + "step": 18955 + }, + { + "epoch": 2.8239499553172474, + "grad_norm": 4.581914901733398, + "learning_rate": 4.9742034820252116e-05, + "loss": 0.3501, + "num_input_tokens_seen": 11001824, + "step": 18960 + }, + { + "epoch": 2.824694667858207, + "grad_norm": 6.077952861785889, + "learning_rate": 4.974156901620927e-05, + "loss": 0.3354, + "num_input_tokens_seen": 11004896, + "step": 18965 + }, + { + "epoch": 2.8254393803991658, + "grad_norm": 9.703363418579102, + "learning_rate": 4.974110279418277e-05, + "loss": 0.4808, + "num_input_tokens_seen": 11007712, + "step": 18970 + }, + { + "epoch": 2.826184092940125, + "grad_norm": 16.93855857849121, + "learning_rate": 4.9740636154180476e-05, + "loss": 0.4548, + "num_input_tokens_seen": 11010656, + "step": 18975 + }, + { + "epoch": 2.826928805481084, + "grad_norm": 11.216774940490723, + "learning_rate": 4.974016909621029e-05, + "loss": 0.6048, + "num_input_tokens_seen": 11013536, + "step": 18980 + }, + { + "epoch": 2.8276735180220434, + "grad_norm": 7.038873672485352, + "learning_rate": 4.9739701620280076e-05, + "loss": 0.5195, + "num_input_tokens_seen": 11016224, + "step": 18985 + }, + { + "epoch": 2.8284182305630026, + "grad_norm": 10.929656028747559, + "learning_rate": 4.973923372639776e-05, + "loss": 0.6236, + "num_input_tokens_seen": 11019328, + "step": 18990 + }, + { + "epoch": 2.8291629431039618, + "grad_norm": 5.878871440887451, + "learning_rate": 4.973876541457123e-05, + "loss": 0.6282, + "num_input_tokens_seen": 11022208, + "step": 18995 + }, + { + "epoch": 2.829907655644921, + "grad_norm": 5.4845805168151855, + "learning_rate": 4.97382966848084e-05, + "loss": 0.3295, + "num_input_tokens_seen": 11025568, + "step": 19000 + }, + { + "epoch": 2.83065236818588, + "grad_norm": 5.9545369148254395, + "learning_rate": 4.9737827537117196e-05, + "loss": 0.7047, + "num_input_tokens_seen": 11028384, + "step": 19005 + }, + { + "epoch": 2.8313970807268394, + "grad_norm": 10.201156616210938, + "learning_rate": 4.973735797150553e-05, + "loss": 0.5296, + "num_input_tokens_seen": 11031328, + "step": 19010 + }, + { + "epoch": 2.8321417932677986, + "grad_norm": 11.007808685302734, + "learning_rate": 4.973688798798135e-05, + "loss": 0.8511, + "num_input_tokens_seen": 11034464, + "step": 19015 + }, + { + "epoch": 2.832886505808758, + "grad_norm": 5.789057731628418, + "learning_rate": 4.973641758655259e-05, + "loss": 0.3971, + "num_input_tokens_seen": 11037216, + "step": 19020 + }, + { + "epoch": 2.833631218349717, + "grad_norm": 7.307093620300293, + "learning_rate": 4.973594676722719e-05, + "loss": 0.4703, + "num_input_tokens_seen": 11040384, + "step": 19025 + }, + { + "epoch": 2.834375930890676, + "grad_norm": 4.882385730743408, + "learning_rate": 4.973547553001311e-05, + "loss": 0.2792, + "num_input_tokens_seen": 11043424, + "step": 19030 + }, + { + "epoch": 2.8351206434316354, + "grad_norm": 18.204233169555664, + "learning_rate": 4.9735003874918314e-05, + "loss": 0.6656, + "num_input_tokens_seen": 11046336, + "step": 19035 + }, + { + "epoch": 2.8358653559725946, + "grad_norm": 11.852654457092285, + "learning_rate": 4.9734531801950765e-05, + "loss": 0.5076, + "num_input_tokens_seen": 11049216, + "step": 19040 + }, + { + "epoch": 2.836610068513554, + "grad_norm": 7.9820356369018555, + "learning_rate": 4.9734059311118444e-05, + "loss": 0.5866, + "num_input_tokens_seen": 11052064, + "step": 19045 + }, + { + "epoch": 2.837354781054513, + "grad_norm": 9.617956161499023, + "learning_rate": 4.973358640242932e-05, + "loss": 0.4279, + "num_input_tokens_seen": 11054976, + "step": 19050 + }, + { + "epoch": 2.838099493595472, + "grad_norm": 4.827209949493408, + "learning_rate": 4.97331130758914e-05, + "loss": 0.4797, + "num_input_tokens_seen": 11057984, + "step": 19055 + }, + { + "epoch": 2.8388442061364314, + "grad_norm": 7.384057998657227, + "learning_rate": 4.9732639331512675e-05, + "loss": 0.5841, + "num_input_tokens_seen": 11060768, + "step": 19060 + }, + { + "epoch": 2.8395889186773906, + "grad_norm": 8.478850364685059, + "learning_rate": 4.973216516930114e-05, + "loss": 0.5458, + "num_input_tokens_seen": 11063744, + "step": 19065 + }, + { + "epoch": 2.84033363121835, + "grad_norm": 3.4477572441101074, + "learning_rate": 4.973169058926481e-05, + "loss": 0.5176, + "num_input_tokens_seen": 11066592, + "step": 19070 + }, + { + "epoch": 2.841078343759309, + "grad_norm": 8.598390579223633, + "learning_rate": 4.973121559141171e-05, + "loss": 0.5237, + "num_input_tokens_seen": 11069280, + "step": 19075 + }, + { + "epoch": 2.841823056300268, + "grad_norm": 18.02614402770996, + "learning_rate": 4.9730740175749854e-05, + "loss": 0.5305, + "num_input_tokens_seen": 11072384, + "step": 19080 + }, + { + "epoch": 2.8425677688412274, + "grad_norm": 8.1873779296875, + "learning_rate": 4.973026434228728e-05, + "loss": 0.608, + "num_input_tokens_seen": 11075072, + "step": 19085 + }, + { + "epoch": 2.8433124813821866, + "grad_norm": 3.8390066623687744, + "learning_rate": 4.972978809103202e-05, + "loss": 0.5388, + "num_input_tokens_seen": 11077856, + "step": 19090 + }, + { + "epoch": 2.844057193923146, + "grad_norm": 6.613680839538574, + "learning_rate": 4.972931142199213e-05, + "loss": 0.4908, + "num_input_tokens_seen": 11080672, + "step": 19095 + }, + { + "epoch": 2.844801906464105, + "grad_norm": 13.875588417053223, + "learning_rate": 4.972883433517566e-05, + "loss": 0.4428, + "num_input_tokens_seen": 11083360, + "step": 19100 + }, + { + "epoch": 2.845546619005064, + "grad_norm": 12.93568229675293, + "learning_rate": 4.972835683059065e-05, + "loss": 0.4801, + "num_input_tokens_seen": 11086464, + "step": 19105 + }, + { + "epoch": 2.8462913315460234, + "grad_norm": 5.822288513183594, + "learning_rate": 4.97278789082452e-05, + "loss": 0.6519, + "num_input_tokens_seen": 11089504, + "step": 19110 + }, + { + "epoch": 2.847036044086982, + "grad_norm": 8.065295219421387, + "learning_rate": 4.9727400568147364e-05, + "loss": 0.8126, + "num_input_tokens_seen": 11092288, + "step": 19115 + }, + { + "epoch": 2.847780756627942, + "grad_norm": 9.535904884338379, + "learning_rate": 4.972692181030523e-05, + "loss": 0.5534, + "num_input_tokens_seen": 11095136, + "step": 19120 + }, + { + "epoch": 2.8485254691689006, + "grad_norm": 8.251324653625488, + "learning_rate": 4.972644263472688e-05, + "loss": 0.6475, + "num_input_tokens_seen": 11098112, + "step": 19125 + }, + { + "epoch": 2.8492701817098602, + "grad_norm": 1.637345790863037, + "learning_rate": 4.972596304142041e-05, + "loss": 0.4838, + "num_input_tokens_seen": 11100992, + "step": 19130 + }, + { + "epoch": 2.850014894250819, + "grad_norm": 5.584628582000732, + "learning_rate": 4.9725483030393924e-05, + "loss": 0.3927, + "num_input_tokens_seen": 11104000, + "step": 19135 + }, + { + "epoch": 2.8507596067917786, + "grad_norm": 5.065086841583252, + "learning_rate": 4.972500260165555e-05, + "loss": 0.7173, + "num_input_tokens_seen": 11106976, + "step": 19140 + }, + { + "epoch": 2.8515043193327374, + "grad_norm": 12.781935691833496, + "learning_rate": 4.972452175521337e-05, + "loss": 0.5246, + "num_input_tokens_seen": 11109728, + "step": 19145 + }, + { + "epoch": 2.8522490318736966, + "grad_norm": 5.3051347732543945, + "learning_rate": 4.972404049107552e-05, + "loss": 0.565, + "num_input_tokens_seen": 11112576, + "step": 19150 + }, + { + "epoch": 2.852993744414656, + "grad_norm": 1.846602439880371, + "learning_rate": 4.972355880925014e-05, + "loss": 0.5809, + "num_input_tokens_seen": 11115328, + "step": 19155 + }, + { + "epoch": 2.853738456955615, + "grad_norm": 8.318689346313477, + "learning_rate": 4.9723076709745365e-05, + "loss": 0.8534, + "num_input_tokens_seen": 11118528, + "step": 19160 + }, + { + "epoch": 2.854483169496574, + "grad_norm": 4.432710647583008, + "learning_rate": 4.972259419256933e-05, + "loss": 0.4829, + "num_input_tokens_seen": 11121216, + "step": 19165 + }, + { + "epoch": 2.8552278820375334, + "grad_norm": 6.7560930252075195, + "learning_rate": 4.97221112577302e-05, + "loss": 0.4178, + "num_input_tokens_seen": 11123904, + "step": 19170 + }, + { + "epoch": 2.8559725945784926, + "grad_norm": 15.795478820800781, + "learning_rate": 4.972162790523612e-05, + "loss": 0.4659, + "num_input_tokens_seen": 11126752, + "step": 19175 + }, + { + "epoch": 2.856717307119452, + "grad_norm": 10.49484634399414, + "learning_rate": 4.9721144135095265e-05, + "loss": 0.503, + "num_input_tokens_seen": 11129888, + "step": 19180 + }, + { + "epoch": 2.857462019660411, + "grad_norm": 8.34773063659668, + "learning_rate": 4.9720659947315815e-05, + "loss": 0.6383, + "num_input_tokens_seen": 11132608, + "step": 19185 + }, + { + "epoch": 2.8582067322013702, + "grad_norm": 11.872075080871582, + "learning_rate": 4.972017534190593e-05, + "loss": 0.5022, + "num_input_tokens_seen": 11135392, + "step": 19190 + }, + { + "epoch": 2.8589514447423294, + "grad_norm": 2.593200922012329, + "learning_rate": 4.971969031887381e-05, + "loss": 0.5808, + "num_input_tokens_seen": 11138720, + "step": 19195 + }, + { + "epoch": 2.8596961572832886, + "grad_norm": 8.563961029052734, + "learning_rate": 4.971920487822764e-05, + "loss": 0.4819, + "num_input_tokens_seen": 11141504, + "step": 19200 + }, + { + "epoch": 2.860440869824248, + "grad_norm": 8.693145751953125, + "learning_rate": 4.971871901997563e-05, + "loss": 0.8271, + "num_input_tokens_seen": 11144352, + "step": 19205 + }, + { + "epoch": 2.861185582365207, + "grad_norm": 1.7105870246887207, + "learning_rate": 4.9718232744125995e-05, + "loss": 0.434, + "num_input_tokens_seen": 11147424, + "step": 19210 + }, + { + "epoch": 2.8619302949061662, + "grad_norm": 4.7051496505737305, + "learning_rate": 4.9717746050686925e-05, + "loss": 0.4396, + "num_input_tokens_seen": 11150400, + "step": 19215 + }, + { + "epoch": 2.8626750074471254, + "grad_norm": 9.717394828796387, + "learning_rate": 4.9717258939666663e-05, + "loss": 0.4082, + "num_input_tokens_seen": 11153344, + "step": 19220 + }, + { + "epoch": 2.8634197199880846, + "grad_norm": 4.502432823181152, + "learning_rate": 4.9716771411073436e-05, + "loss": 0.548, + "num_input_tokens_seen": 11156032, + "step": 19225 + }, + { + "epoch": 2.864164432529044, + "grad_norm": 5.556743621826172, + "learning_rate": 4.9716283464915484e-05, + "loss": 0.4339, + "num_input_tokens_seen": 11158720, + "step": 19230 + }, + { + "epoch": 2.864909145070003, + "grad_norm": 7.195456504821777, + "learning_rate": 4.9715795101201025e-05, + "loss": 0.4961, + "num_input_tokens_seen": 11161440, + "step": 19235 + }, + { + "epoch": 2.8656538576109623, + "grad_norm": 8.774287223815918, + "learning_rate": 4.9715306319938335e-05, + "loss": 0.586, + "num_input_tokens_seen": 11164256, + "step": 19240 + }, + { + "epoch": 2.8663985701519215, + "grad_norm": 9.39794635772705, + "learning_rate": 4.971481712113567e-05, + "loss": 0.4882, + "num_input_tokens_seen": 11167200, + "step": 19245 + }, + { + "epoch": 2.8671432826928807, + "grad_norm": 6.307684421539307, + "learning_rate": 4.9714327504801286e-05, + "loss": 0.5609, + "num_input_tokens_seen": 11170208, + "step": 19250 + }, + { + "epoch": 2.86788799523384, + "grad_norm": 14.023735046386719, + "learning_rate": 4.971383747094346e-05, + "loss": 0.5849, + "num_input_tokens_seen": 11173056, + "step": 19255 + }, + { + "epoch": 2.868632707774799, + "grad_norm": 4.484900951385498, + "learning_rate": 4.9713347019570465e-05, + "loss": 0.6181, + "num_input_tokens_seen": 11175968, + "step": 19260 + }, + { + "epoch": 2.8693774203157583, + "grad_norm": 4.741603374481201, + "learning_rate": 4.971285615069059e-05, + "loss": 0.3472, + "num_input_tokens_seen": 11178880, + "step": 19265 + }, + { + "epoch": 2.8701221328567175, + "grad_norm": 8.515109062194824, + "learning_rate": 4.9712364864312125e-05, + "loss": 0.4413, + "num_input_tokens_seen": 11181696, + "step": 19270 + }, + { + "epoch": 2.8708668453976767, + "grad_norm": 5.525561809539795, + "learning_rate": 4.9711873160443375e-05, + "loss": 0.6977, + "num_input_tokens_seen": 11184448, + "step": 19275 + }, + { + "epoch": 2.8716115579386354, + "grad_norm": 4.973440170288086, + "learning_rate": 4.971138103909264e-05, + "loss": 0.5869, + "num_input_tokens_seen": 11187776, + "step": 19280 + }, + { + "epoch": 2.872356270479595, + "grad_norm": 4.40120792388916, + "learning_rate": 4.9710888500268236e-05, + "loss": 0.5503, + "num_input_tokens_seen": 11190752, + "step": 19285 + }, + { + "epoch": 2.873100983020554, + "grad_norm": 12.275306701660156, + "learning_rate": 4.9710395543978495e-05, + "loss": 0.4353, + "num_input_tokens_seen": 11193536, + "step": 19290 + }, + { + "epoch": 2.8738456955615135, + "grad_norm": 5.297493934631348, + "learning_rate": 4.970990217023173e-05, + "loss": 0.5804, + "num_input_tokens_seen": 11196352, + "step": 19295 + }, + { + "epoch": 2.8745904081024722, + "grad_norm": 6.765635013580322, + "learning_rate": 4.9709408379036284e-05, + "loss": 0.5358, + "num_input_tokens_seen": 11199328, + "step": 19300 + }, + { + "epoch": 2.875335120643432, + "grad_norm": 5.388837814331055, + "learning_rate": 4.97089141704005e-05, + "loss": 0.532, + "num_input_tokens_seen": 11202176, + "step": 19305 + }, + { + "epoch": 2.8760798331843906, + "grad_norm": 6.82075834274292, + "learning_rate": 4.970841954433272e-05, + "loss": 0.4663, + "num_input_tokens_seen": 11204992, + "step": 19310 + }, + { + "epoch": 2.8768245457253503, + "grad_norm": 8.202371597290039, + "learning_rate": 4.97079245008413e-05, + "loss": 0.6141, + "num_input_tokens_seen": 11207616, + "step": 19315 + }, + { + "epoch": 2.877569258266309, + "grad_norm": 5.740225315093994, + "learning_rate": 4.970742903993462e-05, + "loss": 0.5539, + "num_input_tokens_seen": 11210720, + "step": 19320 + }, + { + "epoch": 2.8783139708072683, + "grad_norm": 6.7378411293029785, + "learning_rate": 4.970693316162103e-05, + "loss": 0.6133, + "num_input_tokens_seen": 11213600, + "step": 19325 + }, + { + "epoch": 2.8790586833482275, + "grad_norm": 6.425870418548584, + "learning_rate": 4.9706436865908915e-05, + "loss": 0.5852, + "num_input_tokens_seen": 11216320, + "step": 19330 + }, + { + "epoch": 2.8798033958891867, + "grad_norm": 9.478104591369629, + "learning_rate": 4.970594015280665e-05, + "loss": 0.6282, + "num_input_tokens_seen": 11219424, + "step": 19335 + }, + { + "epoch": 2.880548108430146, + "grad_norm": 4.750995635986328, + "learning_rate": 4.970544302232265e-05, + "loss": 0.5176, + "num_input_tokens_seen": 11222336, + "step": 19340 + }, + { + "epoch": 2.881292820971105, + "grad_norm": 5.574585914611816, + "learning_rate": 4.97049454744653e-05, + "loss": 0.481, + "num_input_tokens_seen": 11225376, + "step": 19345 + }, + { + "epoch": 2.8820375335120643, + "grad_norm": 8.483912467956543, + "learning_rate": 4.9704447509243e-05, + "loss": 0.5277, + "num_input_tokens_seen": 11228448, + "step": 19350 + }, + { + "epoch": 2.8827822460530235, + "grad_norm": 10.193196296691895, + "learning_rate": 4.970394912666416e-05, + "loss": 0.4784, + "num_input_tokens_seen": 11231072, + "step": 19355 + }, + { + "epoch": 2.8835269585939827, + "grad_norm": 10.77878475189209, + "learning_rate": 4.970345032673722e-05, + "loss": 0.6291, + "num_input_tokens_seen": 11234048, + "step": 19360 + }, + { + "epoch": 2.884271671134942, + "grad_norm": 4.674905300140381, + "learning_rate": 4.97029511094706e-05, + "loss": 0.4404, + "num_input_tokens_seen": 11236928, + "step": 19365 + }, + { + "epoch": 2.885016383675901, + "grad_norm": 7.244287014007568, + "learning_rate": 4.970245147487271e-05, + "loss": 0.4078, + "num_input_tokens_seen": 11240064, + "step": 19370 + }, + { + "epoch": 2.8857610962168603, + "grad_norm": 7.4827351570129395, + "learning_rate": 4.970195142295202e-05, + "loss": 0.702, + "num_input_tokens_seen": 11242656, + "step": 19375 + }, + { + "epoch": 2.8865058087578195, + "grad_norm": 11.935651779174805, + "learning_rate": 4.9701450953716965e-05, + "loss": 0.5165, + "num_input_tokens_seen": 11245792, + "step": 19380 + }, + { + "epoch": 2.8872505212987787, + "grad_norm": 10.149253845214844, + "learning_rate": 4.9700950067176e-05, + "loss": 0.4502, + "num_input_tokens_seen": 11248800, + "step": 19385 + }, + { + "epoch": 2.887995233839738, + "grad_norm": 10.396343231201172, + "learning_rate": 4.970044876333759e-05, + "loss": 0.5267, + "num_input_tokens_seen": 11251904, + "step": 19390 + }, + { + "epoch": 2.888739946380697, + "grad_norm": 5.548306941986084, + "learning_rate": 4.9699947042210196e-05, + "loss": 0.453, + "num_input_tokens_seen": 11254944, + "step": 19395 + }, + { + "epoch": 2.8894846589216563, + "grad_norm": 6.685764312744141, + "learning_rate": 4.96994449038023e-05, + "loss": 0.4415, + "num_input_tokens_seen": 11257728, + "step": 19400 + }, + { + "epoch": 2.8902293714626155, + "grad_norm": 3.6654727458953857, + "learning_rate": 4.9698942348122404e-05, + "loss": 0.5344, + "num_input_tokens_seen": 11260832, + "step": 19405 + }, + { + "epoch": 2.8909740840035747, + "grad_norm": 7.2616472244262695, + "learning_rate": 4.9698439375178965e-05, + "loss": 0.417, + "num_input_tokens_seen": 11263712, + "step": 19410 + }, + { + "epoch": 2.891718796544534, + "grad_norm": 4.640027046203613, + "learning_rate": 4.9697935984980496e-05, + "loss": 0.5589, + "num_input_tokens_seen": 11266848, + "step": 19415 + }, + { + "epoch": 2.892463509085493, + "grad_norm": 15.115915298461914, + "learning_rate": 4.96974321775355e-05, + "loss": 0.6224, + "num_input_tokens_seen": 11269440, + "step": 19420 + }, + { + "epoch": 2.8932082216264523, + "grad_norm": 10.375988006591797, + "learning_rate": 4.969692795285249e-05, + "loss": 0.6032, + "num_input_tokens_seen": 11272288, + "step": 19425 + }, + { + "epoch": 2.8939529341674115, + "grad_norm": 6.360004901885986, + "learning_rate": 4.9696423310939985e-05, + "loss": 0.4254, + "num_input_tokens_seen": 11275008, + "step": 19430 + }, + { + "epoch": 2.8946976467083707, + "grad_norm": 3.892963171005249, + "learning_rate": 4.9695918251806506e-05, + "loss": 0.5143, + "num_input_tokens_seen": 11277856, + "step": 19435 + }, + { + "epoch": 2.89544235924933, + "grad_norm": 18.889812469482422, + "learning_rate": 4.969541277546059e-05, + "loss": 0.6083, + "num_input_tokens_seen": 11280480, + "step": 19440 + }, + { + "epoch": 2.896187071790289, + "grad_norm": 5.486353397369385, + "learning_rate": 4.9694906881910776e-05, + "loss": 0.4706, + "num_input_tokens_seen": 11283360, + "step": 19445 + }, + { + "epoch": 2.8969317843312483, + "grad_norm": 10.690281867980957, + "learning_rate": 4.969440057116561e-05, + "loss": 0.4825, + "num_input_tokens_seen": 11286400, + "step": 19450 + }, + { + "epoch": 2.897676496872207, + "grad_norm": 19.34095001220703, + "learning_rate": 4.969389384323364e-05, + "loss": 0.5551, + "num_input_tokens_seen": 11289184, + "step": 19455 + }, + { + "epoch": 2.8984212094131667, + "grad_norm": 13.83873176574707, + "learning_rate": 4.969338669812343e-05, + "loss": 0.4262, + "num_input_tokens_seen": 11291712, + "step": 19460 + }, + { + "epoch": 2.8991659219541255, + "grad_norm": 48.79484176635742, + "learning_rate": 4.969287913584355e-05, + "loss": 0.5853, + "num_input_tokens_seen": 11294688, + "step": 19465 + }, + { + "epoch": 2.899910634495085, + "grad_norm": 6.702036380767822, + "learning_rate": 4.969237115640258e-05, + "loss": 0.48, + "num_input_tokens_seen": 11297440, + "step": 19470 + }, + { + "epoch": 2.900655347036044, + "grad_norm": 5.903171539306641, + "learning_rate": 4.969186275980909e-05, + "loss": 0.686, + "num_input_tokens_seen": 11300544, + "step": 19475 + }, + { + "epoch": 2.9014000595770035, + "grad_norm": 10.944589614868164, + "learning_rate": 4.969135394607167e-05, + "loss": 0.5996, + "num_input_tokens_seen": 11303552, + "step": 19480 + }, + { + "epoch": 2.9021447721179623, + "grad_norm": 11.369937896728516, + "learning_rate": 4.969084471519893e-05, + "loss": 0.5648, + "num_input_tokens_seen": 11306304, + "step": 19485 + }, + { + "epoch": 2.9028894846589215, + "grad_norm": 5.996232032775879, + "learning_rate": 4.9690335067199464e-05, + "loss": 0.5073, + "num_input_tokens_seen": 11309056, + "step": 19490 + }, + { + "epoch": 2.9036341971998807, + "grad_norm": 13.215302467346191, + "learning_rate": 4.9689825002081866e-05, + "loss": 0.5477, + "num_input_tokens_seen": 11312608, + "step": 19495 + }, + { + "epoch": 2.90437890974084, + "grad_norm": 21.703227996826172, + "learning_rate": 4.9689314519854786e-05, + "loss": 0.5609, + "num_input_tokens_seen": 11315328, + "step": 19500 + }, + { + "epoch": 2.905123622281799, + "grad_norm": 6.760852336883545, + "learning_rate": 4.968880362052682e-05, + "loss": 0.4356, + "num_input_tokens_seen": 11318208, + "step": 19505 + }, + { + "epoch": 2.9058683348227583, + "grad_norm": 8.87224006652832, + "learning_rate": 4.968829230410661e-05, + "loss": 0.5027, + "num_input_tokens_seen": 11320928, + "step": 19510 + }, + { + "epoch": 2.9066130473637175, + "grad_norm": 5.157261848449707, + "learning_rate": 4.96877805706028e-05, + "loss": 0.4261, + "num_input_tokens_seen": 11324160, + "step": 19515 + }, + { + "epoch": 2.9073577599046767, + "grad_norm": 8.105158805847168, + "learning_rate": 4.968726842002402e-05, + "loss": 0.4003, + "num_input_tokens_seen": 11326912, + "step": 19520 + }, + { + "epoch": 2.908102472445636, + "grad_norm": 5.381863117218018, + "learning_rate": 4.968675585237894e-05, + "loss": 0.3514, + "num_input_tokens_seen": 11329824, + "step": 19525 + }, + { + "epoch": 2.908847184986595, + "grad_norm": 12.75245189666748, + "learning_rate": 4.9686242867676204e-05, + "loss": 0.514, + "num_input_tokens_seen": 11332512, + "step": 19530 + }, + { + "epoch": 2.9095918975275543, + "grad_norm": 10.321676254272461, + "learning_rate": 4.968572946592448e-05, + "loss": 0.4514, + "num_input_tokens_seen": 11335360, + "step": 19535 + }, + { + "epoch": 2.9103366100685135, + "grad_norm": 14.674690246582031, + "learning_rate": 4.968521564713246e-05, + "loss": 0.5566, + "num_input_tokens_seen": 11338080, + "step": 19540 + }, + { + "epoch": 2.9110813226094727, + "grad_norm": 8.504311561584473, + "learning_rate": 4.9684701411308796e-05, + "loss": 0.4054, + "num_input_tokens_seen": 11340832, + "step": 19545 + }, + { + "epoch": 2.911826035150432, + "grad_norm": 40.81649398803711, + "learning_rate": 4.9684186758462205e-05, + "loss": 0.5533, + "num_input_tokens_seen": 11343456, + "step": 19550 + }, + { + "epoch": 2.912570747691391, + "grad_norm": 4.381656646728516, + "learning_rate": 4.968367168860136e-05, + "loss": 0.4408, + "num_input_tokens_seen": 11346176, + "step": 19555 + }, + { + "epoch": 2.9133154602323503, + "grad_norm": 14.087868690490723, + "learning_rate": 4.968315620173496e-05, + "loss": 0.619, + "num_input_tokens_seen": 11348928, + "step": 19560 + }, + { + "epoch": 2.9140601727733095, + "grad_norm": 7.480255126953125, + "learning_rate": 4.968264029787173e-05, + "loss": 0.758, + "num_input_tokens_seen": 11351520, + "step": 19565 + }, + { + "epoch": 2.9148048853142687, + "grad_norm": 14.375978469848633, + "learning_rate": 4.9682123977020385e-05, + "loss": 0.5351, + "num_input_tokens_seen": 11354464, + "step": 19570 + }, + { + "epoch": 2.915549597855228, + "grad_norm": 9.851240158081055, + "learning_rate": 4.968160723918963e-05, + "loss": 0.685, + "num_input_tokens_seen": 11357632, + "step": 19575 + }, + { + "epoch": 2.916294310396187, + "grad_norm": 8.905508995056152, + "learning_rate": 4.968109008438821e-05, + "loss": 0.4603, + "num_input_tokens_seen": 11360320, + "step": 19580 + }, + { + "epoch": 2.9170390229371463, + "grad_norm": 18.965173721313477, + "learning_rate": 4.9680572512624865e-05, + "loss": 0.6172, + "num_input_tokens_seen": 11363136, + "step": 19585 + }, + { + "epoch": 2.9177837354781055, + "grad_norm": 4.796047687530518, + "learning_rate": 4.968005452390832e-05, + "loss": 0.3626, + "num_input_tokens_seen": 11365888, + "step": 19590 + }, + { + "epoch": 2.9185284480190647, + "grad_norm": 10.322097778320312, + "learning_rate": 4.967953611824735e-05, + "loss": 0.5258, + "num_input_tokens_seen": 11368832, + "step": 19595 + }, + { + "epoch": 2.919273160560024, + "grad_norm": 8.903066635131836, + "learning_rate": 4.9679017295650694e-05, + "loss": 0.4676, + "num_input_tokens_seen": 11371648, + "step": 19600 + }, + { + "epoch": 2.920017873100983, + "grad_norm": 9.653099060058594, + "learning_rate": 4.9678498056127124e-05, + "loss": 0.5046, + "num_input_tokens_seen": 11374560, + "step": 19605 + }, + { + "epoch": 2.9207625856419424, + "grad_norm": 10.699766159057617, + "learning_rate": 4.967797839968541e-05, + "loss": 0.6178, + "num_input_tokens_seen": 11377600, + "step": 19610 + }, + { + "epoch": 2.9215072981829016, + "grad_norm": 8.202529907226562, + "learning_rate": 4.9677458326334336e-05, + "loss": 0.5809, + "num_input_tokens_seen": 11380544, + "step": 19615 + }, + { + "epoch": 2.9222520107238603, + "grad_norm": 5.369953155517578, + "learning_rate": 4.967693783608268e-05, + "loss": 0.6969, + "num_input_tokens_seen": 11383520, + "step": 19620 + }, + { + "epoch": 2.92299672326482, + "grad_norm": 7.82187557220459, + "learning_rate": 4.967641692893924e-05, + "loss": 0.6541, + "num_input_tokens_seen": 11386336, + "step": 19625 + }, + { + "epoch": 2.9237414358057787, + "grad_norm": 7.216286659240723, + "learning_rate": 4.967589560491282e-05, + "loss": 0.502, + "num_input_tokens_seen": 11388960, + "step": 19630 + }, + { + "epoch": 2.9244861483467384, + "grad_norm": 3.928318500518799, + "learning_rate": 4.967537386401222e-05, + "loss": 0.5179, + "num_input_tokens_seen": 11391520, + "step": 19635 + }, + { + "epoch": 2.925230860887697, + "grad_norm": 7.481530666351318, + "learning_rate": 4.967485170624625e-05, + "loss": 0.5245, + "num_input_tokens_seen": 11394624, + "step": 19640 + }, + { + "epoch": 2.9259755734286568, + "grad_norm": 9.096415519714355, + "learning_rate": 4.9674329131623756e-05, + "loss": 0.4829, + "num_input_tokens_seen": 11397408, + "step": 19645 + }, + { + "epoch": 2.9267202859696155, + "grad_norm": 7.20328426361084, + "learning_rate": 4.967380614015354e-05, + "loss": 0.5741, + "num_input_tokens_seen": 11400256, + "step": 19650 + }, + { + "epoch": 2.927464998510575, + "grad_norm": 7.444544315338135, + "learning_rate": 4.9673282731844444e-05, + "loss": 0.4423, + "num_input_tokens_seen": 11403136, + "step": 19655 + }, + { + "epoch": 2.928209711051534, + "grad_norm": 5.196988105773926, + "learning_rate": 4.967275890670532e-05, + "loss": 0.5877, + "num_input_tokens_seen": 11406016, + "step": 19660 + }, + { + "epoch": 2.928954423592493, + "grad_norm": 10.605137825012207, + "learning_rate": 4.967223466474501e-05, + "loss": 0.6966, + "num_input_tokens_seen": 11408736, + "step": 19665 + }, + { + "epoch": 2.9296991361334523, + "grad_norm": 5.751176357269287, + "learning_rate": 4.967171000597236e-05, + "loss": 0.7345, + "num_input_tokens_seen": 11411968, + "step": 19670 + }, + { + "epoch": 2.9304438486744115, + "grad_norm": 3.8192782402038574, + "learning_rate": 4.967118493039625e-05, + "loss": 0.525, + "num_input_tokens_seen": 11414880, + "step": 19675 + }, + { + "epoch": 2.9311885612153707, + "grad_norm": 8.290472984313965, + "learning_rate": 4.9670659438025545e-05, + "loss": 0.6215, + "num_input_tokens_seen": 11417632, + "step": 19680 + }, + { + "epoch": 2.93193327375633, + "grad_norm": 5.844400405883789, + "learning_rate": 4.967013352886913e-05, + "loss": 0.5564, + "num_input_tokens_seen": 11420416, + "step": 19685 + }, + { + "epoch": 2.932677986297289, + "grad_norm": 6.3628315925598145, + "learning_rate": 4.9669607202935876e-05, + "loss": 0.4701, + "num_input_tokens_seen": 11423328, + "step": 19690 + }, + { + "epoch": 2.9334226988382484, + "grad_norm": 6.325534343719482, + "learning_rate": 4.966908046023468e-05, + "loss": 0.4309, + "num_input_tokens_seen": 11426048, + "step": 19695 + }, + { + "epoch": 2.9341674113792076, + "grad_norm": 6.449359893798828, + "learning_rate": 4.966855330077445e-05, + "loss": 0.3243, + "num_input_tokens_seen": 11428800, + "step": 19700 + }, + { + "epoch": 2.9349121239201668, + "grad_norm": 7.917624473571777, + "learning_rate": 4.966802572456408e-05, + "loss": 0.5841, + "num_input_tokens_seen": 11431872, + "step": 19705 + }, + { + "epoch": 2.935656836461126, + "grad_norm": 14.541606903076172, + "learning_rate": 4.966749773161249e-05, + "loss": 0.6115, + "num_input_tokens_seen": 11434976, + "step": 19710 + }, + { + "epoch": 2.936401549002085, + "grad_norm": 9.335003852844238, + "learning_rate": 4.966696932192859e-05, + "loss": 0.8317, + "num_input_tokens_seen": 11437792, + "step": 19715 + }, + { + "epoch": 2.9371462615430444, + "grad_norm": 3.7015645503997803, + "learning_rate": 4.9666440495521313e-05, + "loss": 0.433, + "num_input_tokens_seen": 11440544, + "step": 19720 + }, + { + "epoch": 2.9378909740840036, + "grad_norm": 10.163095474243164, + "learning_rate": 4.96659112523996e-05, + "loss": 0.5531, + "num_input_tokens_seen": 11443360, + "step": 19725 + }, + { + "epoch": 2.9386356866249628, + "grad_norm": 8.442062377929688, + "learning_rate": 4.9665381592572387e-05, + "loss": 0.6463, + "num_input_tokens_seen": 11446272, + "step": 19730 + }, + { + "epoch": 2.939380399165922, + "grad_norm": 12.472432136535645, + "learning_rate": 4.9664851516048615e-05, + "loss": 0.6126, + "num_input_tokens_seen": 11449568, + "step": 19735 + }, + { + "epoch": 2.940125111706881, + "grad_norm": 7.721389293670654, + "learning_rate": 4.9664321022837244e-05, + "loss": 0.4846, + "num_input_tokens_seen": 11452512, + "step": 19740 + }, + { + "epoch": 2.9408698242478404, + "grad_norm": 10.591473579406738, + "learning_rate": 4.966379011294724e-05, + "loss": 0.577, + "num_input_tokens_seen": 11455168, + "step": 19745 + }, + { + "epoch": 2.9416145367887996, + "grad_norm": 6.582980155944824, + "learning_rate": 4.966325878638757e-05, + "loss": 0.6757, + "num_input_tokens_seen": 11458592, + "step": 19750 + }, + { + "epoch": 2.942359249329759, + "grad_norm": 4.158749103546143, + "learning_rate": 4.966272704316721e-05, + "loss": 0.461, + "num_input_tokens_seen": 11461408, + "step": 19755 + }, + { + "epoch": 2.943103961870718, + "grad_norm": 3.6971986293792725, + "learning_rate": 4.966219488329514e-05, + "loss": 0.5758, + "num_input_tokens_seen": 11464256, + "step": 19760 + }, + { + "epoch": 2.943848674411677, + "grad_norm": 7.463389873504639, + "learning_rate": 4.966166230678035e-05, + "loss": 0.4571, + "num_input_tokens_seen": 11467072, + "step": 19765 + }, + { + "epoch": 2.9445933869526364, + "grad_norm": 12.601655960083008, + "learning_rate": 4.966112931363185e-05, + "loss": 0.427, + "num_input_tokens_seen": 11469856, + "step": 19770 + }, + { + "epoch": 2.9453380994935956, + "grad_norm": 4.954195976257324, + "learning_rate": 4.966059590385863e-05, + "loss": 0.4148, + "num_input_tokens_seen": 11472736, + "step": 19775 + }, + { + "epoch": 2.946082812034555, + "grad_norm": 7.0021796226501465, + "learning_rate": 4.9660062077469706e-05, + "loss": 0.5891, + "num_input_tokens_seen": 11476000, + "step": 19780 + }, + { + "epoch": 2.946827524575514, + "grad_norm": 5.610217571258545, + "learning_rate": 4.965952783447409e-05, + "loss": 0.6356, + "num_input_tokens_seen": 11479104, + "step": 19785 + }, + { + "epoch": 2.947572237116473, + "grad_norm": 1.0525164604187012, + "learning_rate": 4.965899317488082e-05, + "loss": 0.3404, + "num_input_tokens_seen": 11481888, + "step": 19790 + }, + { + "epoch": 2.948316949657432, + "grad_norm": 3.6382126808166504, + "learning_rate": 4.9658458098698926e-05, + "loss": 0.4478, + "num_input_tokens_seen": 11484768, + "step": 19795 + }, + { + "epoch": 2.9490616621983916, + "grad_norm": 16.15779685974121, + "learning_rate": 4.965792260593744e-05, + "loss": 0.3739, + "num_input_tokens_seen": 11488064, + "step": 19800 + }, + { + "epoch": 2.9498063747393504, + "grad_norm": 16.9874324798584, + "learning_rate": 4.965738669660541e-05, + "loss": 0.6848, + "num_input_tokens_seen": 11490944, + "step": 19805 + }, + { + "epoch": 2.95055108728031, + "grad_norm": 8.226366996765137, + "learning_rate": 4.96568503707119e-05, + "loss": 0.6431, + "num_input_tokens_seen": 11493920, + "step": 19810 + }, + { + "epoch": 2.9512957998212688, + "grad_norm": 8.159316062927246, + "learning_rate": 4.965631362826596e-05, + "loss": 0.5195, + "num_input_tokens_seen": 11496800, + "step": 19815 + }, + { + "epoch": 2.9520405123622284, + "grad_norm": 5.486931324005127, + "learning_rate": 4.965577646927666e-05, + "loss": 0.4289, + "num_input_tokens_seen": 11499456, + "step": 19820 + }, + { + "epoch": 2.952785224903187, + "grad_norm": 6.640462398529053, + "learning_rate": 4.965523889375308e-05, + "loss": 0.6198, + "num_input_tokens_seen": 11502368, + "step": 19825 + }, + { + "epoch": 2.953529937444147, + "grad_norm": 12.157713890075684, + "learning_rate": 4.9654700901704286e-05, + "loss": 0.637, + "num_input_tokens_seen": 11504896, + "step": 19830 + }, + { + "epoch": 2.9542746499851056, + "grad_norm": 5.33467960357666, + "learning_rate": 4.965416249313939e-05, + "loss": 0.4904, + "num_input_tokens_seen": 11507488, + "step": 19835 + }, + { + "epoch": 2.955019362526065, + "grad_norm": 5.182480335235596, + "learning_rate": 4.965362366806747e-05, + "loss": 0.3671, + "num_input_tokens_seen": 11510368, + "step": 19840 + }, + { + "epoch": 2.955764075067024, + "grad_norm": 5.2631683349609375, + "learning_rate": 4.9653084426497633e-05, + "loss": 0.3767, + "num_input_tokens_seen": 11513088, + "step": 19845 + }, + { + "epoch": 2.956508787607983, + "grad_norm": 8.426655769348145, + "learning_rate": 4.965254476843899e-05, + "loss": 0.3541, + "num_input_tokens_seen": 11515872, + "step": 19850 + }, + { + "epoch": 2.9572535001489424, + "grad_norm": 5.659445762634277, + "learning_rate": 4.965200469390067e-05, + "loss": 0.6624, + "num_input_tokens_seen": 11518528, + "step": 19855 + }, + { + "epoch": 2.9579982126899016, + "grad_norm": 5.86555814743042, + "learning_rate": 4.965146420289177e-05, + "loss": 0.6156, + "num_input_tokens_seen": 11521568, + "step": 19860 + }, + { + "epoch": 2.958742925230861, + "grad_norm": 14.054978370666504, + "learning_rate": 4.965092329542145e-05, + "loss": 0.5035, + "num_input_tokens_seen": 11524224, + "step": 19865 + }, + { + "epoch": 2.95948763777182, + "grad_norm": 6.952120304107666, + "learning_rate": 4.9650381971498824e-05, + "loss": 0.576, + "num_input_tokens_seen": 11527136, + "step": 19870 + }, + { + "epoch": 2.960232350312779, + "grad_norm": 6.672202110290527, + "learning_rate": 4.964984023113306e-05, + "loss": 0.5111, + "num_input_tokens_seen": 11529760, + "step": 19875 + }, + { + "epoch": 2.9609770628537384, + "grad_norm": 8.755898475646973, + "learning_rate": 4.9649298074333294e-05, + "loss": 0.4894, + "num_input_tokens_seen": 11532480, + "step": 19880 + }, + { + "epoch": 2.9617217753946976, + "grad_norm": 1.904990553855896, + "learning_rate": 4.964875550110869e-05, + "loss": 0.5896, + "num_input_tokens_seen": 11535552, + "step": 19885 + }, + { + "epoch": 2.962466487935657, + "grad_norm": 9.04889965057373, + "learning_rate": 4.964821251146841e-05, + "loss": 0.3767, + "num_input_tokens_seen": 11538368, + "step": 19890 + }, + { + "epoch": 2.963211200476616, + "grad_norm": 10.160759925842285, + "learning_rate": 4.964766910542164e-05, + "loss": 0.4046, + "num_input_tokens_seen": 11540768, + "step": 19895 + }, + { + "epoch": 2.963955913017575, + "grad_norm": 6.4514946937561035, + "learning_rate": 4.9647125282977536e-05, + "loss": 0.3645, + "num_input_tokens_seen": 11543808, + "step": 19900 + }, + { + "epoch": 2.9647006255585344, + "grad_norm": 7.086756229400635, + "learning_rate": 4.964658104414531e-05, + "loss": 0.364, + "num_input_tokens_seen": 11547232, + "step": 19905 + }, + { + "epoch": 2.9654453380994936, + "grad_norm": 12.701719284057617, + "learning_rate": 4.964603638893415e-05, + "loss": 0.5671, + "num_input_tokens_seen": 11550400, + "step": 19910 + }, + { + "epoch": 2.966190050640453, + "grad_norm": 6.727903366088867, + "learning_rate": 4.9645491317353246e-05, + "loss": 0.6347, + "num_input_tokens_seen": 11553120, + "step": 19915 + }, + { + "epoch": 2.966934763181412, + "grad_norm": 9.08043098449707, + "learning_rate": 4.9644945829411815e-05, + "loss": 0.4865, + "num_input_tokens_seen": 11555968, + "step": 19920 + }, + { + "epoch": 2.9676794757223712, + "grad_norm": 16.66672706604004, + "learning_rate": 4.964439992511908e-05, + "loss": 0.7312, + "num_input_tokens_seen": 11558784, + "step": 19925 + }, + { + "epoch": 2.9684241882633304, + "grad_norm": 15.154720306396484, + "learning_rate": 4.964385360448425e-05, + "loss": 0.6247, + "num_input_tokens_seen": 11561664, + "step": 19930 + }, + { + "epoch": 2.9691689008042896, + "grad_norm": 4.658358097076416, + "learning_rate": 4.964330686751656e-05, + "loss": 0.4052, + "num_input_tokens_seen": 11564512, + "step": 19935 + }, + { + "epoch": 2.969913613345249, + "grad_norm": 9.437546730041504, + "learning_rate": 4.964275971422525e-05, + "loss": 0.4813, + "num_input_tokens_seen": 11567136, + "step": 19940 + }, + { + "epoch": 2.970658325886208, + "grad_norm": 4.998653888702393, + "learning_rate": 4.964221214461956e-05, + "loss": 0.5742, + "num_input_tokens_seen": 11569856, + "step": 19945 + }, + { + "epoch": 2.9714030384271672, + "grad_norm": 8.291851043701172, + "learning_rate": 4.964166415870874e-05, + "loss": 0.491, + "num_input_tokens_seen": 11573024, + "step": 19950 + }, + { + "epoch": 2.9721477509681264, + "grad_norm": 11.224231719970703, + "learning_rate": 4.964111575650205e-05, + "loss": 0.4538, + "num_input_tokens_seen": 11575616, + "step": 19955 + }, + { + "epoch": 2.9728924635090856, + "grad_norm": 7.8958611488342285, + "learning_rate": 4.9640566938008745e-05, + "loss": 0.5562, + "num_input_tokens_seen": 11578976, + "step": 19960 + }, + { + "epoch": 2.973637176050045, + "grad_norm": 12.414276123046875, + "learning_rate": 4.964001770323812e-05, + "loss": 0.571, + "num_input_tokens_seen": 11582016, + "step": 19965 + }, + { + "epoch": 2.9743818885910036, + "grad_norm": 8.183180809020996, + "learning_rate": 4.9639468052199426e-05, + "loss": 0.6679, + "num_input_tokens_seen": 11585376, + "step": 19970 + }, + { + "epoch": 2.9751266011319633, + "grad_norm": 5.179427146911621, + "learning_rate": 4.963891798490197e-05, + "loss": 0.4564, + "num_input_tokens_seen": 11588224, + "step": 19975 + }, + { + "epoch": 2.975871313672922, + "grad_norm": 33.152496337890625, + "learning_rate": 4.963836750135503e-05, + "loss": 0.6682, + "num_input_tokens_seen": 11591072, + "step": 19980 + }, + { + "epoch": 2.9766160262138817, + "grad_norm": 2.7579894065856934, + "learning_rate": 4.963781660156792e-05, + "loss": 0.4161, + "num_input_tokens_seen": 11593728, + "step": 19985 + }, + { + "epoch": 2.9773607387548404, + "grad_norm": 7.513052463531494, + "learning_rate": 4.9637265285549935e-05, + "loss": 0.5415, + "num_input_tokens_seen": 11596384, + "step": 19990 + }, + { + "epoch": 2.9781054512958, + "grad_norm": 13.136794090270996, + "learning_rate": 4.9636713553310396e-05, + "loss": 0.6263, + "num_input_tokens_seen": 11599456, + "step": 19995 + }, + { + "epoch": 2.978850163836759, + "grad_norm": 5.042469501495361, + "learning_rate": 4.963616140485862e-05, + "loss": 0.6322, + "num_input_tokens_seen": 11602208, + "step": 20000 + }, + { + "epoch": 2.9795948763777185, + "grad_norm": 4.0470991134643555, + "learning_rate": 4.963560884020393e-05, + "loss": 0.6653, + "num_input_tokens_seen": 11605184, + "step": 20005 + }, + { + "epoch": 2.9803395889186772, + "grad_norm": 7.114273548126221, + "learning_rate": 4.963505585935567e-05, + "loss": 0.619, + "num_input_tokens_seen": 11608000, + "step": 20010 + }, + { + "epoch": 2.9810843014596364, + "grad_norm": 7.397030353546143, + "learning_rate": 4.9634502462323186e-05, + "loss": 0.6194, + "num_input_tokens_seen": 11610848, + "step": 20015 + }, + { + "epoch": 2.9818290140005956, + "grad_norm": 3.413109540939331, + "learning_rate": 4.9633948649115816e-05, + "loss": 0.4142, + "num_input_tokens_seen": 11613696, + "step": 20020 + }, + { + "epoch": 2.982573726541555, + "grad_norm": 9.987261772155762, + "learning_rate": 4.9633394419742917e-05, + "loss": 0.6475, + "num_input_tokens_seen": 11616864, + "step": 20025 + }, + { + "epoch": 2.983318439082514, + "grad_norm": 5.864038467407227, + "learning_rate": 4.963283977421386e-05, + "loss": 0.5079, + "num_input_tokens_seen": 11619904, + "step": 20030 + }, + { + "epoch": 2.9840631516234732, + "grad_norm": 9.091145515441895, + "learning_rate": 4.9632284712538005e-05, + "loss": 0.8093, + "num_input_tokens_seen": 11622816, + "step": 20035 + }, + { + "epoch": 2.9848078641644324, + "grad_norm": 5.6600565910339355, + "learning_rate": 4.9631729234724736e-05, + "loss": 0.363, + "num_input_tokens_seen": 11625888, + "step": 20040 + }, + { + "epoch": 2.9855525767053916, + "grad_norm": 8.338574409484863, + "learning_rate": 4.9631173340783445e-05, + "loss": 0.4641, + "num_input_tokens_seen": 11629088, + "step": 20045 + }, + { + "epoch": 2.986297289246351, + "grad_norm": 7.3539042472839355, + "learning_rate": 4.96306170307235e-05, + "loss": 0.5482, + "num_input_tokens_seen": 11632128, + "step": 20050 + }, + { + "epoch": 2.98704200178731, + "grad_norm": 4.682465553283691, + "learning_rate": 4.963006030455433e-05, + "loss": 0.474, + "num_input_tokens_seen": 11634912, + "step": 20055 + }, + { + "epoch": 2.9877867143282693, + "grad_norm": 11.576263427734375, + "learning_rate": 4.962950316228532e-05, + "loss": 0.5718, + "num_input_tokens_seen": 11637664, + "step": 20060 + }, + { + "epoch": 2.9885314268692285, + "grad_norm": 5.888980388641357, + "learning_rate": 4.9628945603925884e-05, + "loss": 0.684, + "num_input_tokens_seen": 11640416, + "step": 20065 + }, + { + "epoch": 2.9892761394101877, + "grad_norm": 5.7527642250061035, + "learning_rate": 4.9628387629485435e-05, + "loss": 0.3773, + "num_input_tokens_seen": 11643424, + "step": 20070 + }, + { + "epoch": 2.990020851951147, + "grad_norm": 7.7624993324279785, + "learning_rate": 4.962782923897342e-05, + "loss": 0.4619, + "num_input_tokens_seen": 11646400, + "step": 20075 + }, + { + "epoch": 2.990765564492106, + "grad_norm": 5.037032127380371, + "learning_rate": 4.962727043239925e-05, + "loss": 0.4113, + "num_input_tokens_seen": 11649664, + "step": 20080 + }, + { + "epoch": 2.9915102770330653, + "grad_norm": 8.549290657043457, + "learning_rate": 4.962671120977238e-05, + "loss": 0.577, + "num_input_tokens_seen": 11652768, + "step": 20085 + }, + { + "epoch": 2.9922549895740245, + "grad_norm": 5.955373764038086, + "learning_rate": 4.962615157110226e-05, + "loss": 0.4656, + "num_input_tokens_seen": 11655552, + "step": 20090 + }, + { + "epoch": 2.9929997021149837, + "grad_norm": 9.750846862792969, + "learning_rate": 4.9625591516398336e-05, + "loss": 0.5355, + "num_input_tokens_seen": 11658272, + "step": 20095 + }, + { + "epoch": 2.993744414655943, + "grad_norm": 4.935070514678955, + "learning_rate": 4.962503104567007e-05, + "loss": 0.6363, + "num_input_tokens_seen": 11661248, + "step": 20100 + }, + { + "epoch": 2.994489127196902, + "grad_norm": 10.167156219482422, + "learning_rate": 4.9624470158926925e-05, + "loss": 0.5703, + "num_input_tokens_seen": 11664000, + "step": 20105 + }, + { + "epoch": 2.9952338397378613, + "grad_norm": 7.278628826141357, + "learning_rate": 4.962390885617839e-05, + "loss": 0.5398, + "num_input_tokens_seen": 11666848, + "step": 20110 + }, + { + "epoch": 2.9959785522788205, + "grad_norm": 13.522287368774414, + "learning_rate": 4.9623347137433954e-05, + "loss": 0.476, + "num_input_tokens_seen": 11669600, + "step": 20115 + }, + { + "epoch": 2.9967232648197797, + "grad_norm": 8.179055213928223, + "learning_rate": 4.962278500270307e-05, + "loss": 0.6483, + "num_input_tokens_seen": 11672352, + "step": 20120 + }, + { + "epoch": 2.997467977360739, + "grad_norm": 4.886775970458984, + "learning_rate": 4.9622222451995274e-05, + "loss": 0.4783, + "num_input_tokens_seen": 11675104, + "step": 20125 + }, + { + "epoch": 2.998212689901698, + "grad_norm": 7.164129257202148, + "learning_rate": 4.962165948532006e-05, + "loss": 0.5414, + "num_input_tokens_seen": 11677696, + "step": 20130 + }, + { + "epoch": 2.9989574024426573, + "grad_norm": 6.11353874206543, + "learning_rate": 4.962109610268692e-05, + "loss": 0.6125, + "num_input_tokens_seen": 11680640, + "step": 20135 + }, + { + "epoch": 2.9997021149836165, + "grad_norm": 13.611282348632812, + "learning_rate": 4.9620532304105385e-05, + "loss": 0.5869, + "num_input_tokens_seen": 11683648, + "step": 20140 + }, + { + "epoch": 3.0, + "eval_loss": 0.6375064253807068, + "eval_runtime": 51.243, + "eval_samples_per_second": 58.232, + "eval_steps_per_second": 14.558, + "num_input_tokens_seen": 11684296, + "step": 20142 + }, + { + "epoch": 3.0004468275245757, + "grad_norm": 3.8920633792877197, + "learning_rate": 4.961996808958499e-05, + "loss": 0.4793, + "num_input_tokens_seen": 11685992, + "step": 20145 + }, + { + "epoch": 3.001191540065535, + "grad_norm": 3.5269083976745605, + "learning_rate": 4.961940345913525e-05, + "loss": 0.3669, + "num_input_tokens_seen": 11688648, + "step": 20150 + }, + { + "epoch": 3.001936252606494, + "grad_norm": 8.750724792480469, + "learning_rate": 4.961883841276571e-05, + "loss": 0.4052, + "num_input_tokens_seen": 11691816, + "step": 20155 + }, + { + "epoch": 3.002680965147453, + "grad_norm": 5.189514636993408, + "learning_rate": 4.961827295048592e-05, + "loss": 0.277, + "num_input_tokens_seen": 11694664, + "step": 20160 + }, + { + "epoch": 3.003425677688412, + "grad_norm": 4.903550624847412, + "learning_rate": 4.961770707230543e-05, + "loss": 0.3988, + "num_input_tokens_seen": 11697512, + "step": 20165 + }, + { + "epoch": 3.0041703902293713, + "grad_norm": 7.451105117797852, + "learning_rate": 4.961714077823379e-05, + "loss": 0.4487, + "num_input_tokens_seen": 11700392, + "step": 20170 + }, + { + "epoch": 3.0049151027703305, + "grad_norm": 6.064476490020752, + "learning_rate": 4.961657406828059e-05, + "loss": 0.2268, + "num_input_tokens_seen": 11703688, + "step": 20175 + }, + { + "epoch": 3.0056598153112897, + "grad_norm": 11.541114807128906, + "learning_rate": 4.961600694245539e-05, + "loss": 0.3197, + "num_input_tokens_seen": 11706312, + "step": 20180 + }, + { + "epoch": 3.006404527852249, + "grad_norm": 17.164596557617188, + "learning_rate": 4.961543940076776e-05, + "loss": 0.5504, + "num_input_tokens_seen": 11709064, + "step": 20185 + }, + { + "epoch": 3.007149240393208, + "grad_norm": 6.550293445587158, + "learning_rate": 4.961487144322731e-05, + "loss": 0.3814, + "num_input_tokens_seen": 11711816, + "step": 20190 + }, + { + "epoch": 3.0078939529341673, + "grad_norm": 3.7137444019317627, + "learning_rate": 4.961430306984362e-05, + "loss": 0.3615, + "num_input_tokens_seen": 11714760, + "step": 20195 + }, + { + "epoch": 3.0086386654751265, + "grad_norm": 4.421490669250488, + "learning_rate": 4.9613734280626287e-05, + "loss": 0.3009, + "num_input_tokens_seen": 11717416, + "step": 20200 + }, + { + "epoch": 3.0093833780160857, + "grad_norm": 5.929668426513672, + "learning_rate": 4.961316507558494e-05, + "loss": 0.3682, + "num_input_tokens_seen": 11720168, + "step": 20205 + }, + { + "epoch": 3.010128090557045, + "grad_norm": 12.216864585876465, + "learning_rate": 4.961259545472918e-05, + "loss": 0.4037, + "num_input_tokens_seen": 11723144, + "step": 20210 + }, + { + "epoch": 3.010872803098004, + "grad_norm": 5.6644511222839355, + "learning_rate": 4.961202541806864e-05, + "loss": 0.3715, + "num_input_tokens_seen": 11725896, + "step": 20215 + }, + { + "epoch": 3.0116175156389633, + "grad_norm": 2.8642425537109375, + "learning_rate": 4.9611454965612944e-05, + "loss": 0.5333, + "num_input_tokens_seen": 11728840, + "step": 20220 + }, + { + "epoch": 3.0123622281799225, + "grad_norm": 7.702581882476807, + "learning_rate": 4.9610884097371736e-05, + "loss": 0.3137, + "num_input_tokens_seen": 11731464, + "step": 20225 + }, + { + "epoch": 3.0131069407208817, + "grad_norm": 17.727123260498047, + "learning_rate": 4.961031281335464e-05, + "loss": 0.3721, + "num_input_tokens_seen": 11734408, + "step": 20230 + }, + { + "epoch": 3.013851653261841, + "grad_norm": 6.38857889175415, + "learning_rate": 4.9609741113571336e-05, + "loss": 0.3689, + "num_input_tokens_seen": 11737064, + "step": 20235 + }, + { + "epoch": 3.0145963658028, + "grad_norm": 7.74995756149292, + "learning_rate": 4.960916899803146e-05, + "loss": 0.3276, + "num_input_tokens_seen": 11739848, + "step": 20240 + }, + { + "epoch": 3.0153410783437593, + "grad_norm": 11.757488250732422, + "learning_rate": 4.960859646674469e-05, + "loss": 0.2715, + "num_input_tokens_seen": 11742760, + "step": 20245 + }, + { + "epoch": 3.0160857908847185, + "grad_norm": 2.990440607070923, + "learning_rate": 4.960802351972069e-05, + "loss": 0.4449, + "num_input_tokens_seen": 11745768, + "step": 20250 + }, + { + "epoch": 3.0168305034256777, + "grad_norm": 0.09047140926122665, + "learning_rate": 4.960745015696914e-05, + "loss": 0.2267, + "num_input_tokens_seen": 11748808, + "step": 20255 + }, + { + "epoch": 3.017575215966637, + "grad_norm": 6.113170623779297, + "learning_rate": 4.960687637849974e-05, + "loss": 0.303, + "num_input_tokens_seen": 11751368, + "step": 20260 + }, + { + "epoch": 3.018319928507596, + "grad_norm": 11.418872833251953, + "learning_rate": 4.960630218432216e-05, + "loss": 0.4261, + "num_input_tokens_seen": 11754376, + "step": 20265 + }, + { + "epoch": 3.0190646410485553, + "grad_norm": 2.8255746364593506, + "learning_rate": 4.960572757444612e-05, + "loss": 0.2679, + "num_input_tokens_seen": 11757768, + "step": 20270 + }, + { + "epoch": 3.0198093535895145, + "grad_norm": 5.819061756134033, + "learning_rate": 4.960515254888133e-05, + "loss": 0.5637, + "num_input_tokens_seen": 11760488, + "step": 20275 + }, + { + "epoch": 3.0205540661304737, + "grad_norm": 17.698930740356445, + "learning_rate": 4.9604577107637484e-05, + "loss": 0.4902, + "num_input_tokens_seen": 11763304, + "step": 20280 + }, + { + "epoch": 3.021298778671433, + "grad_norm": 26.994626998901367, + "learning_rate": 4.960400125072431e-05, + "loss": 0.3453, + "num_input_tokens_seen": 11766152, + "step": 20285 + }, + { + "epoch": 3.022043491212392, + "grad_norm": 0.30414411425590515, + "learning_rate": 4.960342497815155e-05, + "loss": 0.2376, + "num_input_tokens_seen": 11769032, + "step": 20290 + }, + { + "epoch": 3.0227882037533513, + "grad_norm": 17.390533447265625, + "learning_rate": 4.9602848289928926e-05, + "loss": 0.2657, + "num_input_tokens_seen": 11771880, + "step": 20295 + }, + { + "epoch": 3.0235329162943105, + "grad_norm": 17.907583236694336, + "learning_rate": 4.9602271186066194e-05, + "loss": 0.3686, + "num_input_tokens_seen": 11774856, + "step": 20300 + }, + { + "epoch": 3.0242776288352697, + "grad_norm": 1.992503046989441, + "learning_rate": 4.960169366657309e-05, + "loss": 0.3719, + "num_input_tokens_seen": 11777640, + "step": 20305 + }, + { + "epoch": 3.025022341376229, + "grad_norm": 23.594175338745117, + "learning_rate": 4.960111573145937e-05, + "loss": 0.414, + "num_input_tokens_seen": 11780552, + "step": 20310 + }, + { + "epoch": 3.025767053917188, + "grad_norm": 9.420414924621582, + "learning_rate": 4.960053738073481e-05, + "loss": 0.2895, + "num_input_tokens_seen": 11783496, + "step": 20315 + }, + { + "epoch": 3.0265117664581473, + "grad_norm": 9.026688575744629, + "learning_rate": 4.959995861440917e-05, + "loss": 0.2629, + "num_input_tokens_seen": 11786184, + "step": 20320 + }, + { + "epoch": 3.0272564789991065, + "grad_norm": 8.271778106689453, + "learning_rate": 4.959937943249223e-05, + "loss": 0.269, + "num_input_tokens_seen": 11788904, + "step": 20325 + }, + { + "epoch": 3.0280011915400658, + "grad_norm": 5.992258548736572, + "learning_rate": 4.9598799834993784e-05, + "loss": 0.5256, + "num_input_tokens_seen": 11791848, + "step": 20330 + }, + { + "epoch": 3.0287459040810245, + "grad_norm": 11.40615177154541, + "learning_rate": 4.9598219821923605e-05, + "loss": 0.3747, + "num_input_tokens_seen": 11794856, + "step": 20335 + }, + { + "epoch": 3.0294906166219837, + "grad_norm": 13.295090675354004, + "learning_rate": 4.959763939329152e-05, + "loss": 0.6635, + "num_input_tokens_seen": 11797608, + "step": 20340 + }, + { + "epoch": 3.030235329162943, + "grad_norm": 18.037639617919922, + "learning_rate": 4.95970585491073e-05, + "loss": 0.4096, + "num_input_tokens_seen": 11800456, + "step": 20345 + }, + { + "epoch": 3.030980041703902, + "grad_norm": 12.912919998168945, + "learning_rate": 4.9596477289380786e-05, + "loss": 0.3888, + "num_input_tokens_seen": 11803048, + "step": 20350 + }, + { + "epoch": 3.0317247542448613, + "grad_norm": 15.80896282196045, + "learning_rate": 4.959589561412178e-05, + "loss": 0.4213, + "num_input_tokens_seen": 11805960, + "step": 20355 + }, + { + "epoch": 3.0324694667858205, + "grad_norm": 7.543644905090332, + "learning_rate": 4.959531352334012e-05, + "loss": 0.5014, + "num_input_tokens_seen": 11808776, + "step": 20360 + }, + { + "epoch": 3.0332141793267797, + "grad_norm": 5.109939098358154, + "learning_rate": 4.959473101704563e-05, + "loss": 0.3354, + "num_input_tokens_seen": 11811656, + "step": 20365 + }, + { + "epoch": 3.033958891867739, + "grad_norm": 43.542625427246094, + "learning_rate": 4.959414809524816e-05, + "loss": 0.2877, + "num_input_tokens_seen": 11814600, + "step": 20370 + }, + { + "epoch": 3.034703604408698, + "grad_norm": 9.160861015319824, + "learning_rate": 4.9593564757957554e-05, + "loss": 0.4816, + "num_input_tokens_seen": 11817352, + "step": 20375 + }, + { + "epoch": 3.0354483169496573, + "grad_norm": 18.951196670532227, + "learning_rate": 4.959298100518367e-05, + "loss": 0.2545, + "num_input_tokens_seen": 11820360, + "step": 20380 + }, + { + "epoch": 3.0361930294906165, + "grad_norm": 11.452112197875977, + "learning_rate": 4.959239683693636e-05, + "loss": 0.4, + "num_input_tokens_seen": 11823432, + "step": 20385 + }, + { + "epoch": 3.0369377420315757, + "grad_norm": 19.749820709228516, + "learning_rate": 4.959181225322551e-05, + "loss": 0.4484, + "num_input_tokens_seen": 11826440, + "step": 20390 + }, + { + "epoch": 3.037682454572535, + "grad_norm": 32.84326171875, + "learning_rate": 4.959122725406098e-05, + "loss": 0.5251, + "num_input_tokens_seen": 11829192, + "step": 20395 + }, + { + "epoch": 3.038427167113494, + "grad_norm": 3.4851341247558594, + "learning_rate": 4.959064183945266e-05, + "loss": 0.4237, + "num_input_tokens_seen": 11832104, + "step": 20400 + }, + { + "epoch": 3.0391718796544533, + "grad_norm": 13.023396492004395, + "learning_rate": 4.959005600941043e-05, + "loss": 0.381, + "num_input_tokens_seen": 11835240, + "step": 20405 + }, + { + "epoch": 3.0399165921954125, + "grad_norm": 8.87573528289795, + "learning_rate": 4.958946976394421e-05, + "loss": 0.3863, + "num_input_tokens_seen": 11837960, + "step": 20410 + }, + { + "epoch": 3.0406613047363718, + "grad_norm": 16.416288375854492, + "learning_rate": 4.958888310306389e-05, + "loss": 0.4553, + "num_input_tokens_seen": 11840584, + "step": 20415 + }, + { + "epoch": 3.041406017277331, + "grad_norm": 9.91463565826416, + "learning_rate": 4.958829602677937e-05, + "loss": 0.4419, + "num_input_tokens_seen": 11843496, + "step": 20420 + }, + { + "epoch": 3.04215072981829, + "grad_norm": 12.640117645263672, + "learning_rate": 4.9587708535100584e-05, + "loss": 0.3301, + "num_input_tokens_seen": 11846184, + "step": 20425 + }, + { + "epoch": 3.0428954423592494, + "grad_norm": 1.0708783864974976, + "learning_rate": 4.958712062803745e-05, + "loss": 0.3338, + "num_input_tokens_seen": 11848872, + "step": 20430 + }, + { + "epoch": 3.0436401549002086, + "grad_norm": 21.51947593688965, + "learning_rate": 4.958653230559991e-05, + "loss": 0.2465, + "num_input_tokens_seen": 11851848, + "step": 20435 + }, + { + "epoch": 3.0443848674411678, + "grad_norm": 46.33895492553711, + "learning_rate": 4.958594356779789e-05, + "loss": 0.3877, + "num_input_tokens_seen": 11854888, + "step": 20440 + }, + { + "epoch": 3.045129579982127, + "grad_norm": 30.230953216552734, + "learning_rate": 4.958535441464134e-05, + "loss": 0.3958, + "num_input_tokens_seen": 11857800, + "step": 20445 + }, + { + "epoch": 3.045874292523086, + "grad_norm": 21.48341178894043, + "learning_rate": 4.958476484614022e-05, + "loss": 0.7895, + "num_input_tokens_seen": 11860968, + "step": 20450 + }, + { + "epoch": 3.0466190050640454, + "grad_norm": 22.10698127746582, + "learning_rate": 4.958417486230448e-05, + "loss": 0.2571, + "num_input_tokens_seen": 11863944, + "step": 20455 + }, + { + "epoch": 3.0473637176050046, + "grad_norm": 12.064224243164062, + "learning_rate": 4.95835844631441e-05, + "loss": 0.4983, + "num_input_tokens_seen": 11867496, + "step": 20460 + }, + { + "epoch": 3.0481084301459638, + "grad_norm": 19.89446258544922, + "learning_rate": 4.958299364866903e-05, + "loss": 0.2036, + "num_input_tokens_seen": 11870568, + "step": 20465 + }, + { + "epoch": 3.048853142686923, + "grad_norm": 3.6409287452697754, + "learning_rate": 4.958240241888928e-05, + "loss": 0.3056, + "num_input_tokens_seen": 11873512, + "step": 20470 + }, + { + "epoch": 3.049597855227882, + "grad_norm": 19.68024444580078, + "learning_rate": 4.958181077381482e-05, + "loss": 0.2499, + "num_input_tokens_seen": 11876360, + "step": 20475 + }, + { + "epoch": 3.0503425677688414, + "grad_norm": 6.1567888259887695, + "learning_rate": 4.958121871345565e-05, + "loss": 0.4061, + "num_input_tokens_seen": 11879048, + "step": 20480 + }, + { + "epoch": 3.0510872803098006, + "grad_norm": 9.327616691589355, + "learning_rate": 4.958062623782178e-05, + "loss": 0.4306, + "num_input_tokens_seen": 11882024, + "step": 20485 + }, + { + "epoch": 3.05183199285076, + "grad_norm": 6.1623125076293945, + "learning_rate": 4.958003334692321e-05, + "loss": 0.3871, + "num_input_tokens_seen": 11884808, + "step": 20490 + }, + { + "epoch": 3.052576705391719, + "grad_norm": 13.323195457458496, + "learning_rate": 4.957944004076995e-05, + "loss": 0.5307, + "num_input_tokens_seen": 11888040, + "step": 20495 + }, + { + "epoch": 3.053321417932678, + "grad_norm": 14.76095199584961, + "learning_rate": 4.957884631937204e-05, + "loss": 0.7311, + "num_input_tokens_seen": 11890696, + "step": 20500 + }, + { + "epoch": 3.054066130473637, + "grad_norm": 6.624760627746582, + "learning_rate": 4.9578252182739506e-05, + "loss": 0.3305, + "num_input_tokens_seen": 11894248, + "step": 20505 + }, + { + "epoch": 3.054810843014596, + "grad_norm": 9.454984664916992, + "learning_rate": 4.957765763088237e-05, + "loss": 0.2898, + "num_input_tokens_seen": 11896936, + "step": 20510 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 7.351417064666748, + "learning_rate": 4.95770626638107e-05, + "loss": 0.396, + "num_input_tokens_seen": 11899432, + "step": 20515 + }, + { + "epoch": 3.0563002680965146, + "grad_norm": 33.685028076171875, + "learning_rate": 4.9576467281534526e-05, + "loss": 0.3184, + "num_input_tokens_seen": 11902568, + "step": 20520 + }, + { + "epoch": 3.0570449806374738, + "grad_norm": 8.494322776794434, + "learning_rate": 4.9575871484063915e-05, + "loss": 0.5699, + "num_input_tokens_seen": 11905384, + "step": 20525 + }, + { + "epoch": 3.057789693178433, + "grad_norm": 2.4991402626037598, + "learning_rate": 4.9575275271408944e-05, + "loss": 0.3142, + "num_input_tokens_seen": 11908200, + "step": 20530 + }, + { + "epoch": 3.058534405719392, + "grad_norm": 15.512465476989746, + "learning_rate": 4.957467864357967e-05, + "loss": 0.3225, + "num_input_tokens_seen": 11911016, + "step": 20535 + }, + { + "epoch": 3.0592791182603514, + "grad_norm": 10.394381523132324, + "learning_rate": 4.9574081600586175e-05, + "loss": 0.312, + "num_input_tokens_seen": 11914088, + "step": 20540 + }, + { + "epoch": 3.0600238308013106, + "grad_norm": 16.21776580810547, + "learning_rate": 4.957348414243855e-05, + "loss": 0.638, + "num_input_tokens_seen": 11917128, + "step": 20545 + }, + { + "epoch": 3.0607685433422698, + "grad_norm": 3.1969285011291504, + "learning_rate": 4.9572886269146877e-05, + "loss": 0.2575, + "num_input_tokens_seen": 11920104, + "step": 20550 + }, + { + "epoch": 3.061513255883229, + "grad_norm": 4.132532119750977, + "learning_rate": 4.957228798072128e-05, + "loss": 0.2634, + "num_input_tokens_seen": 11923048, + "step": 20555 + }, + { + "epoch": 3.062257968424188, + "grad_norm": 12.614140510559082, + "learning_rate": 4.957168927717184e-05, + "loss": 0.5274, + "num_input_tokens_seen": 11925800, + "step": 20560 + }, + { + "epoch": 3.0630026809651474, + "grad_norm": 9.310614585876465, + "learning_rate": 4.957109015850868e-05, + "loss": 0.3804, + "num_input_tokens_seen": 11928680, + "step": 20565 + }, + { + "epoch": 3.0637473935061066, + "grad_norm": 9.8181734085083, + "learning_rate": 4.957049062474194e-05, + "loss": 0.3622, + "num_input_tokens_seen": 11931976, + "step": 20570 + }, + { + "epoch": 3.064492106047066, + "grad_norm": 5.595479488372803, + "learning_rate": 4.956989067588172e-05, + "loss": 0.3717, + "num_input_tokens_seen": 11935048, + "step": 20575 + }, + { + "epoch": 3.065236818588025, + "grad_norm": 15.692148208618164, + "learning_rate": 4.956929031193817e-05, + "loss": 0.3696, + "num_input_tokens_seen": 11937928, + "step": 20580 + }, + { + "epoch": 3.065981531128984, + "grad_norm": 3.385343551635742, + "learning_rate": 4.956868953292143e-05, + "loss": 0.4382, + "num_input_tokens_seen": 11940872, + "step": 20585 + }, + { + "epoch": 3.0667262436699434, + "grad_norm": 6.0100202560424805, + "learning_rate": 4.9568088338841664e-05, + "loss": 0.3412, + "num_input_tokens_seen": 11943656, + "step": 20590 + }, + { + "epoch": 3.0674709562109026, + "grad_norm": 15.758886337280273, + "learning_rate": 4.9567486729709e-05, + "loss": 0.3515, + "num_input_tokens_seen": 11946440, + "step": 20595 + }, + { + "epoch": 3.068215668751862, + "grad_norm": 3.9621996879577637, + "learning_rate": 4.956688470553363e-05, + "loss": 0.4049, + "num_input_tokens_seen": 11949320, + "step": 20600 + }, + { + "epoch": 3.068960381292821, + "grad_norm": 14.075225830078125, + "learning_rate": 4.95662822663257e-05, + "loss": 0.3411, + "num_input_tokens_seen": 11952104, + "step": 20605 + }, + { + "epoch": 3.06970509383378, + "grad_norm": 27.96648406982422, + "learning_rate": 4.9565679412095415e-05, + "loss": 0.3101, + "num_input_tokens_seen": 11954920, + "step": 20610 + }, + { + "epoch": 3.0704498063747394, + "grad_norm": 9.119560241699219, + "learning_rate": 4.956507614285293e-05, + "loss": 0.2181, + "num_input_tokens_seen": 11957608, + "step": 20615 + }, + { + "epoch": 3.0711945189156986, + "grad_norm": 11.84537410736084, + "learning_rate": 4.9564472458608445e-05, + "loss": 0.4677, + "num_input_tokens_seen": 11960392, + "step": 20620 + }, + { + "epoch": 3.071939231456658, + "grad_norm": 6.893884181976318, + "learning_rate": 4.956386835937218e-05, + "loss": 0.3985, + "num_input_tokens_seen": 11963496, + "step": 20625 + }, + { + "epoch": 3.072683943997617, + "grad_norm": 25.80426597595215, + "learning_rate": 4.9563263845154315e-05, + "loss": 0.6703, + "num_input_tokens_seen": 11966408, + "step": 20630 + }, + { + "epoch": 3.073428656538576, + "grad_norm": 15.830364227294922, + "learning_rate": 4.9562658915965075e-05, + "loss": 0.3466, + "num_input_tokens_seen": 11969192, + "step": 20635 + }, + { + "epoch": 3.0741733690795354, + "grad_norm": 1.5046311616897583, + "learning_rate": 4.956205357181467e-05, + "loss": 0.1605, + "num_input_tokens_seen": 11971912, + "step": 20640 + }, + { + "epoch": 3.0749180816204946, + "grad_norm": 2.489490509033203, + "learning_rate": 4.9561447812713345e-05, + "loss": 0.4073, + "num_input_tokens_seen": 11974760, + "step": 20645 + }, + { + "epoch": 3.075662794161454, + "grad_norm": 3.9388482570648193, + "learning_rate": 4.956084163867132e-05, + "loss": 0.2415, + "num_input_tokens_seen": 11977384, + "step": 20650 + }, + { + "epoch": 3.076407506702413, + "grad_norm": 10.222204208374023, + "learning_rate": 4.9560235049698834e-05, + "loss": 0.4239, + "num_input_tokens_seen": 11981128, + "step": 20655 + }, + { + "epoch": 3.0771522192433722, + "grad_norm": 8.95782470703125, + "learning_rate": 4.955962804580614e-05, + "loss": 0.5984, + "num_input_tokens_seen": 11984072, + "step": 20660 + }, + { + "epoch": 3.0778969317843314, + "grad_norm": 24.102863311767578, + "learning_rate": 4.9559020627003494e-05, + "loss": 0.6277, + "num_input_tokens_seen": 11986952, + "step": 20665 + }, + { + "epoch": 3.0786416443252906, + "grad_norm": 16.137622833251953, + "learning_rate": 4.955841279330115e-05, + "loss": 0.3787, + "num_input_tokens_seen": 11989960, + "step": 20670 + }, + { + "epoch": 3.07938635686625, + "grad_norm": 10.06838607788086, + "learning_rate": 4.9557804544709385e-05, + "loss": 0.491, + "num_input_tokens_seen": 11992840, + "step": 20675 + }, + { + "epoch": 3.0801310694072086, + "grad_norm": 14.732213973999023, + "learning_rate": 4.955719588123847e-05, + "loss": 0.4191, + "num_input_tokens_seen": 11995784, + "step": 20680 + }, + { + "epoch": 3.080875781948168, + "grad_norm": 6.53666353225708, + "learning_rate": 4.955658680289869e-05, + "loss": 0.3988, + "num_input_tokens_seen": 11998888, + "step": 20685 + }, + { + "epoch": 3.081620494489127, + "grad_norm": 8.550853729248047, + "learning_rate": 4.955597730970034e-05, + "loss": 0.4162, + "num_input_tokens_seen": 12001832, + "step": 20690 + }, + { + "epoch": 3.082365207030086, + "grad_norm": 0.5659246444702148, + "learning_rate": 4.95553674016537e-05, + "loss": 0.2309, + "num_input_tokens_seen": 12004584, + "step": 20695 + }, + { + "epoch": 3.0831099195710454, + "grad_norm": 9.465850830078125, + "learning_rate": 4.9554757078769095e-05, + "loss": 0.3981, + "num_input_tokens_seen": 12007624, + "step": 20700 + }, + { + "epoch": 3.0838546321120046, + "grad_norm": 14.021127700805664, + "learning_rate": 4.955414634105682e-05, + "loss": 0.4724, + "num_input_tokens_seen": 12010536, + "step": 20705 + }, + { + "epoch": 3.084599344652964, + "grad_norm": 0.9071028232574463, + "learning_rate": 4.95535351885272e-05, + "loss": 0.5135, + "num_input_tokens_seen": 12013672, + "step": 20710 + }, + { + "epoch": 3.085344057193923, + "grad_norm": 1.4598515033721924, + "learning_rate": 4.955292362119055e-05, + "loss": 0.3017, + "num_input_tokens_seen": 12016232, + "step": 20715 + }, + { + "epoch": 3.086088769734882, + "grad_norm": 21.412437438964844, + "learning_rate": 4.955231163905723e-05, + "loss": 0.4586, + "num_input_tokens_seen": 12019496, + "step": 20720 + }, + { + "epoch": 3.0868334822758414, + "grad_norm": 7.8113694190979, + "learning_rate": 4.955169924213754e-05, + "loss": 0.2181, + "num_input_tokens_seen": 12022440, + "step": 20725 + }, + { + "epoch": 3.0875781948168006, + "grad_norm": 24.85430145263672, + "learning_rate": 4.955108643044185e-05, + "loss": 0.3639, + "num_input_tokens_seen": 12025448, + "step": 20730 + }, + { + "epoch": 3.08832290735776, + "grad_norm": 26.44330406188965, + "learning_rate": 4.955047320398051e-05, + "loss": 0.3152, + "num_input_tokens_seen": 12028168, + "step": 20735 + }, + { + "epoch": 3.089067619898719, + "grad_norm": 29.428943634033203, + "learning_rate": 4.954985956276388e-05, + "loss": 0.2398, + "num_input_tokens_seen": 12030952, + "step": 20740 + }, + { + "epoch": 3.0898123324396782, + "grad_norm": 32.27655792236328, + "learning_rate": 4.954924550680231e-05, + "loss": 0.632, + "num_input_tokens_seen": 12033768, + "step": 20745 + }, + { + "epoch": 3.0905570449806374, + "grad_norm": 19.88972282409668, + "learning_rate": 4.95486310361062e-05, + "loss": 0.7968, + "num_input_tokens_seen": 12036488, + "step": 20750 + }, + { + "epoch": 3.0913017575215966, + "grad_norm": 9.244441032409668, + "learning_rate": 4.954801615068592e-05, + "loss": 0.6642, + "num_input_tokens_seen": 12039496, + "step": 20755 + }, + { + "epoch": 3.092046470062556, + "grad_norm": 5.489071846008301, + "learning_rate": 4.9547400850551853e-05, + "loss": 0.3673, + "num_input_tokens_seen": 12042280, + "step": 20760 + }, + { + "epoch": 3.092791182603515, + "grad_norm": 6.087139129638672, + "learning_rate": 4.9546785135714394e-05, + "loss": 0.2654, + "num_input_tokens_seen": 12045256, + "step": 20765 + }, + { + "epoch": 3.0935358951444742, + "grad_norm": 11.897324562072754, + "learning_rate": 4.954616900618395e-05, + "loss": 0.3142, + "num_input_tokens_seen": 12047944, + "step": 20770 + }, + { + "epoch": 3.0942806076854334, + "grad_norm": 1.8893988132476807, + "learning_rate": 4.954555246197093e-05, + "loss": 0.2924, + "num_input_tokens_seen": 12050728, + "step": 20775 + }, + { + "epoch": 3.0950253202263927, + "grad_norm": 7.996466159820557, + "learning_rate": 4.954493550308575e-05, + "loss": 0.4172, + "num_input_tokens_seen": 12053448, + "step": 20780 + }, + { + "epoch": 3.095770032767352, + "grad_norm": 11.25286865234375, + "learning_rate": 4.9544318129538824e-05, + "loss": 0.3043, + "num_input_tokens_seen": 12056296, + "step": 20785 + }, + { + "epoch": 3.096514745308311, + "grad_norm": 1.1489591598510742, + "learning_rate": 4.95437003413406e-05, + "loss": 0.3708, + "num_input_tokens_seen": 12058952, + "step": 20790 + }, + { + "epoch": 3.0972594578492703, + "grad_norm": 1.2172905206680298, + "learning_rate": 4.9543082138501495e-05, + "loss": 0.3957, + "num_input_tokens_seen": 12061768, + "step": 20795 + }, + { + "epoch": 3.0980041703902295, + "grad_norm": 5.6545939445495605, + "learning_rate": 4.954246352103197e-05, + "loss": 0.4764, + "num_input_tokens_seen": 12064392, + "step": 20800 + }, + { + "epoch": 3.0987488829311887, + "grad_norm": 13.225614547729492, + "learning_rate": 4.954184448894246e-05, + "loss": 0.6491, + "num_input_tokens_seen": 12067208, + "step": 20805 + }, + { + "epoch": 3.099493595472148, + "grad_norm": 23.367341995239258, + "learning_rate": 4.954122504224343e-05, + "loss": 0.5255, + "num_input_tokens_seen": 12070216, + "step": 20810 + }, + { + "epoch": 3.100238308013107, + "grad_norm": 5.408106803894043, + "learning_rate": 4.954060518094535e-05, + "loss": 0.3241, + "num_input_tokens_seen": 12072936, + "step": 20815 + }, + { + "epoch": 3.1009830205540663, + "grad_norm": 16.200653076171875, + "learning_rate": 4.953998490505868e-05, + "loss": 0.4539, + "num_input_tokens_seen": 12075784, + "step": 20820 + }, + { + "epoch": 3.1017277330950255, + "grad_norm": 6.353426933288574, + "learning_rate": 4.953936421459392e-05, + "loss": 0.5457, + "num_input_tokens_seen": 12078760, + "step": 20825 + }, + { + "epoch": 3.1024724456359847, + "grad_norm": 9.633793830871582, + "learning_rate": 4.953874310956153e-05, + "loss": 0.3219, + "num_input_tokens_seen": 12081416, + "step": 20830 + }, + { + "epoch": 3.103217158176944, + "grad_norm": 9.950981140136719, + "learning_rate": 4.953812158997202e-05, + "loss": 0.3971, + "num_input_tokens_seen": 12084104, + "step": 20835 + }, + { + "epoch": 3.103961870717903, + "grad_norm": 6.931919097900391, + "learning_rate": 4.953749965583588e-05, + "loss": 0.2362, + "num_input_tokens_seen": 12087080, + "step": 20840 + }, + { + "epoch": 3.1047065832588623, + "grad_norm": 12.606389045715332, + "learning_rate": 4.953687730716363e-05, + "loss": 0.4142, + "num_input_tokens_seen": 12090344, + "step": 20845 + }, + { + "epoch": 3.1054512957998215, + "grad_norm": 29.299591064453125, + "learning_rate": 4.9536254543965775e-05, + "loss": 0.5908, + "num_input_tokens_seen": 12093416, + "step": 20850 + }, + { + "epoch": 3.1061960083407802, + "grad_norm": 9.884966850280762, + "learning_rate": 4.953563136625283e-05, + "loss": 0.3635, + "num_input_tokens_seen": 12096040, + "step": 20855 + }, + { + "epoch": 3.1069407208817394, + "grad_norm": 36.71110153198242, + "learning_rate": 4.9535007774035335e-05, + "loss": 0.3538, + "num_input_tokens_seen": 12099176, + "step": 20860 + }, + { + "epoch": 3.1076854334226987, + "grad_norm": 4.084207057952881, + "learning_rate": 4.9534383767323825e-05, + "loss": 0.2983, + "num_input_tokens_seen": 12102408, + "step": 20865 + }, + { + "epoch": 3.108430145963658, + "grad_norm": 31.6407413482666, + "learning_rate": 4.9533759346128824e-05, + "loss": 0.3225, + "num_input_tokens_seen": 12105160, + "step": 20870 + }, + { + "epoch": 3.109174858504617, + "grad_norm": 12.510236740112305, + "learning_rate": 4.953313451046091e-05, + "loss": 0.3473, + "num_input_tokens_seen": 12108360, + "step": 20875 + }, + { + "epoch": 3.1099195710455763, + "grad_norm": 0.22230641543865204, + "learning_rate": 4.9532509260330615e-05, + "loss": 0.2478, + "num_input_tokens_seen": 12111528, + "step": 20880 + }, + { + "epoch": 3.1106642835865355, + "grad_norm": 9.871585845947266, + "learning_rate": 4.953188359574851e-05, + "loss": 0.2928, + "num_input_tokens_seen": 12114184, + "step": 20885 + }, + { + "epoch": 3.1114089961274947, + "grad_norm": 13.32874584197998, + "learning_rate": 4.953125751672516e-05, + "loss": 0.2199, + "num_input_tokens_seen": 12117160, + "step": 20890 + }, + { + "epoch": 3.112153708668454, + "grad_norm": 11.379549980163574, + "learning_rate": 4.953063102327115e-05, + "loss": 0.2681, + "num_input_tokens_seen": 12120488, + "step": 20895 + }, + { + "epoch": 3.112898421209413, + "grad_norm": 8.153714179992676, + "learning_rate": 4.953000411539706e-05, + "loss": 0.4943, + "num_input_tokens_seen": 12123240, + "step": 20900 + }, + { + "epoch": 3.1136431337503723, + "grad_norm": 8.64140796661377, + "learning_rate": 4.952937679311348e-05, + "loss": 0.3525, + "num_input_tokens_seen": 12125960, + "step": 20905 + }, + { + "epoch": 3.1143878462913315, + "grad_norm": 22.55837631225586, + "learning_rate": 4.9528749056431015e-05, + "loss": 0.5213, + "num_input_tokens_seen": 12128680, + "step": 20910 + }, + { + "epoch": 3.1151325588322907, + "grad_norm": 12.730315208435059, + "learning_rate": 4.9528120905360265e-05, + "loss": 0.7737, + "num_input_tokens_seen": 12131304, + "step": 20915 + }, + { + "epoch": 3.11587727137325, + "grad_norm": 4.1191487312316895, + "learning_rate": 4.9527492339911836e-05, + "loss": 0.2912, + "num_input_tokens_seen": 12133992, + "step": 20920 + }, + { + "epoch": 3.116621983914209, + "grad_norm": 4.784219264984131, + "learning_rate": 4.952686336009635e-05, + "loss": 0.4072, + "num_input_tokens_seen": 12136616, + "step": 20925 + }, + { + "epoch": 3.1173666964551683, + "grad_norm": 2.822983503341675, + "learning_rate": 4.952623396592445e-05, + "loss": 0.1639, + "num_input_tokens_seen": 12139368, + "step": 20930 + }, + { + "epoch": 3.1181114089961275, + "grad_norm": 26.173250198364258, + "learning_rate": 4.952560415740674e-05, + "loss": 0.359, + "num_input_tokens_seen": 12142248, + "step": 20935 + }, + { + "epoch": 3.1188561215370867, + "grad_norm": 4.739068031311035, + "learning_rate": 4.9524973934553884e-05, + "loss": 0.3454, + "num_input_tokens_seen": 12145416, + "step": 20940 + }, + { + "epoch": 3.119600834078046, + "grad_norm": 16.305404663085938, + "learning_rate": 4.952434329737651e-05, + "loss": 0.4306, + "num_input_tokens_seen": 12148648, + "step": 20945 + }, + { + "epoch": 3.120345546619005, + "grad_norm": 4.451169967651367, + "learning_rate": 4.952371224588529e-05, + "loss": 0.2643, + "num_input_tokens_seen": 12151464, + "step": 20950 + }, + { + "epoch": 3.1210902591599643, + "grad_norm": 8.344619750976562, + "learning_rate": 4.952308078009087e-05, + "loss": 0.3168, + "num_input_tokens_seen": 12154216, + "step": 20955 + }, + { + "epoch": 3.1218349717009235, + "grad_norm": 18.411529541015625, + "learning_rate": 4.9522448900003925e-05, + "loss": 0.3854, + "num_input_tokens_seen": 12156936, + "step": 20960 + }, + { + "epoch": 3.1225796842418827, + "grad_norm": 9.148361206054688, + "learning_rate": 4.952181660563514e-05, + "loss": 0.3401, + "num_input_tokens_seen": 12159560, + "step": 20965 + }, + { + "epoch": 3.123324396782842, + "grad_norm": 17.959638595581055, + "learning_rate": 4.952118389699517e-05, + "loss": 0.3216, + "num_input_tokens_seen": 12162280, + "step": 20970 + }, + { + "epoch": 3.124069109323801, + "grad_norm": 9.266571998596191, + "learning_rate": 4.9520550774094735e-05, + "loss": 0.313, + "num_input_tokens_seen": 12165448, + "step": 20975 + }, + { + "epoch": 3.1248138218647603, + "grad_norm": 14.1454439163208, + "learning_rate": 4.9519917236944504e-05, + "loss": 0.2134, + "num_input_tokens_seen": 12168520, + "step": 20980 + }, + { + "epoch": 3.1255585344057195, + "grad_norm": 7.178711414337158, + "learning_rate": 4.9519283285555195e-05, + "loss": 0.4151, + "num_input_tokens_seen": 12171656, + "step": 20985 + }, + { + "epoch": 3.1263032469466787, + "grad_norm": 15.401716232299805, + "learning_rate": 4.951864891993752e-05, + "loss": 0.2774, + "num_input_tokens_seen": 12174728, + "step": 20990 + }, + { + "epoch": 3.127047959487638, + "grad_norm": 12.89404582977295, + "learning_rate": 4.951801414010219e-05, + "loss": 0.3333, + "num_input_tokens_seen": 12177640, + "step": 20995 + }, + { + "epoch": 3.127792672028597, + "grad_norm": 6.815556526184082, + "learning_rate": 4.9517378946059936e-05, + "loss": 0.3411, + "num_input_tokens_seen": 12180392, + "step": 21000 + }, + { + "epoch": 3.1285373845695563, + "grad_norm": 9.537663459777832, + "learning_rate": 4.951674333782147e-05, + "loss": 0.2525, + "num_input_tokens_seen": 12183016, + "step": 21005 + }, + { + "epoch": 3.1292820971105155, + "grad_norm": 14.276694297790527, + "learning_rate": 4.9516107315397554e-05, + "loss": 0.6669, + "num_input_tokens_seen": 12185992, + "step": 21010 + }, + { + "epoch": 3.1300268096514747, + "grad_norm": 34.75044250488281, + "learning_rate": 4.951547087879891e-05, + "loss": 0.8574, + "num_input_tokens_seen": 12188776, + "step": 21015 + }, + { + "epoch": 3.1307715221924335, + "grad_norm": 3.5687601566314697, + "learning_rate": 4.951483402803631e-05, + "loss": 0.4316, + "num_input_tokens_seen": 12191336, + "step": 21020 + }, + { + "epoch": 3.131516234733393, + "grad_norm": 11.318717956542969, + "learning_rate": 4.95141967631205e-05, + "loss": 0.5453, + "num_input_tokens_seen": 12194312, + "step": 21025 + }, + { + "epoch": 3.132260947274352, + "grad_norm": 4.748692512512207, + "learning_rate": 4.951355908406226e-05, + "loss": 0.2185, + "num_input_tokens_seen": 12197288, + "step": 21030 + }, + { + "epoch": 3.133005659815311, + "grad_norm": 22.558320999145508, + "learning_rate": 4.951292099087235e-05, + "loss": 0.3609, + "num_input_tokens_seen": 12200072, + "step": 21035 + }, + { + "epoch": 3.1337503723562703, + "grad_norm": 14.345991134643555, + "learning_rate": 4.951228248356155e-05, + "loss": 0.284, + "num_input_tokens_seen": 12202856, + "step": 21040 + }, + { + "epoch": 3.1344950848972295, + "grad_norm": 10.108922958374023, + "learning_rate": 4.951164356214065e-05, + "loss": 0.3939, + "num_input_tokens_seen": 12205864, + "step": 21045 + }, + { + "epoch": 3.1352397974381887, + "grad_norm": 8.459851264953613, + "learning_rate": 4.951100422662045e-05, + "loss": 0.3697, + "num_input_tokens_seen": 12209064, + "step": 21050 + }, + { + "epoch": 3.135984509979148, + "grad_norm": 25.3325138092041, + "learning_rate": 4.951036447701174e-05, + "loss": 0.3258, + "num_input_tokens_seen": 12211752, + "step": 21055 + }, + { + "epoch": 3.136729222520107, + "grad_norm": 16.389366149902344, + "learning_rate": 4.950972431332534e-05, + "loss": 0.4153, + "num_input_tokens_seen": 12214632, + "step": 21060 + }, + { + "epoch": 3.1374739350610663, + "grad_norm": 3.5411369800567627, + "learning_rate": 4.9509083735572055e-05, + "loss": 0.5384, + "num_input_tokens_seen": 12217224, + "step": 21065 + }, + { + "epoch": 3.1382186476020255, + "grad_norm": 7.8539204597473145, + "learning_rate": 4.950844274376271e-05, + "loss": 0.1589, + "num_input_tokens_seen": 12220424, + "step": 21070 + }, + { + "epoch": 3.1389633601429847, + "grad_norm": 11.182391166687012, + "learning_rate": 4.950780133790813e-05, + "loss": 0.2227, + "num_input_tokens_seen": 12223560, + "step": 21075 + }, + { + "epoch": 3.139708072683944, + "grad_norm": 16.127710342407227, + "learning_rate": 4.950715951801916e-05, + "loss": 0.4753, + "num_input_tokens_seen": 12226440, + "step": 21080 + }, + { + "epoch": 3.140452785224903, + "grad_norm": 6.213385105133057, + "learning_rate": 4.950651728410663e-05, + "loss": 0.4716, + "num_input_tokens_seen": 12229256, + "step": 21085 + }, + { + "epoch": 3.1411974977658623, + "grad_norm": 15.124560356140137, + "learning_rate": 4.9505874636181414e-05, + "loss": 0.5214, + "num_input_tokens_seen": 12232104, + "step": 21090 + }, + { + "epoch": 3.1419422103068215, + "grad_norm": 17.946903228759766, + "learning_rate": 4.950523157425434e-05, + "loss": 0.4856, + "num_input_tokens_seen": 12234984, + "step": 21095 + }, + { + "epoch": 3.1426869228477807, + "grad_norm": 10.995722770690918, + "learning_rate": 4.950458809833629e-05, + "loss": 0.1966, + "num_input_tokens_seen": 12237832, + "step": 21100 + }, + { + "epoch": 3.14343163538874, + "grad_norm": 8.678595542907715, + "learning_rate": 4.9503944208438124e-05, + "loss": 0.3779, + "num_input_tokens_seen": 12240776, + "step": 21105 + }, + { + "epoch": 3.144176347929699, + "grad_norm": 9.409261703491211, + "learning_rate": 4.950329990457073e-05, + "loss": 0.2646, + "num_input_tokens_seen": 12243720, + "step": 21110 + }, + { + "epoch": 3.1449210604706583, + "grad_norm": 13.310940742492676, + "learning_rate": 4.950265518674498e-05, + "loss": 0.3381, + "num_input_tokens_seen": 12246888, + "step": 21115 + }, + { + "epoch": 3.1456657730116175, + "grad_norm": 7.811596870422363, + "learning_rate": 4.950201005497179e-05, + "loss": 0.3854, + "num_input_tokens_seen": 12249640, + "step": 21120 + }, + { + "epoch": 3.1464104855525767, + "grad_norm": 13.351661682128906, + "learning_rate": 4.950136450926203e-05, + "loss": 0.546, + "num_input_tokens_seen": 12252712, + "step": 21125 + }, + { + "epoch": 3.147155198093536, + "grad_norm": 8.123114585876465, + "learning_rate": 4.950071854962662e-05, + "loss": 0.3326, + "num_input_tokens_seen": 12255464, + "step": 21130 + }, + { + "epoch": 3.147899910634495, + "grad_norm": 5.002377033233643, + "learning_rate": 4.950007217607647e-05, + "loss": 0.2231, + "num_input_tokens_seen": 12258280, + "step": 21135 + }, + { + "epoch": 3.1486446231754543, + "grad_norm": 8.189407348632812, + "learning_rate": 4.949942538862251e-05, + "loss": 0.3559, + "num_input_tokens_seen": 12261128, + "step": 21140 + }, + { + "epoch": 3.1493893357164136, + "grad_norm": 16.72909164428711, + "learning_rate": 4.949877818727565e-05, + "loss": 0.3514, + "num_input_tokens_seen": 12264104, + "step": 21145 + }, + { + "epoch": 3.1501340482573728, + "grad_norm": 26.695558547973633, + "learning_rate": 4.949813057204684e-05, + "loss": 0.3381, + "num_input_tokens_seen": 12266760, + "step": 21150 + }, + { + "epoch": 3.150878760798332, + "grad_norm": 8.753900527954102, + "learning_rate": 4.9497482542947004e-05, + "loss": 0.5209, + "num_input_tokens_seen": 12269448, + "step": 21155 + }, + { + "epoch": 3.151623473339291, + "grad_norm": 12.584732055664062, + "learning_rate": 4.9496834099987106e-05, + "loss": 0.542, + "num_input_tokens_seen": 12272264, + "step": 21160 + }, + { + "epoch": 3.1523681858802504, + "grad_norm": 10.447847366333008, + "learning_rate": 4.949618524317809e-05, + "loss": 0.512, + "num_input_tokens_seen": 12275016, + "step": 21165 + }, + { + "epoch": 3.1531128984212096, + "grad_norm": 13.1034574508667, + "learning_rate": 4.9495535972530924e-05, + "loss": 0.3351, + "num_input_tokens_seen": 12278088, + "step": 21170 + }, + { + "epoch": 3.1538576109621688, + "grad_norm": 16.28922462463379, + "learning_rate": 4.949488628805657e-05, + "loss": 0.3484, + "num_input_tokens_seen": 12280808, + "step": 21175 + }, + { + "epoch": 3.154602323503128, + "grad_norm": 8.5625638961792, + "learning_rate": 4.9494236189766005e-05, + "loss": 0.5253, + "num_input_tokens_seen": 12283976, + "step": 21180 + }, + { + "epoch": 3.155347036044087, + "grad_norm": 11.133434295654297, + "learning_rate": 4.9493585677670216e-05, + "loss": 0.3377, + "num_input_tokens_seen": 12287112, + "step": 21185 + }, + { + "epoch": 3.1560917485850464, + "grad_norm": 6.9026265144348145, + "learning_rate": 4.94929347517802e-05, + "loss": 0.3003, + "num_input_tokens_seen": 12290088, + "step": 21190 + }, + { + "epoch": 3.156836461126005, + "grad_norm": 11.585456848144531, + "learning_rate": 4.9492283412106934e-05, + "loss": 0.5968, + "num_input_tokens_seen": 12292904, + "step": 21195 + }, + { + "epoch": 3.157581173666965, + "grad_norm": 4.663967609405518, + "learning_rate": 4.9491631658661436e-05, + "loss": 0.3525, + "num_input_tokens_seen": 12295560, + "step": 21200 + }, + { + "epoch": 3.1583258862079235, + "grad_norm": 11.085931777954102, + "learning_rate": 4.9490979491454716e-05, + "loss": 0.3879, + "num_input_tokens_seen": 12298728, + "step": 21205 + }, + { + "epoch": 3.1590705987488827, + "grad_norm": 12.957762718200684, + "learning_rate": 4.9490326910497786e-05, + "loss": 0.4063, + "num_input_tokens_seen": 12301640, + "step": 21210 + }, + { + "epoch": 3.159815311289842, + "grad_norm": 7.781863212585449, + "learning_rate": 4.948967391580167e-05, + "loss": 0.4451, + "num_input_tokens_seen": 12304488, + "step": 21215 + }, + { + "epoch": 3.160560023830801, + "grad_norm": 6.866255760192871, + "learning_rate": 4.948902050737741e-05, + "loss": 0.2838, + "num_input_tokens_seen": 12307432, + "step": 21220 + }, + { + "epoch": 3.1613047363717603, + "grad_norm": 24.13614273071289, + "learning_rate": 4.948836668523604e-05, + "loss": 0.52, + "num_input_tokens_seen": 12310408, + "step": 21225 + }, + { + "epoch": 3.1620494489127196, + "grad_norm": 4.132468223571777, + "learning_rate": 4.9487712449388604e-05, + "loss": 0.4451, + "num_input_tokens_seen": 12313448, + "step": 21230 + }, + { + "epoch": 3.1627941614536788, + "grad_norm": 6.989225387573242, + "learning_rate": 4.948705779984614e-05, + "loss": 0.5453, + "num_input_tokens_seen": 12316264, + "step": 21235 + }, + { + "epoch": 3.163538873994638, + "grad_norm": 7.981618404388428, + "learning_rate": 4.9486402736619736e-05, + "loss": 0.446, + "num_input_tokens_seen": 12319112, + "step": 21240 + }, + { + "epoch": 3.164283586535597, + "grad_norm": 17.1885986328125, + "learning_rate": 4.9485747259720435e-05, + "loss": 0.3878, + "num_input_tokens_seen": 12322152, + "step": 21245 + }, + { + "epoch": 3.1650282990765564, + "grad_norm": 7.879520893096924, + "learning_rate": 4.9485091369159334e-05, + "loss": 0.3566, + "num_input_tokens_seen": 12325032, + "step": 21250 + }, + { + "epoch": 3.1657730116175156, + "grad_norm": 9.607588768005371, + "learning_rate": 4.948443506494749e-05, + "loss": 0.4307, + "num_input_tokens_seen": 12328040, + "step": 21255 + }, + { + "epoch": 3.1665177241584748, + "grad_norm": 3.8740129470825195, + "learning_rate": 4.9483778347096e-05, + "loss": 0.3977, + "num_input_tokens_seen": 12330920, + "step": 21260 + }, + { + "epoch": 3.167262436699434, + "grad_norm": 12.611063957214355, + "learning_rate": 4.948312121561596e-05, + "loss": 0.4135, + "num_input_tokens_seen": 12333832, + "step": 21265 + }, + { + "epoch": 3.168007149240393, + "grad_norm": 19.1776180267334, + "learning_rate": 4.9482463670518476e-05, + "loss": 0.3181, + "num_input_tokens_seen": 12336872, + "step": 21270 + }, + { + "epoch": 3.1687518617813524, + "grad_norm": 11.73106861114502, + "learning_rate": 4.9481805711814645e-05, + "loss": 0.3381, + "num_input_tokens_seen": 12339848, + "step": 21275 + }, + { + "epoch": 3.1694965743223116, + "grad_norm": 7.69075345993042, + "learning_rate": 4.948114733951559e-05, + "loss": 0.2691, + "num_input_tokens_seen": 12342824, + "step": 21280 + }, + { + "epoch": 3.170241286863271, + "grad_norm": 5.920578956604004, + "learning_rate": 4.948048855363243e-05, + "loss": 0.2048, + "num_input_tokens_seen": 12345800, + "step": 21285 + }, + { + "epoch": 3.17098599940423, + "grad_norm": 9.702552795410156, + "learning_rate": 4.94798293541763e-05, + "loss": 0.418, + "num_input_tokens_seen": 12349128, + "step": 21290 + }, + { + "epoch": 3.171730711945189, + "grad_norm": 12.29761028289795, + "learning_rate": 4.9479169741158336e-05, + "loss": 0.3257, + "num_input_tokens_seen": 12352232, + "step": 21295 + }, + { + "epoch": 3.1724754244861484, + "grad_norm": 17.6606388092041, + "learning_rate": 4.947850971458968e-05, + "loss": 0.2175, + "num_input_tokens_seen": 12355368, + "step": 21300 + }, + { + "epoch": 3.1732201370271076, + "grad_norm": 9.901477813720703, + "learning_rate": 4.947784927448147e-05, + "loss": 0.3036, + "num_input_tokens_seen": 12357992, + "step": 21305 + }, + { + "epoch": 3.173964849568067, + "grad_norm": 4.249791145324707, + "learning_rate": 4.9477188420844886e-05, + "loss": 0.329, + "num_input_tokens_seen": 12360616, + "step": 21310 + }, + { + "epoch": 3.174709562109026, + "grad_norm": 6.109784126281738, + "learning_rate": 4.947652715369108e-05, + "loss": 0.5078, + "num_input_tokens_seen": 12363432, + "step": 21315 + }, + { + "epoch": 3.175454274649985, + "grad_norm": 13.817680358886719, + "learning_rate": 4.947586547303121e-05, + "loss": 0.3327, + "num_input_tokens_seen": 12366248, + "step": 21320 + }, + { + "epoch": 3.1761989871909444, + "grad_norm": 5.995926856994629, + "learning_rate": 4.947520337887649e-05, + "loss": 0.4567, + "num_input_tokens_seen": 12369064, + "step": 21325 + }, + { + "epoch": 3.1769436997319036, + "grad_norm": 3.2042407989501953, + "learning_rate": 4.947454087123807e-05, + "loss": 0.2131, + "num_input_tokens_seen": 12372168, + "step": 21330 + }, + { + "epoch": 3.177688412272863, + "grad_norm": 11.951240539550781, + "learning_rate": 4.947387795012716e-05, + "loss": 0.4255, + "num_input_tokens_seen": 12375176, + "step": 21335 + }, + { + "epoch": 3.178433124813822, + "grad_norm": 7.485651016235352, + "learning_rate": 4.947321461555496e-05, + "loss": 0.342, + "num_input_tokens_seen": 12378088, + "step": 21340 + }, + { + "epoch": 3.179177837354781, + "grad_norm": 8.796098709106445, + "learning_rate": 4.947255086753268e-05, + "loss": 0.3334, + "num_input_tokens_seen": 12381064, + "step": 21345 + }, + { + "epoch": 3.1799225498957404, + "grad_norm": 10.212075233459473, + "learning_rate": 4.9471886706071504e-05, + "loss": 0.6072, + "num_input_tokens_seen": 12384104, + "step": 21350 + }, + { + "epoch": 3.1806672624366996, + "grad_norm": 10.644165992736816, + "learning_rate": 4.9471222131182685e-05, + "loss": 0.3447, + "num_input_tokens_seen": 12386792, + "step": 21355 + }, + { + "epoch": 3.181411974977659, + "grad_norm": 7.234433174133301, + "learning_rate": 4.9470557142877446e-05, + "loss": 0.3647, + "num_input_tokens_seen": 12389864, + "step": 21360 + }, + { + "epoch": 3.182156687518618, + "grad_norm": 6.806734561920166, + "learning_rate": 4.946989174116701e-05, + "loss": 0.3469, + "num_input_tokens_seen": 12392776, + "step": 21365 + }, + { + "epoch": 3.182901400059577, + "grad_norm": 3.7955803871154785, + "learning_rate": 4.9469225926062625e-05, + "loss": 0.192, + "num_input_tokens_seen": 12395752, + "step": 21370 + }, + { + "epoch": 3.1836461126005364, + "grad_norm": 23.730260848999023, + "learning_rate": 4.946855969757553e-05, + "loss": 0.3196, + "num_input_tokens_seen": 12398280, + "step": 21375 + }, + { + "epoch": 3.184390825141495, + "grad_norm": 5.464567184448242, + "learning_rate": 4.9467893055716996e-05, + "loss": 0.3484, + "num_input_tokens_seen": 12401448, + "step": 21380 + }, + { + "epoch": 3.1851355376824544, + "grad_norm": 9.851067543029785, + "learning_rate": 4.946722600049827e-05, + "loss": 0.5951, + "num_input_tokens_seen": 12404264, + "step": 21385 + }, + { + "epoch": 3.1858802502234136, + "grad_norm": 18.902170181274414, + "learning_rate": 4.946655853193063e-05, + "loss": 0.6984, + "num_input_tokens_seen": 12407112, + "step": 21390 + }, + { + "epoch": 3.186624962764373, + "grad_norm": 2.785221576690674, + "learning_rate": 4.946589065002535e-05, + "loss": 0.4254, + "num_input_tokens_seen": 12409864, + "step": 21395 + }, + { + "epoch": 3.187369675305332, + "grad_norm": 2.7700867652893066, + "learning_rate": 4.946522235479372e-05, + "loss": 0.1878, + "num_input_tokens_seen": 12412648, + "step": 21400 + }, + { + "epoch": 3.188114387846291, + "grad_norm": 2.156137466430664, + "learning_rate": 4.946455364624702e-05, + "loss": 0.2826, + "num_input_tokens_seen": 12415528, + "step": 21405 + }, + { + "epoch": 3.1888591003872504, + "grad_norm": 0.7681654095649719, + "learning_rate": 4.9463884524396555e-05, + "loss": 0.5681, + "num_input_tokens_seen": 12418504, + "step": 21410 + }, + { + "epoch": 3.1896038129282096, + "grad_norm": 21.720232009887695, + "learning_rate": 4.946321498925362e-05, + "loss": 0.4937, + "num_input_tokens_seen": 12421192, + "step": 21415 + }, + { + "epoch": 3.190348525469169, + "grad_norm": 6.824465751647949, + "learning_rate": 4.946254504082952e-05, + "loss": 0.3992, + "num_input_tokens_seen": 12424104, + "step": 21420 + }, + { + "epoch": 3.191093238010128, + "grad_norm": 6.440730094909668, + "learning_rate": 4.94618746791356e-05, + "loss": 0.4162, + "num_input_tokens_seen": 12426760, + "step": 21425 + }, + { + "epoch": 3.191837950551087, + "grad_norm": 5.415096282958984, + "learning_rate": 4.946120390418316e-05, + "loss": 0.318, + "num_input_tokens_seen": 12429544, + "step": 21430 + }, + { + "epoch": 3.1925826630920464, + "grad_norm": 6.9932756423950195, + "learning_rate": 4.946053271598355e-05, + "loss": 0.3163, + "num_input_tokens_seen": 12432552, + "step": 21435 + }, + { + "epoch": 3.1933273756330056, + "grad_norm": 4.857782363891602, + "learning_rate": 4.94598611145481e-05, + "loss": 0.316, + "num_input_tokens_seen": 12435272, + "step": 21440 + }, + { + "epoch": 3.194072088173965, + "grad_norm": 7.801003932952881, + "learning_rate": 4.945918909988815e-05, + "loss": 0.3957, + "num_input_tokens_seen": 12437992, + "step": 21445 + }, + { + "epoch": 3.194816800714924, + "grad_norm": 3.1717684268951416, + "learning_rate": 4.945851667201507e-05, + "loss": 0.5606, + "num_input_tokens_seen": 12440840, + "step": 21450 + }, + { + "epoch": 3.1955615132558832, + "grad_norm": 9.651822090148926, + "learning_rate": 4.945784383094019e-05, + "loss": 0.3551, + "num_input_tokens_seen": 12443880, + "step": 21455 + }, + { + "epoch": 3.1963062257968424, + "grad_norm": 5.85109281539917, + "learning_rate": 4.9457170576674914e-05, + "loss": 0.2963, + "num_input_tokens_seen": 12446888, + "step": 21460 + }, + { + "epoch": 3.1970509383378016, + "grad_norm": 9.229692459106445, + "learning_rate": 4.945649690923059e-05, + "loss": 0.304, + "num_input_tokens_seen": 12449544, + "step": 21465 + }, + { + "epoch": 3.197795650878761, + "grad_norm": 17.019929885864258, + "learning_rate": 4.94558228286186e-05, + "loss": 0.3185, + "num_input_tokens_seen": 12452552, + "step": 21470 + }, + { + "epoch": 3.19854036341972, + "grad_norm": 5.359261989593506, + "learning_rate": 4.945514833485036e-05, + "loss": 0.3562, + "num_input_tokens_seen": 12455336, + "step": 21475 + }, + { + "epoch": 3.1992850759606792, + "grad_norm": 3.5401806831359863, + "learning_rate": 4.9454473427937225e-05, + "loss": 0.4632, + "num_input_tokens_seen": 12458376, + "step": 21480 + }, + { + "epoch": 3.2000297885016384, + "grad_norm": 32.30338668823242, + "learning_rate": 4.9453798107890624e-05, + "loss": 0.5332, + "num_input_tokens_seen": 12461512, + "step": 21485 + }, + { + "epoch": 3.2007745010425976, + "grad_norm": 17.26091194152832, + "learning_rate": 4.945312237472196e-05, + "loss": 0.4011, + "num_input_tokens_seen": 12464456, + "step": 21490 + }, + { + "epoch": 3.201519213583557, + "grad_norm": 11.086411476135254, + "learning_rate": 4.945244622844264e-05, + "loss": 0.3388, + "num_input_tokens_seen": 12467048, + "step": 21495 + }, + { + "epoch": 3.202263926124516, + "grad_norm": 6.472581386566162, + "learning_rate": 4.9451769669064096e-05, + "loss": 0.3041, + "num_input_tokens_seen": 12469960, + "step": 21500 + }, + { + "epoch": 3.2030086386654752, + "grad_norm": 11.772491455078125, + "learning_rate": 4.945109269659776e-05, + "loss": 0.6075, + "num_input_tokens_seen": 12472808, + "step": 21505 + }, + { + "epoch": 3.2037533512064345, + "grad_norm": 9.013927459716797, + "learning_rate": 4.945041531105505e-05, + "loss": 0.4757, + "num_input_tokens_seen": 12475880, + "step": 21510 + }, + { + "epoch": 3.2044980637473937, + "grad_norm": 10.074678421020508, + "learning_rate": 4.9449737512447435e-05, + "loss": 0.3891, + "num_input_tokens_seen": 12478696, + "step": 21515 + }, + { + "epoch": 3.205242776288353, + "grad_norm": 5.252894401550293, + "learning_rate": 4.9449059300786355e-05, + "loss": 0.3041, + "num_input_tokens_seen": 12481672, + "step": 21520 + }, + { + "epoch": 3.205987488829312, + "grad_norm": 2.101978063583374, + "learning_rate": 4.944838067608326e-05, + "loss": 0.4352, + "num_input_tokens_seen": 12484584, + "step": 21525 + }, + { + "epoch": 3.2067322013702713, + "grad_norm": 10.604639053344727, + "learning_rate": 4.944770163834963e-05, + "loss": 0.4286, + "num_input_tokens_seen": 12487080, + "step": 21530 + }, + { + "epoch": 3.2074769139112305, + "grad_norm": 10.283514022827148, + "learning_rate": 4.944702218759692e-05, + "loss": 0.3699, + "num_input_tokens_seen": 12489864, + "step": 21535 + }, + { + "epoch": 3.2082216264521897, + "grad_norm": 18.313077926635742, + "learning_rate": 4.944634232383662e-05, + "loss": 0.4731, + "num_input_tokens_seen": 12493192, + "step": 21540 + }, + { + "epoch": 3.2089663389931484, + "grad_norm": 12.992506980895996, + "learning_rate": 4.944566204708022e-05, + "loss": 0.312, + "num_input_tokens_seen": 12495816, + "step": 21545 + }, + { + "epoch": 3.2097110515341076, + "grad_norm": 15.150280952453613, + "learning_rate": 4.94449813573392e-05, + "loss": 0.3507, + "num_input_tokens_seen": 12498824, + "step": 21550 + }, + { + "epoch": 3.210455764075067, + "grad_norm": 6.880227565765381, + "learning_rate": 4.944430025462507e-05, + "loss": 0.2548, + "num_input_tokens_seen": 12501576, + "step": 21555 + }, + { + "epoch": 3.211200476616026, + "grad_norm": 6.857739448547363, + "learning_rate": 4.944361873894932e-05, + "loss": 0.4547, + "num_input_tokens_seen": 12504488, + "step": 21560 + }, + { + "epoch": 3.2119451891569852, + "grad_norm": 28.41140365600586, + "learning_rate": 4.944293681032348e-05, + "loss": 0.5232, + "num_input_tokens_seen": 12507272, + "step": 21565 + }, + { + "epoch": 3.2126899016979444, + "grad_norm": 14.38928508758545, + "learning_rate": 4.9442254468759065e-05, + "loss": 0.573, + "num_input_tokens_seen": 12509928, + "step": 21570 + }, + { + "epoch": 3.2134346142389036, + "grad_norm": 8.455419540405273, + "learning_rate": 4.94415717142676e-05, + "loss": 0.4131, + "num_input_tokens_seen": 12512712, + "step": 21575 + }, + { + "epoch": 3.214179326779863, + "grad_norm": 18.064775466918945, + "learning_rate": 4.944088854686062e-05, + "loss": 0.56, + "num_input_tokens_seen": 12515656, + "step": 21580 + }, + { + "epoch": 3.214924039320822, + "grad_norm": 5.533239364624023, + "learning_rate": 4.944020496654968e-05, + "loss": 0.4969, + "num_input_tokens_seen": 12518632, + "step": 21585 + }, + { + "epoch": 3.2156687518617812, + "grad_norm": 11.12833023071289, + "learning_rate": 4.943952097334631e-05, + "loss": 0.437, + "num_input_tokens_seen": 12521640, + "step": 21590 + }, + { + "epoch": 3.2164134644027405, + "grad_norm": 5.615597724914551, + "learning_rate": 4.943883656726207e-05, + "loss": 0.4276, + "num_input_tokens_seen": 12524520, + "step": 21595 + }, + { + "epoch": 3.2171581769436997, + "grad_norm": 2.900033950805664, + "learning_rate": 4.943815174830853e-05, + "loss": 0.2572, + "num_input_tokens_seen": 12527272, + "step": 21600 + }, + { + "epoch": 3.217902889484659, + "grad_norm": 13.611534118652344, + "learning_rate": 4.9437466516497255e-05, + "loss": 0.3739, + "num_input_tokens_seen": 12530248, + "step": 21605 + }, + { + "epoch": 3.218647602025618, + "grad_norm": 15.096978187561035, + "learning_rate": 4.943678087183982e-05, + "loss": 0.4105, + "num_input_tokens_seen": 12532968, + "step": 21610 + }, + { + "epoch": 3.2193923145665773, + "grad_norm": 2.975264072418213, + "learning_rate": 4.94360948143478e-05, + "loss": 0.2656, + "num_input_tokens_seen": 12535624, + "step": 21615 + }, + { + "epoch": 3.2201370271075365, + "grad_norm": 20.041667938232422, + "learning_rate": 4.94354083440328e-05, + "loss": 0.3624, + "num_input_tokens_seen": 12538632, + "step": 21620 + }, + { + "epoch": 3.2208817396484957, + "grad_norm": 3.035179376602173, + "learning_rate": 4.9434721460906406e-05, + "loss": 0.2837, + "num_input_tokens_seen": 12541576, + "step": 21625 + }, + { + "epoch": 3.221626452189455, + "grad_norm": 6.047388076782227, + "learning_rate": 4.9434034164980233e-05, + "loss": 0.2791, + "num_input_tokens_seen": 12544584, + "step": 21630 + }, + { + "epoch": 3.222371164730414, + "grad_norm": 7.068876266479492, + "learning_rate": 4.94333464562659e-05, + "loss": 0.2724, + "num_input_tokens_seen": 12547400, + "step": 21635 + }, + { + "epoch": 3.2231158772713733, + "grad_norm": 32.30796813964844, + "learning_rate": 4.9432658334774984e-05, + "loss": 0.2927, + "num_input_tokens_seen": 12550152, + "step": 21640 + }, + { + "epoch": 3.2238605898123325, + "grad_norm": 11.106694221496582, + "learning_rate": 4.943196980051915e-05, + "loss": 0.3401, + "num_input_tokens_seen": 12553128, + "step": 21645 + }, + { + "epoch": 3.2246053023532917, + "grad_norm": 1.4908047914505005, + "learning_rate": 4.943128085351002e-05, + "loss": 0.4141, + "num_input_tokens_seen": 12556168, + "step": 21650 + }, + { + "epoch": 3.225350014894251, + "grad_norm": 6.94012451171875, + "learning_rate": 4.943059149375923e-05, + "loss": 0.38, + "num_input_tokens_seen": 12559208, + "step": 21655 + }, + { + "epoch": 3.22609472743521, + "grad_norm": 20.557518005371094, + "learning_rate": 4.9429901721278426e-05, + "loss": 0.4829, + "num_input_tokens_seen": 12562440, + "step": 21660 + }, + { + "epoch": 3.2268394399761693, + "grad_norm": 1.2041765451431274, + "learning_rate": 4.9429211536079266e-05, + "loss": 0.3385, + "num_input_tokens_seen": 12565608, + "step": 21665 + }, + { + "epoch": 3.2275841525171285, + "grad_norm": 22.143821716308594, + "learning_rate": 4.94285209381734e-05, + "loss": 0.6091, + "num_input_tokens_seen": 12568584, + "step": 21670 + }, + { + "epoch": 3.2283288650580877, + "grad_norm": 19.429119110107422, + "learning_rate": 4.94278299275725e-05, + "loss": 0.4377, + "num_input_tokens_seen": 12571464, + "step": 21675 + }, + { + "epoch": 3.229073577599047, + "grad_norm": 19.538511276245117, + "learning_rate": 4.9427138504288245e-05, + "loss": 0.2719, + "num_input_tokens_seen": 12574696, + "step": 21680 + }, + { + "epoch": 3.229818290140006, + "grad_norm": 8.471174240112305, + "learning_rate": 4.942644666833231e-05, + "loss": 0.3523, + "num_input_tokens_seen": 12577608, + "step": 21685 + }, + { + "epoch": 3.2305630026809653, + "grad_norm": 13.998016357421875, + "learning_rate": 4.9425754419716383e-05, + "loss": 0.3602, + "num_input_tokens_seen": 12580680, + "step": 21690 + }, + { + "epoch": 3.2313077152219245, + "grad_norm": 24.096458435058594, + "learning_rate": 4.942506175845216e-05, + "loss": 0.4967, + "num_input_tokens_seen": 12583464, + "step": 21695 + }, + { + "epoch": 3.2320524277628837, + "grad_norm": 2.729755401611328, + "learning_rate": 4.9424368684551347e-05, + "loss": 0.3835, + "num_input_tokens_seen": 12586408, + "step": 21700 + }, + { + "epoch": 3.232797140303843, + "grad_norm": 14.30588150024414, + "learning_rate": 4.942367519802565e-05, + "loss": 0.3965, + "num_input_tokens_seen": 12589224, + "step": 21705 + }, + { + "epoch": 3.233541852844802, + "grad_norm": 28.822309494018555, + "learning_rate": 4.9422981298886776e-05, + "loss": 0.4014, + "num_input_tokens_seen": 12592232, + "step": 21710 + }, + { + "epoch": 3.2342865653857613, + "grad_norm": 0.8896400332450867, + "learning_rate": 4.942228698714646e-05, + "loss": 0.2193, + "num_input_tokens_seen": 12594888, + "step": 21715 + }, + { + "epoch": 3.23503127792672, + "grad_norm": 6.934640407562256, + "learning_rate": 4.942159226281643e-05, + "loss": 0.2559, + "num_input_tokens_seen": 12597800, + "step": 21720 + }, + { + "epoch": 3.2357759904676793, + "grad_norm": 14.353257179260254, + "learning_rate": 4.942089712590842e-05, + "loss": 0.409, + "num_input_tokens_seen": 12600936, + "step": 21725 + }, + { + "epoch": 3.2365207030086385, + "grad_norm": 6.525176525115967, + "learning_rate": 4.9420201576434165e-05, + "loss": 0.341, + "num_input_tokens_seen": 12603912, + "step": 21730 + }, + { + "epoch": 3.2372654155495977, + "grad_norm": 15.088266372680664, + "learning_rate": 4.941950561440543e-05, + "loss": 0.3224, + "num_input_tokens_seen": 12606984, + "step": 21735 + }, + { + "epoch": 3.238010128090557, + "grad_norm": 18.663745880126953, + "learning_rate": 4.9418809239833964e-05, + "loss": 0.4652, + "num_input_tokens_seen": 12610024, + "step": 21740 + }, + { + "epoch": 3.238754840631516, + "grad_norm": 23.536672592163086, + "learning_rate": 4.9418112452731534e-05, + "loss": 0.5038, + "num_input_tokens_seen": 12612840, + "step": 21745 + }, + { + "epoch": 3.2394995531724753, + "grad_norm": 9.207903861999512, + "learning_rate": 4.941741525310991e-05, + "loss": 0.3001, + "num_input_tokens_seen": 12615592, + "step": 21750 + }, + { + "epoch": 3.2402442657134345, + "grad_norm": 12.57665729522705, + "learning_rate": 4.9416717640980884e-05, + "loss": 0.4211, + "num_input_tokens_seen": 12618696, + "step": 21755 + }, + { + "epoch": 3.2409889782543937, + "grad_norm": 14.571390151977539, + "learning_rate": 4.941601961635621e-05, + "loss": 0.4912, + "num_input_tokens_seen": 12621448, + "step": 21760 + }, + { + "epoch": 3.241733690795353, + "grad_norm": 5.06542444229126, + "learning_rate": 4.941532117924772e-05, + "loss": 0.4972, + "num_input_tokens_seen": 12624168, + "step": 21765 + }, + { + "epoch": 3.242478403336312, + "grad_norm": 0.6479766964912415, + "learning_rate": 4.941462232966718e-05, + "loss": 0.2102, + "num_input_tokens_seen": 12626952, + "step": 21770 + }, + { + "epoch": 3.2432231158772713, + "grad_norm": 6.561645030975342, + "learning_rate": 4.9413923067626413e-05, + "loss": 0.3704, + "num_input_tokens_seen": 12630056, + "step": 21775 + }, + { + "epoch": 3.2439678284182305, + "grad_norm": 2.6624674797058105, + "learning_rate": 4.941322339313723e-05, + "loss": 0.296, + "num_input_tokens_seen": 12633512, + "step": 21780 + }, + { + "epoch": 3.2447125409591897, + "grad_norm": 5.627182960510254, + "learning_rate": 4.941252330621145e-05, + "loss": 0.4097, + "num_input_tokens_seen": 12636520, + "step": 21785 + }, + { + "epoch": 3.245457253500149, + "grad_norm": 2.011199474334717, + "learning_rate": 4.94118228068609e-05, + "loss": 0.194, + "num_input_tokens_seen": 12639528, + "step": 21790 + }, + { + "epoch": 3.246201966041108, + "grad_norm": 4.203256130218506, + "learning_rate": 4.9411121895097414e-05, + "loss": 0.3284, + "num_input_tokens_seen": 12642408, + "step": 21795 + }, + { + "epoch": 3.2469466785820673, + "grad_norm": 4.922631740570068, + "learning_rate": 4.941042057093284e-05, + "loss": 0.2723, + "num_input_tokens_seen": 12645352, + "step": 21800 + }, + { + "epoch": 3.2476913911230265, + "grad_norm": 17.034889221191406, + "learning_rate": 4.940971883437901e-05, + "loss": 0.1858, + "num_input_tokens_seen": 12648392, + "step": 21805 + }, + { + "epoch": 3.2484361036639857, + "grad_norm": 17.513647079467773, + "learning_rate": 4.94090166854478e-05, + "loss": 0.4545, + "num_input_tokens_seen": 12651432, + "step": 21810 + }, + { + "epoch": 3.249180816204945, + "grad_norm": 4.978719711303711, + "learning_rate": 4.940831412415105e-05, + "loss": 0.8068, + "num_input_tokens_seen": 12654184, + "step": 21815 + }, + { + "epoch": 3.249925528745904, + "grad_norm": 19.61102294921875, + "learning_rate": 4.9407611150500646e-05, + "loss": 0.6215, + "num_input_tokens_seen": 12656872, + "step": 21820 + }, + { + "epoch": 3.2506702412868633, + "grad_norm": 32.03522491455078, + "learning_rate": 4.940690776450846e-05, + "loss": 0.7535, + "num_input_tokens_seen": 12659912, + "step": 21825 + }, + { + "epoch": 3.2514149538278225, + "grad_norm": 9.841414451599121, + "learning_rate": 4.940620396618637e-05, + "loss": 0.4467, + "num_input_tokens_seen": 12662600, + "step": 21830 + }, + { + "epoch": 3.2521596663687817, + "grad_norm": 8.58845043182373, + "learning_rate": 4.940549975554627e-05, + "loss": 0.5052, + "num_input_tokens_seen": 12665736, + "step": 21835 + }, + { + "epoch": 3.252904378909741, + "grad_norm": 42.88116455078125, + "learning_rate": 4.940479513260006e-05, + "loss": 0.403, + "num_input_tokens_seen": 12668616, + "step": 21840 + }, + { + "epoch": 3.2536490914507, + "grad_norm": 47.409549713134766, + "learning_rate": 4.940409009735964e-05, + "loss": 0.6306, + "num_input_tokens_seen": 12671656, + "step": 21845 + }, + { + "epoch": 3.2543938039916593, + "grad_norm": 13.440253257751465, + "learning_rate": 4.940338464983691e-05, + "loss": 0.2869, + "num_input_tokens_seen": 12674504, + "step": 21850 + }, + { + "epoch": 3.2551385165326185, + "grad_norm": 11.536355972290039, + "learning_rate": 4.940267879004381e-05, + "loss": 0.2519, + "num_input_tokens_seen": 12677480, + "step": 21855 + }, + { + "epoch": 3.2558832290735777, + "grad_norm": 11.86855697631836, + "learning_rate": 4.9401972517992254e-05, + "loss": 0.3165, + "num_input_tokens_seen": 12680072, + "step": 21860 + }, + { + "epoch": 3.256627941614537, + "grad_norm": 26.715978622436523, + "learning_rate": 4.9401265833694166e-05, + "loss": 0.3959, + "num_input_tokens_seen": 12683176, + "step": 21865 + }, + { + "epoch": 3.257372654155496, + "grad_norm": 16.09465980529785, + "learning_rate": 4.940055873716149e-05, + "loss": 0.4066, + "num_input_tokens_seen": 12686216, + "step": 21870 + }, + { + "epoch": 3.2581173666964554, + "grad_norm": 10.006288528442383, + "learning_rate": 4.939985122840619e-05, + "loss": 0.2461, + "num_input_tokens_seen": 12689064, + "step": 21875 + }, + { + "epoch": 3.2588620792374146, + "grad_norm": 9.00341796875, + "learning_rate": 4.939914330744019e-05, + "loss": 0.2805, + "num_input_tokens_seen": 12692328, + "step": 21880 + }, + { + "epoch": 3.2596067917783733, + "grad_norm": 6.4401140213012695, + "learning_rate": 4.939843497427547e-05, + "loss": 0.3645, + "num_input_tokens_seen": 12695528, + "step": 21885 + }, + { + "epoch": 3.260351504319333, + "grad_norm": 30.469079971313477, + "learning_rate": 4.939772622892398e-05, + "loss": 0.6649, + "num_input_tokens_seen": 12698600, + "step": 21890 + }, + { + "epoch": 3.2610962168602917, + "grad_norm": 7.538539886474609, + "learning_rate": 4.93970170713977e-05, + "loss": 0.6304, + "num_input_tokens_seen": 12701448, + "step": 21895 + }, + { + "epoch": 3.2618409294012514, + "grad_norm": 15.220974922180176, + "learning_rate": 4.9396307501708625e-05, + "loss": 0.55, + "num_input_tokens_seen": 12704200, + "step": 21900 + }, + { + "epoch": 3.26258564194221, + "grad_norm": 5.6053786277771, + "learning_rate": 4.939559751986872e-05, + "loss": 0.3853, + "num_input_tokens_seen": 12706984, + "step": 21905 + }, + { + "epoch": 3.2633303544831693, + "grad_norm": 4.002363204956055, + "learning_rate": 4.939488712588999e-05, + "loss": 0.3774, + "num_input_tokens_seen": 12709960, + "step": 21910 + }, + { + "epoch": 3.2640750670241285, + "grad_norm": 10.72594165802002, + "learning_rate": 4.939417631978444e-05, + "loss": 0.4685, + "num_input_tokens_seen": 12712744, + "step": 21915 + }, + { + "epoch": 3.2648197795650877, + "grad_norm": 13.833556175231934, + "learning_rate": 4.939346510156407e-05, + "loss": 0.646, + "num_input_tokens_seen": 12715656, + "step": 21920 + }, + { + "epoch": 3.265564492106047, + "grad_norm": 24.402759552001953, + "learning_rate": 4.93927534712409e-05, + "loss": 0.4056, + "num_input_tokens_seen": 12718504, + "step": 21925 + }, + { + "epoch": 3.266309204647006, + "grad_norm": 9.27002239227295, + "learning_rate": 4.939204142882696e-05, + "loss": 0.2677, + "num_input_tokens_seen": 12721384, + "step": 21930 + }, + { + "epoch": 3.2670539171879653, + "grad_norm": 13.045899391174316, + "learning_rate": 4.939132897433426e-05, + "loss": 0.3411, + "num_input_tokens_seen": 12724424, + "step": 21935 + }, + { + "epoch": 3.2677986297289245, + "grad_norm": 7.058811664581299, + "learning_rate": 4.939061610777486e-05, + "loss": 0.2734, + "num_input_tokens_seen": 12727624, + "step": 21940 + }, + { + "epoch": 3.2685433422698837, + "grad_norm": 17.149934768676758, + "learning_rate": 4.938990282916078e-05, + "loss": 0.2772, + "num_input_tokens_seen": 12730376, + "step": 21945 + }, + { + "epoch": 3.269288054810843, + "grad_norm": 9.352054595947266, + "learning_rate": 4.938918913850408e-05, + "loss": 0.3719, + "num_input_tokens_seen": 12733128, + "step": 21950 + }, + { + "epoch": 3.270032767351802, + "grad_norm": 3.1037232875823975, + "learning_rate": 4.938847503581682e-05, + "loss": 0.3322, + "num_input_tokens_seen": 12735912, + "step": 21955 + }, + { + "epoch": 3.2707774798927614, + "grad_norm": 27.412729263305664, + "learning_rate": 4.938776052111106e-05, + "loss": 0.6535, + "num_input_tokens_seen": 12738664, + "step": 21960 + }, + { + "epoch": 3.2715221924337206, + "grad_norm": 21.30345344543457, + "learning_rate": 4.9387045594398875e-05, + "loss": 0.513, + "num_input_tokens_seen": 12741384, + "step": 21965 + }, + { + "epoch": 3.2722669049746798, + "grad_norm": 10.970098495483398, + "learning_rate": 4.9386330255692346e-05, + "loss": 0.4338, + "num_input_tokens_seen": 12743976, + "step": 21970 + }, + { + "epoch": 3.273011617515639, + "grad_norm": 15.966541290283203, + "learning_rate": 4.938561450500354e-05, + "loss": 0.2864, + "num_input_tokens_seen": 12747016, + "step": 21975 + }, + { + "epoch": 3.273756330056598, + "grad_norm": 18.829164505004883, + "learning_rate": 4.938489834234457e-05, + "loss": 0.2982, + "num_input_tokens_seen": 12749960, + "step": 21980 + }, + { + "epoch": 3.2745010425975574, + "grad_norm": 12.635682106018066, + "learning_rate": 4.9384181767727524e-05, + "loss": 0.3556, + "num_input_tokens_seen": 12752808, + "step": 21985 + }, + { + "epoch": 3.2752457551385166, + "grad_norm": 14.797165870666504, + "learning_rate": 4.9383464781164515e-05, + "loss": 0.3935, + "num_input_tokens_seen": 12755656, + "step": 21990 + }, + { + "epoch": 3.2759904676794758, + "grad_norm": 10.510467529296875, + "learning_rate": 4.938274738266764e-05, + "loss": 0.3972, + "num_input_tokens_seen": 12758952, + "step": 21995 + }, + { + "epoch": 3.276735180220435, + "grad_norm": 2.3692781925201416, + "learning_rate": 4.938202957224903e-05, + "loss": 0.228, + "num_input_tokens_seen": 12761640, + "step": 22000 + }, + { + "epoch": 3.277479892761394, + "grad_norm": 3.901393413543701, + "learning_rate": 4.938131134992082e-05, + "loss": 0.3151, + "num_input_tokens_seen": 12764328, + "step": 22005 + }, + { + "epoch": 3.2782246053023534, + "grad_norm": 12.81942367553711, + "learning_rate": 4.938059271569513e-05, + "loss": 0.4714, + "num_input_tokens_seen": 12767400, + "step": 22010 + }, + { + "epoch": 3.2789693178433126, + "grad_norm": 0.9784919619560242, + "learning_rate": 4.937987366958411e-05, + "loss": 0.4505, + "num_input_tokens_seen": 12769960, + "step": 22015 + }, + { + "epoch": 3.279714030384272, + "grad_norm": 22.957477569580078, + "learning_rate": 4.93791542115999e-05, + "loss": 0.5879, + "num_input_tokens_seen": 12773288, + "step": 22020 + }, + { + "epoch": 3.280458742925231, + "grad_norm": 7.4032392501831055, + "learning_rate": 4.937843434175466e-05, + "loss": 0.5182, + "num_input_tokens_seen": 12776040, + "step": 22025 + }, + { + "epoch": 3.28120345546619, + "grad_norm": 12.992008209228516, + "learning_rate": 4.937771406006054e-05, + "loss": 0.4137, + "num_input_tokens_seen": 12779016, + "step": 22030 + }, + { + "epoch": 3.2819481680071494, + "grad_norm": 7.897449016571045, + "learning_rate": 4.937699336652973e-05, + "loss": 0.362, + "num_input_tokens_seen": 12782216, + "step": 22035 + }, + { + "epoch": 3.2826928805481086, + "grad_norm": 11.30048942565918, + "learning_rate": 4.937627226117438e-05, + "loss": 0.4793, + "num_input_tokens_seen": 12785032, + "step": 22040 + }, + { + "epoch": 3.283437593089068, + "grad_norm": 14.566926002502441, + "learning_rate": 4.9375550744006695e-05, + "loss": 0.5587, + "num_input_tokens_seen": 12787784, + "step": 22045 + }, + { + "epoch": 3.284182305630027, + "grad_norm": 22.40081024169922, + "learning_rate": 4.9374828815038856e-05, + "loss": 0.4825, + "num_input_tokens_seen": 12790728, + "step": 22050 + }, + { + "epoch": 3.284927018170986, + "grad_norm": 12.525007247924805, + "learning_rate": 4.937410647428304e-05, + "loss": 0.4697, + "num_input_tokens_seen": 12793544, + "step": 22055 + }, + { + "epoch": 3.285671730711945, + "grad_norm": 34.525814056396484, + "learning_rate": 4.9373383721751486e-05, + "loss": 0.4694, + "num_input_tokens_seen": 12796520, + "step": 22060 + }, + { + "epoch": 3.2864164432529046, + "grad_norm": 9.812824249267578, + "learning_rate": 4.9372660557456384e-05, + "loss": 0.3003, + "num_input_tokens_seen": 12799432, + "step": 22065 + }, + { + "epoch": 3.2871611557938634, + "grad_norm": 29.627317428588867, + "learning_rate": 4.937193698140995e-05, + "loss": 0.5129, + "num_input_tokens_seen": 12802696, + "step": 22070 + }, + { + "epoch": 3.2879058683348226, + "grad_norm": 2.5975184440612793, + "learning_rate": 4.9371212993624405e-05, + "loss": 0.2804, + "num_input_tokens_seen": 12805800, + "step": 22075 + }, + { + "epoch": 3.2886505808757818, + "grad_norm": 4.30350399017334, + "learning_rate": 4.9370488594112e-05, + "loss": 0.347, + "num_input_tokens_seen": 12808648, + "step": 22080 + }, + { + "epoch": 3.289395293416741, + "grad_norm": 35.2178840637207, + "learning_rate": 4.936976378288495e-05, + "loss": 0.3688, + "num_input_tokens_seen": 12811784, + "step": 22085 + }, + { + "epoch": 3.2901400059577, + "grad_norm": 4.088438510894775, + "learning_rate": 4.93690385599555e-05, + "loss": 0.4289, + "num_input_tokens_seen": 12814792, + "step": 22090 + }, + { + "epoch": 3.2908847184986594, + "grad_norm": 30.125816345214844, + "learning_rate": 4.9368312925335925e-05, + "loss": 0.632, + "num_input_tokens_seen": 12817448, + "step": 22095 + }, + { + "epoch": 3.2916294310396186, + "grad_norm": 10.111601829528809, + "learning_rate": 4.9367586879038466e-05, + "loss": 0.517, + "num_input_tokens_seen": 12820232, + "step": 22100 + }, + { + "epoch": 3.292374143580578, + "grad_norm": 7.602862358093262, + "learning_rate": 4.93668604210754e-05, + "loss": 0.4249, + "num_input_tokens_seen": 12823272, + "step": 22105 + }, + { + "epoch": 3.293118856121537, + "grad_norm": 10.336459159851074, + "learning_rate": 4.936613355145898e-05, + "loss": 0.4775, + "num_input_tokens_seen": 12826280, + "step": 22110 + }, + { + "epoch": 3.293863568662496, + "grad_norm": 8.754937171936035, + "learning_rate": 4.936540627020151e-05, + "loss": 0.3191, + "num_input_tokens_seen": 12829096, + "step": 22115 + }, + { + "epoch": 3.2946082812034554, + "grad_norm": 1.909999132156372, + "learning_rate": 4.936467857731526e-05, + "loss": 0.2789, + "num_input_tokens_seen": 12831912, + "step": 22120 + }, + { + "epoch": 3.2953529937444146, + "grad_norm": 16.154409408569336, + "learning_rate": 4.9363950472812524e-05, + "loss": 0.4025, + "num_input_tokens_seen": 12834536, + "step": 22125 + }, + { + "epoch": 3.296097706285374, + "grad_norm": 12.23879623413086, + "learning_rate": 4.936322195670561e-05, + "loss": 0.3987, + "num_input_tokens_seen": 12837288, + "step": 22130 + }, + { + "epoch": 3.296842418826333, + "grad_norm": 21.66054916381836, + "learning_rate": 4.936249302900682e-05, + "loss": 0.6484, + "num_input_tokens_seen": 12840392, + "step": 22135 + }, + { + "epoch": 3.297587131367292, + "grad_norm": 3.2876791954040527, + "learning_rate": 4.936176368972848e-05, + "loss": 0.2907, + "num_input_tokens_seen": 12843304, + "step": 22140 + }, + { + "epoch": 3.2983318439082514, + "grad_norm": 4.012237548828125, + "learning_rate": 4.93610339388829e-05, + "loss": 0.312, + "num_input_tokens_seen": 12846184, + "step": 22145 + }, + { + "epoch": 3.2990765564492106, + "grad_norm": 22.034866333007812, + "learning_rate": 4.936030377648241e-05, + "loss": 0.4742, + "num_input_tokens_seen": 12848968, + "step": 22150 + }, + { + "epoch": 3.29982126899017, + "grad_norm": 5.345856189727783, + "learning_rate": 4.935957320253934e-05, + "loss": 0.2441, + "num_input_tokens_seen": 12852264, + "step": 22155 + }, + { + "epoch": 3.300565981531129, + "grad_norm": 17.18603515625, + "learning_rate": 4.9358842217066044e-05, + "loss": 0.219, + "num_input_tokens_seen": 12855048, + "step": 22160 + }, + { + "epoch": 3.301310694072088, + "grad_norm": 12.029886245727539, + "learning_rate": 4.935811082007487e-05, + "loss": 0.4651, + "num_input_tokens_seen": 12857864, + "step": 22165 + }, + { + "epoch": 3.3020554066130474, + "grad_norm": 3.164698839187622, + "learning_rate": 4.935737901157816e-05, + "loss": 0.3018, + "num_input_tokens_seen": 12860936, + "step": 22170 + }, + { + "epoch": 3.3028001191540066, + "grad_norm": 15.227065086364746, + "learning_rate": 4.935664679158829e-05, + "loss": 0.4396, + "num_input_tokens_seen": 12864008, + "step": 22175 + }, + { + "epoch": 3.303544831694966, + "grad_norm": 6.860264778137207, + "learning_rate": 4.935591416011763e-05, + "loss": 0.5132, + "num_input_tokens_seen": 12866760, + "step": 22180 + }, + { + "epoch": 3.304289544235925, + "grad_norm": 10.447898864746094, + "learning_rate": 4.9355181117178564e-05, + "loss": 0.465, + "num_input_tokens_seen": 12869576, + "step": 22185 + }, + { + "epoch": 3.3050342567768842, + "grad_norm": 2.7613918781280518, + "learning_rate": 4.935444766278345e-05, + "loss": 0.3114, + "num_input_tokens_seen": 12872648, + "step": 22190 + }, + { + "epoch": 3.3057789693178434, + "grad_norm": 15.841456413269043, + "learning_rate": 4.93537137969447e-05, + "loss": 0.4849, + "num_input_tokens_seen": 12875336, + "step": 22195 + }, + { + "epoch": 3.3065236818588026, + "grad_norm": 10.820671081542969, + "learning_rate": 4.935297951967471e-05, + "loss": 0.4408, + "num_input_tokens_seen": 12878088, + "step": 22200 + }, + { + "epoch": 3.307268394399762, + "grad_norm": 9.729879379272461, + "learning_rate": 4.9352244830985886e-05, + "loss": 0.5054, + "num_input_tokens_seen": 12880936, + "step": 22205 + }, + { + "epoch": 3.308013106940721, + "grad_norm": 20.078882217407227, + "learning_rate": 4.935150973089063e-05, + "loss": 0.5457, + "num_input_tokens_seen": 12883656, + "step": 22210 + }, + { + "epoch": 3.3087578194816802, + "grad_norm": 14.47400951385498, + "learning_rate": 4.935077421940137e-05, + "loss": 0.5543, + "num_input_tokens_seen": 12886568, + "step": 22215 + }, + { + "epoch": 3.3095025320226394, + "grad_norm": 7.123687744140625, + "learning_rate": 4.935003829653053e-05, + "loss": 0.4311, + "num_input_tokens_seen": 12889896, + "step": 22220 + }, + { + "epoch": 3.310247244563598, + "grad_norm": 7.724090099334717, + "learning_rate": 4.934930196229054e-05, + "loss": 0.4033, + "num_input_tokens_seen": 12892680, + "step": 22225 + }, + { + "epoch": 3.310991957104558, + "grad_norm": 8.28972339630127, + "learning_rate": 4.9348565216693845e-05, + "loss": 0.336, + "num_input_tokens_seen": 12895688, + "step": 22230 + }, + { + "epoch": 3.3117366696455166, + "grad_norm": 4.91711950302124, + "learning_rate": 4.9347828059752874e-05, + "loss": 0.3207, + "num_input_tokens_seen": 12898376, + "step": 22235 + }, + { + "epoch": 3.3124813821864763, + "grad_norm": 6.616153240203857, + "learning_rate": 4.934709049148011e-05, + "loss": 0.4005, + "num_input_tokens_seen": 12901384, + "step": 22240 + }, + { + "epoch": 3.313226094727435, + "grad_norm": 33.74029541015625, + "learning_rate": 4.934635251188799e-05, + "loss": 0.1765, + "num_input_tokens_seen": 12904200, + "step": 22245 + }, + { + "epoch": 3.313970807268394, + "grad_norm": 12.380607604980469, + "learning_rate": 4.934561412098899e-05, + "loss": 0.1647, + "num_input_tokens_seen": 12907304, + "step": 22250 + }, + { + "epoch": 3.3147155198093534, + "grad_norm": 31.76372528076172, + "learning_rate": 4.934487531879558e-05, + "loss": 0.5805, + "num_input_tokens_seen": 12910056, + "step": 22255 + }, + { + "epoch": 3.3154602323503126, + "grad_norm": 3.7080674171447754, + "learning_rate": 4.934413610532025e-05, + "loss": 0.1603, + "num_input_tokens_seen": 12912936, + "step": 22260 + }, + { + "epoch": 3.316204944891272, + "grad_norm": 16.97675323486328, + "learning_rate": 4.9343396480575474e-05, + "loss": 0.7719, + "num_input_tokens_seen": 12915880, + "step": 22265 + }, + { + "epoch": 3.316949657432231, + "grad_norm": 5.304111480712891, + "learning_rate": 4.9342656444573764e-05, + "loss": 0.2971, + "num_input_tokens_seen": 12918760, + "step": 22270 + }, + { + "epoch": 3.3176943699731902, + "grad_norm": 7.975998878479004, + "learning_rate": 4.934191599732762e-05, + "loss": 0.5499, + "num_input_tokens_seen": 12921672, + "step": 22275 + }, + { + "epoch": 3.3184390825141494, + "grad_norm": 9.080615043640137, + "learning_rate": 4.934117513884953e-05, + "loss": 0.2818, + "num_input_tokens_seen": 12924648, + "step": 22280 + }, + { + "epoch": 3.3191837950551086, + "grad_norm": 14.28605842590332, + "learning_rate": 4.934043386915203e-05, + "loss": 0.7064, + "num_input_tokens_seen": 12927560, + "step": 22285 + }, + { + "epoch": 3.319928507596068, + "grad_norm": 13.916600227355957, + "learning_rate": 4.933969218824764e-05, + "loss": 0.5005, + "num_input_tokens_seen": 12930760, + "step": 22290 + }, + { + "epoch": 3.320673220137027, + "grad_norm": 3.7889299392700195, + "learning_rate": 4.933895009614889e-05, + "loss": 0.3133, + "num_input_tokens_seen": 12933576, + "step": 22295 + }, + { + "epoch": 3.3214179326779862, + "grad_norm": 16.222389221191406, + "learning_rate": 4.933820759286831e-05, + "loss": 0.6285, + "num_input_tokens_seen": 12936808, + "step": 22300 + }, + { + "epoch": 3.3221626452189454, + "grad_norm": 15.927227020263672, + "learning_rate": 4.933746467841846e-05, + "loss": 0.3286, + "num_input_tokens_seen": 12939464, + "step": 22305 + }, + { + "epoch": 3.3229073577599046, + "grad_norm": 6.413403034210205, + "learning_rate": 4.9336721352811864e-05, + "loss": 0.3407, + "num_input_tokens_seen": 12942472, + "step": 22310 + }, + { + "epoch": 3.323652070300864, + "grad_norm": 4.8679280281066895, + "learning_rate": 4.933597761606111e-05, + "loss": 0.4116, + "num_input_tokens_seen": 12945512, + "step": 22315 + }, + { + "epoch": 3.324396782841823, + "grad_norm": 11.431103706359863, + "learning_rate": 4.9335233468178744e-05, + "loss": 0.5383, + "num_input_tokens_seen": 12948424, + "step": 22320 + }, + { + "epoch": 3.3251414953827823, + "grad_norm": 6.260740756988525, + "learning_rate": 4.9334488909177336e-05, + "loss": 0.4669, + "num_input_tokens_seen": 12951144, + "step": 22325 + }, + { + "epoch": 3.3258862079237415, + "grad_norm": 5.3165388107299805, + "learning_rate": 4.9333743939069476e-05, + "loss": 0.5346, + "num_input_tokens_seen": 12954280, + "step": 22330 + }, + { + "epoch": 3.3266309204647007, + "grad_norm": 11.235745429992676, + "learning_rate": 4.9332998557867735e-05, + "loss": 0.2686, + "num_input_tokens_seen": 12957320, + "step": 22335 + }, + { + "epoch": 3.32737563300566, + "grad_norm": 26.46112632751465, + "learning_rate": 4.933225276558473e-05, + "loss": 0.4177, + "num_input_tokens_seen": 12960136, + "step": 22340 + }, + { + "epoch": 3.328120345546619, + "grad_norm": 8.178186416625977, + "learning_rate": 4.933150656223303e-05, + "loss": 0.5351, + "num_input_tokens_seen": 12962888, + "step": 22345 + }, + { + "epoch": 3.3288650580875783, + "grad_norm": 27.7529354095459, + "learning_rate": 4.933075994782527e-05, + "loss": 0.6041, + "num_input_tokens_seen": 12965608, + "step": 22350 + }, + { + "epoch": 3.3296097706285375, + "grad_norm": 5.595302581787109, + "learning_rate": 4.933001292237404e-05, + "loss": 0.3118, + "num_input_tokens_seen": 12968840, + "step": 22355 + }, + { + "epoch": 3.3303544831694967, + "grad_norm": 11.334243774414062, + "learning_rate": 4.9329265485891966e-05, + "loss": 0.3543, + "num_input_tokens_seen": 12971496, + "step": 22360 + }, + { + "epoch": 3.331099195710456, + "grad_norm": 16.296226501464844, + "learning_rate": 4.9328517638391684e-05, + "loss": 0.3938, + "num_input_tokens_seen": 12974280, + "step": 22365 + }, + { + "epoch": 3.331843908251415, + "grad_norm": 7.581705570220947, + "learning_rate": 4.932776937988582e-05, + "loss": 0.3433, + "num_input_tokens_seen": 12977352, + "step": 22370 + }, + { + "epoch": 3.3325886207923743, + "grad_norm": 0.937022864818573, + "learning_rate": 4.932702071038703e-05, + "loss": 0.4762, + "num_input_tokens_seen": 12980072, + "step": 22375 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 5.445438385009766, + "learning_rate": 4.932627162990794e-05, + "loss": 0.2812, + "num_input_tokens_seen": 12982984, + "step": 22380 + }, + { + "epoch": 3.3340780458742927, + "grad_norm": 18.524185180664062, + "learning_rate": 4.932552213846121e-05, + "loss": 0.3803, + "num_input_tokens_seen": 12985800, + "step": 22385 + }, + { + "epoch": 3.334822758415252, + "grad_norm": 3.5807125568389893, + "learning_rate": 4.932477223605951e-05, + "loss": 0.3564, + "num_input_tokens_seen": 12988456, + "step": 22390 + }, + { + "epoch": 3.335567470956211, + "grad_norm": 4.458938121795654, + "learning_rate": 4.932402192271551e-05, + "loss": 0.619, + "num_input_tokens_seen": 12991240, + "step": 22395 + }, + { + "epoch": 3.33631218349717, + "grad_norm": 10.091887474060059, + "learning_rate": 4.9323271198441886e-05, + "loss": 0.5578, + "num_input_tokens_seen": 12994120, + "step": 22400 + }, + { + "epoch": 3.3370568960381295, + "grad_norm": 4.802518844604492, + "learning_rate": 4.932252006325131e-05, + "loss": 0.3686, + "num_input_tokens_seen": 12997096, + "step": 22405 + }, + { + "epoch": 3.3378016085790883, + "grad_norm": 31.721221923828125, + "learning_rate": 4.932176851715647e-05, + "loss": 0.3316, + "num_input_tokens_seen": 12999720, + "step": 22410 + }, + { + "epoch": 3.338546321120048, + "grad_norm": 8.106464385986328, + "learning_rate": 4.932101656017008e-05, + "loss": 0.2916, + "num_input_tokens_seen": 13003016, + "step": 22415 + }, + { + "epoch": 3.3392910336610067, + "grad_norm": 10.114603996276855, + "learning_rate": 4.9320264192304835e-05, + "loss": 0.418, + "num_input_tokens_seen": 13005864, + "step": 22420 + }, + { + "epoch": 3.340035746201966, + "grad_norm": 6.808538436889648, + "learning_rate": 4.931951141357344e-05, + "loss": 0.4444, + "num_input_tokens_seen": 13008936, + "step": 22425 + }, + { + "epoch": 3.340780458742925, + "grad_norm": 2.399855136871338, + "learning_rate": 4.931875822398862e-05, + "loss": 0.2506, + "num_input_tokens_seen": 13011816, + "step": 22430 + }, + { + "epoch": 3.3415251712838843, + "grad_norm": 9.409810066223145, + "learning_rate": 4.93180046235631e-05, + "loss": 0.4922, + "num_input_tokens_seen": 13014568, + "step": 22435 + }, + { + "epoch": 3.3422698838248435, + "grad_norm": 26.64987564086914, + "learning_rate": 4.9317250612309594e-05, + "loss": 0.4651, + "num_input_tokens_seen": 13017512, + "step": 22440 + }, + { + "epoch": 3.3430145963658027, + "grad_norm": 9.75442123413086, + "learning_rate": 4.9316496190240866e-05, + "loss": 0.4351, + "num_input_tokens_seen": 13020424, + "step": 22445 + }, + { + "epoch": 3.343759308906762, + "grad_norm": 4.770716667175293, + "learning_rate": 4.931574135736965e-05, + "loss": 0.4051, + "num_input_tokens_seen": 13023368, + "step": 22450 + }, + { + "epoch": 3.344504021447721, + "grad_norm": 14.562820434570312, + "learning_rate": 4.931498611370869e-05, + "loss": 0.4552, + "num_input_tokens_seen": 13026056, + "step": 22455 + }, + { + "epoch": 3.3452487339886803, + "grad_norm": 12.773308753967285, + "learning_rate": 4.9314230459270756e-05, + "loss": 0.4254, + "num_input_tokens_seen": 13028808, + "step": 22460 + }, + { + "epoch": 3.3459934465296395, + "grad_norm": 9.602228164672852, + "learning_rate": 4.9313474394068604e-05, + "loss": 0.3286, + "num_input_tokens_seen": 13031624, + "step": 22465 + }, + { + "epoch": 3.3467381590705987, + "grad_norm": 6.499200344085693, + "learning_rate": 4.931271791811502e-05, + "loss": 0.4926, + "num_input_tokens_seen": 13034568, + "step": 22470 + }, + { + "epoch": 3.347482871611558, + "grad_norm": 6.133707523345947, + "learning_rate": 4.931196103142278e-05, + "loss": 0.2444, + "num_input_tokens_seen": 13037512, + "step": 22475 + }, + { + "epoch": 3.348227584152517, + "grad_norm": 15.03150463104248, + "learning_rate": 4.9311203734004665e-05, + "loss": 0.609, + "num_input_tokens_seen": 13040520, + "step": 22480 + }, + { + "epoch": 3.3489722966934763, + "grad_norm": 25.433536529541016, + "learning_rate": 4.931044602587346e-05, + "loss": 0.608, + "num_input_tokens_seen": 13043272, + "step": 22485 + }, + { + "epoch": 3.3497170092344355, + "grad_norm": 10.792088508605957, + "learning_rate": 4.930968790704199e-05, + "loss": 0.2715, + "num_input_tokens_seen": 13046472, + "step": 22490 + }, + { + "epoch": 3.3504617217753947, + "grad_norm": 4.198763847351074, + "learning_rate": 4.930892937752305e-05, + "loss": 0.4083, + "num_input_tokens_seen": 13049416, + "step": 22495 + }, + { + "epoch": 3.351206434316354, + "grad_norm": 26.74439239501953, + "learning_rate": 4.930817043732945e-05, + "loss": 0.4288, + "num_input_tokens_seen": 13052360, + "step": 22500 + }, + { + "epoch": 3.351951146857313, + "grad_norm": 7.497776508331299, + "learning_rate": 4.930741108647402e-05, + "loss": 0.3878, + "num_input_tokens_seen": 13055240, + "step": 22505 + }, + { + "epoch": 3.3526958593982723, + "grad_norm": 1.9457738399505615, + "learning_rate": 4.9306651324969583e-05, + "loss": 0.1867, + "num_input_tokens_seen": 13058152, + "step": 22510 + }, + { + "epoch": 3.3534405719392315, + "grad_norm": 20.599376678466797, + "learning_rate": 4.9305891152828976e-05, + "loss": 0.5124, + "num_input_tokens_seen": 13060936, + "step": 22515 + }, + { + "epoch": 3.3541852844801907, + "grad_norm": 12.316161155700684, + "learning_rate": 4.930513057006504e-05, + "loss": 0.3562, + "num_input_tokens_seen": 13063784, + "step": 22520 + }, + { + "epoch": 3.35492999702115, + "grad_norm": 10.77412223815918, + "learning_rate": 4.930436957669063e-05, + "loss": 0.3625, + "num_input_tokens_seen": 13066536, + "step": 22525 + }, + { + "epoch": 3.355674709562109, + "grad_norm": 12.007887840270996, + "learning_rate": 4.93036081727186e-05, + "loss": 0.5147, + "num_input_tokens_seen": 13069224, + "step": 22530 + }, + { + "epoch": 3.3564194221030683, + "grad_norm": 38.017311096191406, + "learning_rate": 4.93028463581618e-05, + "loss": 0.4261, + "num_input_tokens_seen": 13072072, + "step": 22535 + }, + { + "epoch": 3.3571641346440275, + "grad_norm": 10.013136863708496, + "learning_rate": 4.930208413303312e-05, + "loss": 0.2087, + "num_input_tokens_seen": 13074920, + "step": 22540 + }, + { + "epoch": 3.3579088471849867, + "grad_norm": 13.85126781463623, + "learning_rate": 4.930132149734542e-05, + "loss": 0.2951, + "num_input_tokens_seen": 13077800, + "step": 22545 + }, + { + "epoch": 3.358653559725946, + "grad_norm": 24.5006160736084, + "learning_rate": 4.93005584511116e-05, + "loss": 0.4281, + "num_input_tokens_seen": 13080456, + "step": 22550 + }, + { + "epoch": 3.359398272266905, + "grad_norm": 17.051488876342773, + "learning_rate": 4.929979499434454e-05, + "loss": 0.3387, + "num_input_tokens_seen": 13083496, + "step": 22555 + }, + { + "epoch": 3.3601429848078643, + "grad_norm": 8.471799850463867, + "learning_rate": 4.929903112705714e-05, + "loss": 0.1446, + "num_input_tokens_seen": 13086952, + "step": 22560 + }, + { + "epoch": 3.3608876973488235, + "grad_norm": 22.587759017944336, + "learning_rate": 4.9298266849262306e-05, + "loss": 0.5504, + "num_input_tokens_seen": 13089480, + "step": 22565 + }, + { + "epoch": 3.3616324098897827, + "grad_norm": 6.74413537979126, + "learning_rate": 4.929750216097295e-05, + "loss": 0.5253, + "num_input_tokens_seen": 13092488, + "step": 22570 + }, + { + "epoch": 3.3623771224307415, + "grad_norm": 13.739048957824707, + "learning_rate": 4.929673706220199e-05, + "loss": 0.4214, + "num_input_tokens_seen": 13095400, + "step": 22575 + }, + { + "epoch": 3.363121834971701, + "grad_norm": 15.063396453857422, + "learning_rate": 4.929597155296235e-05, + "loss": 0.4778, + "num_input_tokens_seen": 13098600, + "step": 22580 + }, + { + "epoch": 3.36386654751266, + "grad_norm": 14.412280082702637, + "learning_rate": 4.929520563326697e-05, + "loss": 0.3063, + "num_input_tokens_seen": 13101192, + "step": 22585 + }, + { + "epoch": 3.3646112600536195, + "grad_norm": 10.173118591308594, + "learning_rate": 4.929443930312878e-05, + "loss": 0.5844, + "num_input_tokens_seen": 13104008, + "step": 22590 + }, + { + "epoch": 3.3653559725945783, + "grad_norm": 6.735815048217773, + "learning_rate": 4.929367256256072e-05, + "loss": 0.4228, + "num_input_tokens_seen": 13106952, + "step": 22595 + }, + { + "epoch": 3.3661006851355375, + "grad_norm": 9.347417831420898, + "learning_rate": 4.929290541157576e-05, + "loss": 0.5439, + "num_input_tokens_seen": 13109832, + "step": 22600 + }, + { + "epoch": 3.3668453976764967, + "grad_norm": 18.971603393554688, + "learning_rate": 4.929213785018686e-05, + "loss": 0.4306, + "num_input_tokens_seen": 13112776, + "step": 22605 + }, + { + "epoch": 3.367590110217456, + "grad_norm": 3.0392613410949707, + "learning_rate": 4.9291369878406975e-05, + "loss": 0.2386, + "num_input_tokens_seen": 13115560, + "step": 22610 + }, + { + "epoch": 3.368334822758415, + "grad_norm": 12.482563972473145, + "learning_rate": 4.929060149624909e-05, + "loss": 0.4824, + "num_input_tokens_seen": 13118568, + "step": 22615 + }, + { + "epoch": 3.3690795352993743, + "grad_norm": 24.135879516601562, + "learning_rate": 4.928983270372617e-05, + "loss": 0.5064, + "num_input_tokens_seen": 13121320, + "step": 22620 + }, + { + "epoch": 3.3698242478403335, + "grad_norm": 31.396329879760742, + "learning_rate": 4.928906350085122e-05, + "loss": 0.6829, + "num_input_tokens_seen": 13124200, + "step": 22625 + }, + { + "epoch": 3.3705689603812927, + "grad_norm": 6.53482723236084, + "learning_rate": 4.928829388763723e-05, + "loss": 0.3892, + "num_input_tokens_seen": 13126984, + "step": 22630 + }, + { + "epoch": 3.371313672922252, + "grad_norm": 3.81447696685791, + "learning_rate": 4.928752386409719e-05, + "loss": 0.3983, + "num_input_tokens_seen": 13129832, + "step": 22635 + }, + { + "epoch": 3.372058385463211, + "grad_norm": 16.767269134521484, + "learning_rate": 4.9286753430244126e-05, + "loss": 0.4907, + "num_input_tokens_seen": 13132904, + "step": 22640 + }, + { + "epoch": 3.3728030980041703, + "grad_norm": 9.324124336242676, + "learning_rate": 4.928598258609105e-05, + "loss": 0.3173, + "num_input_tokens_seen": 13135656, + "step": 22645 + }, + { + "epoch": 3.3735478105451295, + "grad_norm": 11.488977432250977, + "learning_rate": 4.928521133165098e-05, + "loss": 0.3886, + "num_input_tokens_seen": 13138472, + "step": 22650 + }, + { + "epoch": 3.3742925230860887, + "grad_norm": 1.2835609912872314, + "learning_rate": 4.928443966693694e-05, + "loss": 0.0909, + "num_input_tokens_seen": 13141384, + "step": 22655 + }, + { + "epoch": 3.375037235627048, + "grad_norm": 7.64758825302124, + "learning_rate": 4.928366759196198e-05, + "loss": 0.3979, + "num_input_tokens_seen": 13144264, + "step": 22660 + }, + { + "epoch": 3.375781948168007, + "grad_norm": 25.074575424194336, + "learning_rate": 4.9282895106739136e-05, + "loss": 0.7468, + "num_input_tokens_seen": 13146888, + "step": 22665 + }, + { + "epoch": 3.3765266607089663, + "grad_norm": 19.962451934814453, + "learning_rate": 4.928212221128146e-05, + "loss": 0.3067, + "num_input_tokens_seen": 13149640, + "step": 22670 + }, + { + "epoch": 3.3772713732499255, + "grad_norm": 7.361161231994629, + "learning_rate": 4.928134890560201e-05, + "loss": 0.4261, + "num_input_tokens_seen": 13152392, + "step": 22675 + }, + { + "epoch": 3.3780160857908847, + "grad_norm": 7.73993444442749, + "learning_rate": 4.928057518971384e-05, + "loss": 0.3717, + "num_input_tokens_seen": 13155176, + "step": 22680 + }, + { + "epoch": 3.378760798331844, + "grad_norm": 3.4995081424713135, + "learning_rate": 4.9279801063630035e-05, + "loss": 0.3821, + "num_input_tokens_seen": 13158120, + "step": 22685 + }, + { + "epoch": 3.379505510872803, + "grad_norm": 21.141464233398438, + "learning_rate": 4.9279026527363666e-05, + "loss": 0.3871, + "num_input_tokens_seen": 13160776, + "step": 22690 + }, + { + "epoch": 3.3802502234137624, + "grad_norm": 4.348607063293457, + "learning_rate": 4.927825158092783e-05, + "loss": 0.3394, + "num_input_tokens_seen": 13163912, + "step": 22695 + }, + { + "epoch": 3.3809949359547216, + "grad_norm": 16.63508415222168, + "learning_rate": 4.9277476224335603e-05, + "loss": 0.3414, + "num_input_tokens_seen": 13166824, + "step": 22700 + }, + { + "epoch": 3.3817396484956808, + "grad_norm": 6.585508346557617, + "learning_rate": 4.927670045760009e-05, + "loss": 0.4768, + "num_input_tokens_seen": 13169640, + "step": 22705 + }, + { + "epoch": 3.38248436103664, + "grad_norm": 4.581945419311523, + "learning_rate": 4.927592428073439e-05, + "loss": 0.4187, + "num_input_tokens_seen": 13172360, + "step": 22710 + }, + { + "epoch": 3.383229073577599, + "grad_norm": 4.56156063079834, + "learning_rate": 4.927514769375163e-05, + "loss": 0.2714, + "num_input_tokens_seen": 13175144, + "step": 22715 + }, + { + "epoch": 3.3839737861185584, + "grad_norm": 3.7234385013580322, + "learning_rate": 4.9274370696664916e-05, + "loss": 0.4111, + "num_input_tokens_seen": 13177768, + "step": 22720 + }, + { + "epoch": 3.3847184986595176, + "grad_norm": 20.174518585205078, + "learning_rate": 4.9273593289487384e-05, + "loss": 0.2213, + "num_input_tokens_seen": 13180520, + "step": 22725 + }, + { + "epoch": 3.3854632112004768, + "grad_norm": 4.423483848571777, + "learning_rate": 4.9272815472232165e-05, + "loss": 0.6179, + "num_input_tokens_seen": 13183368, + "step": 22730 + }, + { + "epoch": 3.386207923741436, + "grad_norm": 12.259525299072266, + "learning_rate": 4.9272037244912394e-05, + "loss": 0.4533, + "num_input_tokens_seen": 13185896, + "step": 22735 + }, + { + "epoch": 3.386952636282395, + "grad_norm": 5.609577655792236, + "learning_rate": 4.927125860754123e-05, + "loss": 0.283, + "num_input_tokens_seen": 13188712, + "step": 22740 + }, + { + "epoch": 3.3876973488233544, + "grad_norm": 26.907577514648438, + "learning_rate": 4.9270479560131813e-05, + "loss": 0.5689, + "num_input_tokens_seen": 13191688, + "step": 22745 + }, + { + "epoch": 3.388442061364313, + "grad_norm": 14.354175567626953, + "learning_rate": 4.926970010269731e-05, + "loss": 0.2726, + "num_input_tokens_seen": 13194440, + "step": 22750 + }, + { + "epoch": 3.389186773905273, + "grad_norm": 12.747197151184082, + "learning_rate": 4.92689202352509e-05, + "loss": 0.4485, + "num_input_tokens_seen": 13197512, + "step": 22755 + }, + { + "epoch": 3.3899314864462315, + "grad_norm": 39.7220573425293, + "learning_rate": 4.926813995780574e-05, + "loss": 0.2142, + "num_input_tokens_seen": 13200360, + "step": 22760 + }, + { + "epoch": 3.390676198987191, + "grad_norm": 6.6379241943359375, + "learning_rate": 4.926735927037503e-05, + "loss": 0.2642, + "num_input_tokens_seen": 13203336, + "step": 22765 + }, + { + "epoch": 3.39142091152815, + "grad_norm": 12.091933250427246, + "learning_rate": 4.9266578172971934e-05, + "loss": 0.549, + "num_input_tokens_seen": 13206184, + "step": 22770 + }, + { + "epoch": 3.392165624069109, + "grad_norm": 4.359160423278809, + "learning_rate": 4.926579666560968e-05, + "loss": 0.367, + "num_input_tokens_seen": 13209256, + "step": 22775 + }, + { + "epoch": 3.3929103366100684, + "grad_norm": 9.68154239654541, + "learning_rate": 4.926501474830144e-05, + "loss": 0.555, + "num_input_tokens_seen": 13212808, + "step": 22780 + }, + { + "epoch": 3.3936550491510276, + "grad_norm": 14.455306053161621, + "learning_rate": 4.926423242106044e-05, + "loss": 0.2478, + "num_input_tokens_seen": 13215496, + "step": 22785 + }, + { + "epoch": 3.3943997616919868, + "grad_norm": 5.426750183105469, + "learning_rate": 4.92634496838999e-05, + "loss": 0.4717, + "num_input_tokens_seen": 13218216, + "step": 22790 + }, + { + "epoch": 3.395144474232946, + "grad_norm": 8.478198051452637, + "learning_rate": 4.9262666536833035e-05, + "loss": 0.4976, + "num_input_tokens_seen": 13220936, + "step": 22795 + }, + { + "epoch": 3.395889186773905, + "grad_norm": 4.278824806213379, + "learning_rate": 4.926188297987308e-05, + "loss": 0.4686, + "num_input_tokens_seen": 13224360, + "step": 22800 + }, + { + "epoch": 3.3966338993148644, + "grad_norm": 10.98216724395752, + "learning_rate": 4.926109901303327e-05, + "loss": 0.4194, + "num_input_tokens_seen": 13227080, + "step": 22805 + }, + { + "epoch": 3.3973786118558236, + "grad_norm": 2.9672675132751465, + "learning_rate": 4.9260314636326846e-05, + "loss": 0.277, + "num_input_tokens_seen": 13230056, + "step": 22810 + }, + { + "epoch": 3.3981233243967828, + "grad_norm": 6.872596740722656, + "learning_rate": 4.925952984976707e-05, + "loss": 0.5216, + "num_input_tokens_seen": 13232968, + "step": 22815 + }, + { + "epoch": 3.398868036937742, + "grad_norm": 11.13196849822998, + "learning_rate": 4.925874465336719e-05, + "loss": 0.3643, + "num_input_tokens_seen": 13235688, + "step": 22820 + }, + { + "epoch": 3.399612749478701, + "grad_norm": 5.226984024047852, + "learning_rate": 4.9257959047140476e-05, + "loss": 0.4324, + "num_input_tokens_seen": 13238408, + "step": 22825 + }, + { + "epoch": 3.4003574620196604, + "grad_norm": 9.002808570861816, + "learning_rate": 4.9257173031100196e-05, + "loss": 0.4224, + "num_input_tokens_seen": 13241256, + "step": 22830 + }, + { + "epoch": 3.4011021745606196, + "grad_norm": 10.56143856048584, + "learning_rate": 4.925638660525963e-05, + "loss": 0.4103, + "num_input_tokens_seen": 13244168, + "step": 22835 + }, + { + "epoch": 3.401846887101579, + "grad_norm": 8.11614990234375, + "learning_rate": 4.925559976963207e-05, + "loss": 0.3758, + "num_input_tokens_seen": 13246920, + "step": 22840 + }, + { + "epoch": 3.402591599642538, + "grad_norm": 8.25603199005127, + "learning_rate": 4.9254812524230806e-05, + "loss": 0.3744, + "num_input_tokens_seen": 13249992, + "step": 22845 + }, + { + "epoch": 3.403336312183497, + "grad_norm": 7.589822292327881, + "learning_rate": 4.925402486906913e-05, + "loss": 0.5575, + "num_input_tokens_seen": 13252776, + "step": 22850 + }, + { + "epoch": 3.4040810247244564, + "grad_norm": 8.211682319641113, + "learning_rate": 4.925323680416036e-05, + "loss": 0.3041, + "num_input_tokens_seen": 13255848, + "step": 22855 + }, + { + "epoch": 3.4048257372654156, + "grad_norm": 6.727567672729492, + "learning_rate": 4.92524483295178e-05, + "loss": 0.4302, + "num_input_tokens_seen": 13258696, + "step": 22860 + }, + { + "epoch": 3.405570449806375, + "grad_norm": 4.642481803894043, + "learning_rate": 4.925165944515477e-05, + "loss": 0.3367, + "num_input_tokens_seen": 13261320, + "step": 22865 + }, + { + "epoch": 3.406315162347334, + "grad_norm": 10.623671531677246, + "learning_rate": 4.9250870151084614e-05, + "loss": 0.3845, + "num_input_tokens_seen": 13264296, + "step": 22870 + }, + { + "epoch": 3.407059874888293, + "grad_norm": 9.614335060119629, + "learning_rate": 4.9250080447320644e-05, + "loss": 0.4554, + "num_input_tokens_seen": 13266888, + "step": 22875 + }, + { + "epoch": 3.4078045874292524, + "grad_norm": 11.032925605773926, + "learning_rate": 4.924929033387622e-05, + "loss": 0.5023, + "num_input_tokens_seen": 13269800, + "step": 22880 + }, + { + "epoch": 3.4085492999702116, + "grad_norm": 20.809484481811523, + "learning_rate": 4.9248499810764675e-05, + "loss": 0.322, + "num_input_tokens_seen": 13272808, + "step": 22885 + }, + { + "epoch": 3.409294012511171, + "grad_norm": 18.75590705871582, + "learning_rate": 4.9247708877999375e-05, + "loss": 0.5151, + "num_input_tokens_seen": 13275752, + "step": 22890 + }, + { + "epoch": 3.41003872505213, + "grad_norm": 6.761152744293213, + "learning_rate": 4.9246917535593675e-05, + "loss": 0.3562, + "num_input_tokens_seen": 13278472, + "step": 22895 + }, + { + "epoch": 3.410783437593089, + "grad_norm": 47.59885787963867, + "learning_rate": 4.924612578356095e-05, + "loss": 0.6456, + "num_input_tokens_seen": 13281160, + "step": 22900 + }, + { + "epoch": 3.4115281501340484, + "grad_norm": 13.7451171875, + "learning_rate": 4.9245333621914566e-05, + "loss": 0.2147, + "num_input_tokens_seen": 13284072, + "step": 22905 + }, + { + "epoch": 3.4122728626750076, + "grad_norm": 11.367568969726562, + "learning_rate": 4.9244541050667916e-05, + "loss": 0.6132, + "num_input_tokens_seen": 13287048, + "step": 22910 + }, + { + "epoch": 3.413017575215967, + "grad_norm": 4.351980686187744, + "learning_rate": 4.9243748069834386e-05, + "loss": 0.6984, + "num_input_tokens_seen": 13290280, + "step": 22915 + }, + { + "epoch": 3.413762287756926, + "grad_norm": 8.65907096862793, + "learning_rate": 4.924295467942737e-05, + "loss": 0.4351, + "num_input_tokens_seen": 13293288, + "step": 22920 + }, + { + "epoch": 3.414507000297885, + "grad_norm": 8.302993774414062, + "learning_rate": 4.924216087946028e-05, + "loss": 0.391, + "num_input_tokens_seen": 13295912, + "step": 22925 + }, + { + "epoch": 3.4152517128388444, + "grad_norm": 3.3752646446228027, + "learning_rate": 4.924136666994652e-05, + "loss": 0.2637, + "num_input_tokens_seen": 13298760, + "step": 22930 + }, + { + "epoch": 3.415996425379803, + "grad_norm": 3.056360960006714, + "learning_rate": 4.9240572050899505e-05, + "loss": 0.533, + "num_input_tokens_seen": 13301608, + "step": 22935 + }, + { + "epoch": 3.4167411379207624, + "grad_norm": 11.16057300567627, + "learning_rate": 4.923977702233266e-05, + "loss": 0.4291, + "num_input_tokens_seen": 13304488, + "step": 22940 + }, + { + "epoch": 3.4174858504617216, + "grad_norm": 2.500913381576538, + "learning_rate": 4.923898158425942e-05, + "loss": 0.1791, + "num_input_tokens_seen": 13307496, + "step": 22945 + }, + { + "epoch": 3.418230563002681, + "grad_norm": 10.2300443649292, + "learning_rate": 4.923818573669322e-05, + "loss": 0.4158, + "num_input_tokens_seen": 13310344, + "step": 22950 + }, + { + "epoch": 3.41897527554364, + "grad_norm": 7.173845291137695, + "learning_rate": 4.923738947964751e-05, + "loss": 0.4143, + "num_input_tokens_seen": 13313448, + "step": 22955 + }, + { + "epoch": 3.419719988084599, + "grad_norm": 4.514631748199463, + "learning_rate": 4.923659281313574e-05, + "loss": 0.4603, + "num_input_tokens_seen": 13316360, + "step": 22960 + }, + { + "epoch": 3.4204647006255584, + "grad_norm": 1.8021748065948486, + "learning_rate": 4.9235795737171365e-05, + "loss": 0.4094, + "num_input_tokens_seen": 13319336, + "step": 22965 + }, + { + "epoch": 3.4212094131665176, + "grad_norm": 17.56688117980957, + "learning_rate": 4.923499825176786e-05, + "loss": 0.3603, + "num_input_tokens_seen": 13322120, + "step": 22970 + }, + { + "epoch": 3.421954125707477, + "grad_norm": 4.909046649932861, + "learning_rate": 4.923420035693868e-05, + "loss": 0.6523, + "num_input_tokens_seen": 13325128, + "step": 22975 + }, + { + "epoch": 3.422698838248436, + "grad_norm": 5.998760223388672, + "learning_rate": 4.923340205269732e-05, + "loss": 0.503, + "num_input_tokens_seen": 13327976, + "step": 22980 + }, + { + "epoch": 3.423443550789395, + "grad_norm": 11.491267204284668, + "learning_rate": 4.923260333905726e-05, + "loss": 0.3317, + "num_input_tokens_seen": 13330568, + "step": 22985 + }, + { + "epoch": 3.4241882633303544, + "grad_norm": 12.897880554199219, + "learning_rate": 4.9231804216031995e-05, + "loss": 0.3119, + "num_input_tokens_seen": 13333192, + "step": 22990 + }, + { + "epoch": 3.4249329758713136, + "grad_norm": 15.793729782104492, + "learning_rate": 4.923100468363503e-05, + "loss": 0.3704, + "num_input_tokens_seen": 13336296, + "step": 22995 + }, + { + "epoch": 3.425677688412273, + "grad_norm": 30.37188720703125, + "learning_rate": 4.923020474187987e-05, + "loss": 0.5435, + "num_input_tokens_seen": 13339432, + "step": 23000 + }, + { + "epoch": 3.426422400953232, + "grad_norm": 11.890679359436035, + "learning_rate": 4.922940439078002e-05, + "loss": 0.3843, + "num_input_tokens_seen": 13342248, + "step": 23005 + }, + { + "epoch": 3.4271671134941912, + "grad_norm": 14.938196182250977, + "learning_rate": 4.922860363034901e-05, + "loss": 0.5066, + "num_input_tokens_seen": 13345000, + "step": 23010 + }, + { + "epoch": 3.4279118260351504, + "grad_norm": 17.610212326049805, + "learning_rate": 4.922780246060037e-05, + "loss": 0.4114, + "num_input_tokens_seen": 13347720, + "step": 23015 + }, + { + "epoch": 3.4286565385761096, + "grad_norm": 12.637754440307617, + "learning_rate": 4.922700088154764e-05, + "loss": 0.3699, + "num_input_tokens_seen": 13350696, + "step": 23020 + }, + { + "epoch": 3.429401251117069, + "grad_norm": 16.544109344482422, + "learning_rate": 4.9226198893204335e-05, + "loss": 0.455, + "num_input_tokens_seen": 13353224, + "step": 23025 + }, + { + "epoch": 3.430145963658028, + "grad_norm": 17.28939437866211, + "learning_rate": 4.922539649558403e-05, + "loss": 0.5197, + "num_input_tokens_seen": 13355976, + "step": 23030 + }, + { + "epoch": 3.4308906761989872, + "grad_norm": 7.037984371185303, + "learning_rate": 4.9224593688700274e-05, + "loss": 0.2212, + "num_input_tokens_seen": 13358664, + "step": 23035 + }, + { + "epoch": 3.4316353887399464, + "grad_norm": 2.487353563308716, + "learning_rate": 4.922379047256663e-05, + "loss": 0.3045, + "num_input_tokens_seen": 13361352, + "step": 23040 + }, + { + "epoch": 3.4323801012809056, + "grad_norm": 1.108189344406128, + "learning_rate": 4.922298684719666e-05, + "loss": 0.466, + "num_input_tokens_seen": 13364296, + "step": 23045 + }, + { + "epoch": 3.433124813821865, + "grad_norm": 15.654330253601074, + "learning_rate": 4.922218281260395e-05, + "loss": 0.3097, + "num_input_tokens_seen": 13367272, + "step": 23050 + }, + { + "epoch": 3.433869526362824, + "grad_norm": 0.36041688919067383, + "learning_rate": 4.9221378368802085e-05, + "loss": 0.3853, + "num_input_tokens_seen": 13369928, + "step": 23055 + }, + { + "epoch": 3.4346142389037833, + "grad_norm": 10.044452667236328, + "learning_rate": 4.9220573515804644e-05, + "loss": 0.437, + "num_input_tokens_seen": 13372808, + "step": 23060 + }, + { + "epoch": 3.4353589514447425, + "grad_norm": 8.357728004455566, + "learning_rate": 4.921976825362523e-05, + "loss": 0.4631, + "num_input_tokens_seen": 13375528, + "step": 23065 + }, + { + "epoch": 3.4361036639857017, + "grad_norm": 7.608503818511963, + "learning_rate": 4.921896258227745e-05, + "loss": 0.4993, + "num_input_tokens_seen": 13378088, + "step": 23070 + }, + { + "epoch": 3.436848376526661, + "grad_norm": 19.962739944458008, + "learning_rate": 4.921815650177491e-05, + "loss": 0.532, + "num_input_tokens_seen": 13381288, + "step": 23075 + }, + { + "epoch": 3.43759308906762, + "grad_norm": 6.672410488128662, + "learning_rate": 4.9217350012131223e-05, + "loss": 0.3894, + "num_input_tokens_seen": 13384424, + "step": 23080 + }, + { + "epoch": 3.4383378016085793, + "grad_norm": 9.171102523803711, + "learning_rate": 4.9216543113360035e-05, + "loss": 0.5328, + "num_input_tokens_seen": 13387016, + "step": 23085 + }, + { + "epoch": 3.4390825141495385, + "grad_norm": 9.245136260986328, + "learning_rate": 4.9215735805474956e-05, + "loss": 0.3763, + "num_input_tokens_seen": 13389832, + "step": 23090 + }, + { + "epoch": 3.4398272266904977, + "grad_norm": 8.979183197021484, + "learning_rate": 4.921492808848963e-05, + "loss": 0.6088, + "num_input_tokens_seen": 13392712, + "step": 23095 + }, + { + "epoch": 3.4405719392314564, + "grad_norm": 1.5557855367660522, + "learning_rate": 4.921411996241771e-05, + "loss": 0.4175, + "num_input_tokens_seen": 13395496, + "step": 23100 + }, + { + "epoch": 3.441316651772416, + "grad_norm": 0.7417711615562439, + "learning_rate": 4.921331142727284e-05, + "loss": 0.3438, + "num_input_tokens_seen": 13398216, + "step": 23105 + }, + { + "epoch": 3.442061364313375, + "grad_norm": 10.815970420837402, + "learning_rate": 4.921250248306869e-05, + "loss": 0.3509, + "num_input_tokens_seen": 13401000, + "step": 23110 + }, + { + "epoch": 3.442806076854334, + "grad_norm": 13.43005657196045, + "learning_rate": 4.9211693129818915e-05, + "loss": 0.4078, + "num_input_tokens_seen": 13404072, + "step": 23115 + }, + { + "epoch": 3.4435507893952932, + "grad_norm": 9.559517860412598, + "learning_rate": 4.9210883367537184e-05, + "loss": 0.4421, + "num_input_tokens_seen": 13406952, + "step": 23120 + }, + { + "epoch": 3.4442955019362524, + "grad_norm": 3.2910091876983643, + "learning_rate": 4.9210073196237196e-05, + "loss": 0.3816, + "num_input_tokens_seen": 13409512, + "step": 23125 + }, + { + "epoch": 3.4450402144772116, + "grad_norm": 6.7909111976623535, + "learning_rate": 4.9209262615932624e-05, + "loss": 0.2056, + "num_input_tokens_seen": 13412360, + "step": 23130 + }, + { + "epoch": 3.445784927018171, + "grad_norm": 8.48888111114502, + "learning_rate": 4.9208451626637164e-05, + "loss": 0.3828, + "num_input_tokens_seen": 13415496, + "step": 23135 + }, + { + "epoch": 3.44652963955913, + "grad_norm": 19.136301040649414, + "learning_rate": 4.920764022836452e-05, + "loss": 0.6718, + "num_input_tokens_seen": 13418248, + "step": 23140 + }, + { + "epoch": 3.4472743521000893, + "grad_norm": 19.859365463256836, + "learning_rate": 4.920682842112839e-05, + "loss": 0.6108, + "num_input_tokens_seen": 13421352, + "step": 23145 + }, + { + "epoch": 3.4480190646410485, + "grad_norm": 14.557271003723145, + "learning_rate": 4.920601620494251e-05, + "loss": 0.2598, + "num_input_tokens_seen": 13424264, + "step": 23150 + }, + { + "epoch": 3.4487637771820077, + "grad_norm": 33.070735931396484, + "learning_rate": 4.920520357982058e-05, + "loss": 0.5157, + "num_input_tokens_seen": 13427048, + "step": 23155 + }, + { + "epoch": 3.449508489722967, + "grad_norm": 24.56453514099121, + "learning_rate": 4.9204390545776334e-05, + "loss": 0.5119, + "num_input_tokens_seen": 13430280, + "step": 23160 + }, + { + "epoch": 3.450253202263926, + "grad_norm": 11.281193733215332, + "learning_rate": 4.920357710282352e-05, + "loss": 0.2627, + "num_input_tokens_seen": 13433192, + "step": 23165 + }, + { + "epoch": 3.4509979148048853, + "grad_norm": 1.1777937412261963, + "learning_rate": 4.9202763250975864e-05, + "loss": 0.2858, + "num_input_tokens_seen": 13435784, + "step": 23170 + }, + { + "epoch": 3.4517426273458445, + "grad_norm": 11.635845184326172, + "learning_rate": 4.920194899024712e-05, + "loss": 0.3643, + "num_input_tokens_seen": 13438888, + "step": 23175 + }, + { + "epoch": 3.4524873398868037, + "grad_norm": 3.272047758102417, + "learning_rate": 4.920113432065105e-05, + "loss": 0.3599, + "num_input_tokens_seen": 13442024, + "step": 23180 + }, + { + "epoch": 3.453232052427763, + "grad_norm": 11.329598426818848, + "learning_rate": 4.920031924220141e-05, + "loss": 0.3124, + "num_input_tokens_seen": 13445416, + "step": 23185 + }, + { + "epoch": 3.453976764968722, + "grad_norm": 9.867401123046875, + "learning_rate": 4.919950375491197e-05, + "loss": 0.3664, + "num_input_tokens_seen": 13448520, + "step": 23190 + }, + { + "epoch": 3.4547214775096813, + "grad_norm": 15.996686935424805, + "learning_rate": 4.919868785879651e-05, + "loss": 0.4272, + "num_input_tokens_seen": 13451304, + "step": 23195 + }, + { + "epoch": 3.4554661900506405, + "grad_norm": 23.317548751831055, + "learning_rate": 4.919787155386882e-05, + "loss": 0.3666, + "num_input_tokens_seen": 13454024, + "step": 23200 + }, + { + "epoch": 3.4562109025915997, + "grad_norm": 9.636860847473145, + "learning_rate": 4.919705484014268e-05, + "loss": 0.63, + "num_input_tokens_seen": 13456936, + "step": 23205 + }, + { + "epoch": 3.456955615132559, + "grad_norm": 25.280963897705078, + "learning_rate": 4.919623771763189e-05, + "loss": 0.454, + "num_input_tokens_seen": 13460072, + "step": 23210 + }, + { + "epoch": 3.457700327673518, + "grad_norm": 3.6729736328125, + "learning_rate": 4.919542018635025e-05, + "loss": 0.4564, + "num_input_tokens_seen": 13463080, + "step": 23215 + }, + { + "epoch": 3.4584450402144773, + "grad_norm": 5.248834133148193, + "learning_rate": 4.919460224631158e-05, + "loss": 0.3493, + "num_input_tokens_seen": 13465992, + "step": 23220 + }, + { + "epoch": 3.4591897527554365, + "grad_norm": 1.2311012744903564, + "learning_rate": 4.91937838975297e-05, + "loss": 0.3482, + "num_input_tokens_seen": 13468776, + "step": 23225 + }, + { + "epoch": 3.4599344652963957, + "grad_norm": 25.98386001586914, + "learning_rate": 4.9192965140018435e-05, + "loss": 0.6107, + "num_input_tokens_seen": 13471656, + "step": 23230 + }, + { + "epoch": 3.460679177837355, + "grad_norm": 1.6994131803512573, + "learning_rate": 4.919214597379161e-05, + "loss": 0.4153, + "num_input_tokens_seen": 13474440, + "step": 23235 + }, + { + "epoch": 3.461423890378314, + "grad_norm": 31.230619430541992, + "learning_rate": 4.919132639886306e-05, + "loss": 0.4339, + "num_input_tokens_seen": 13476968, + "step": 23240 + }, + { + "epoch": 3.4621686029192733, + "grad_norm": 11.898646354675293, + "learning_rate": 4.919050641524663e-05, + "loss": 0.4427, + "num_input_tokens_seen": 13479976, + "step": 23245 + }, + { + "epoch": 3.4629133154602325, + "grad_norm": 1.5569546222686768, + "learning_rate": 4.9189686022956195e-05, + "loss": 0.3741, + "num_input_tokens_seen": 13482888, + "step": 23250 + }, + { + "epoch": 3.4636580280011917, + "grad_norm": 5.261870861053467, + "learning_rate": 4.91888652220056e-05, + "loss": 0.33, + "num_input_tokens_seen": 13485704, + "step": 23255 + }, + { + "epoch": 3.464402740542151, + "grad_norm": 6.695267677307129, + "learning_rate": 4.91880440124087e-05, + "loss": 0.5761, + "num_input_tokens_seen": 13488520, + "step": 23260 + }, + { + "epoch": 3.4651474530831097, + "grad_norm": 11.450047492980957, + "learning_rate": 4.918722239417939e-05, + "loss": 0.0727, + "num_input_tokens_seen": 13491496, + "step": 23265 + }, + { + "epoch": 3.4658921656240693, + "grad_norm": 12.490949630737305, + "learning_rate": 4.918640036733154e-05, + "loss": 0.8791, + "num_input_tokens_seen": 13494376, + "step": 23270 + }, + { + "epoch": 3.466636878165028, + "grad_norm": 11.119885444641113, + "learning_rate": 4.9185577931879034e-05, + "loss": 0.5671, + "num_input_tokens_seen": 13497288, + "step": 23275 + }, + { + "epoch": 3.4673815907059877, + "grad_norm": 11.983973503112793, + "learning_rate": 4.9184755087835766e-05, + "loss": 0.4053, + "num_input_tokens_seen": 13500168, + "step": 23280 + }, + { + "epoch": 3.4681263032469465, + "grad_norm": 10.076369285583496, + "learning_rate": 4.9183931835215645e-05, + "loss": 0.3765, + "num_input_tokens_seen": 13502952, + "step": 23285 + }, + { + "epoch": 3.4688710157879057, + "grad_norm": 6.328466892242432, + "learning_rate": 4.918310817403258e-05, + "loss": 0.3494, + "num_input_tokens_seen": 13506216, + "step": 23290 + }, + { + "epoch": 3.469615728328865, + "grad_norm": 5.613236904144287, + "learning_rate": 4.918228410430048e-05, + "loss": 0.4005, + "num_input_tokens_seen": 13509672, + "step": 23295 + }, + { + "epoch": 3.470360440869824, + "grad_norm": 11.5214204788208, + "learning_rate": 4.918145962603326e-05, + "loss": 0.4473, + "num_input_tokens_seen": 13512840, + "step": 23300 + }, + { + "epoch": 3.4711051534107833, + "grad_norm": 13.233625411987305, + "learning_rate": 4.918063473924486e-05, + "loss": 0.4061, + "num_input_tokens_seen": 13515848, + "step": 23305 + }, + { + "epoch": 3.4718498659517425, + "grad_norm": 3.8237712383270264, + "learning_rate": 4.917980944394922e-05, + "loss": 0.4489, + "num_input_tokens_seen": 13518696, + "step": 23310 + }, + { + "epoch": 3.4725945784927017, + "grad_norm": 16.933156967163086, + "learning_rate": 4.9178983740160264e-05, + "loss": 0.7783, + "num_input_tokens_seen": 13521768, + "step": 23315 + }, + { + "epoch": 3.473339291033661, + "grad_norm": 7.718482971191406, + "learning_rate": 4.9178157627891956e-05, + "loss": 0.3488, + "num_input_tokens_seen": 13524456, + "step": 23320 + }, + { + "epoch": 3.47408400357462, + "grad_norm": 7.0370330810546875, + "learning_rate": 4.917733110715825e-05, + "loss": 0.2813, + "num_input_tokens_seen": 13527272, + "step": 23325 + }, + { + "epoch": 3.4748287161155793, + "grad_norm": 8.582797050476074, + "learning_rate": 4.9176504177973105e-05, + "loss": 0.3622, + "num_input_tokens_seen": 13530440, + "step": 23330 + }, + { + "epoch": 3.4755734286565385, + "grad_norm": 8.100451469421387, + "learning_rate": 4.91756768403505e-05, + "loss": 0.4239, + "num_input_tokens_seen": 13533256, + "step": 23335 + }, + { + "epoch": 3.4763181411974977, + "grad_norm": 4.688600063323975, + "learning_rate": 4.9174849094304396e-05, + "loss": 0.3299, + "num_input_tokens_seen": 13536136, + "step": 23340 + }, + { + "epoch": 3.477062853738457, + "grad_norm": 7.086465835571289, + "learning_rate": 4.91740209398488e-05, + "loss": 0.4805, + "num_input_tokens_seen": 13538920, + "step": 23345 + }, + { + "epoch": 3.477807566279416, + "grad_norm": 19.12757682800293, + "learning_rate": 4.917319237699768e-05, + "loss": 0.5427, + "num_input_tokens_seen": 13541896, + "step": 23350 + }, + { + "epoch": 3.4785522788203753, + "grad_norm": 7.716381072998047, + "learning_rate": 4.9172363405765044e-05, + "loss": 0.4919, + "num_input_tokens_seen": 13544552, + "step": 23355 + }, + { + "epoch": 3.4792969913613345, + "grad_norm": 2.1828386783599854, + "learning_rate": 4.91715340261649e-05, + "loss": 0.43, + "num_input_tokens_seen": 13547464, + "step": 23360 + }, + { + "epoch": 3.4800417039022937, + "grad_norm": 6.808770656585693, + "learning_rate": 4.917070423821125e-05, + "loss": 0.2915, + "num_input_tokens_seen": 13550216, + "step": 23365 + }, + { + "epoch": 3.480786416443253, + "grad_norm": 9.180678367614746, + "learning_rate": 4.9169874041918116e-05, + "loss": 0.2639, + "num_input_tokens_seen": 13553192, + "step": 23370 + }, + { + "epoch": 3.481531128984212, + "grad_norm": 3.8250620365142822, + "learning_rate": 4.916904343729954e-05, + "loss": 0.5084, + "num_input_tokens_seen": 13555912, + "step": 23375 + }, + { + "epoch": 3.4822758415251713, + "grad_norm": 2.8431386947631836, + "learning_rate": 4.916821242436952e-05, + "loss": 0.2685, + "num_input_tokens_seen": 13558664, + "step": 23380 + }, + { + "epoch": 3.4830205540661305, + "grad_norm": 9.848294258117676, + "learning_rate": 4.916738100314213e-05, + "loss": 0.3808, + "num_input_tokens_seen": 13561512, + "step": 23385 + }, + { + "epoch": 3.4837652666070897, + "grad_norm": 3.6324706077575684, + "learning_rate": 4.916654917363139e-05, + "loss": 0.3671, + "num_input_tokens_seen": 13564296, + "step": 23390 + }, + { + "epoch": 3.484509979148049, + "grad_norm": 3.594656229019165, + "learning_rate": 4.916571693585137e-05, + "loss": 0.4341, + "num_input_tokens_seen": 13567144, + "step": 23395 + }, + { + "epoch": 3.485254691689008, + "grad_norm": 10.600226402282715, + "learning_rate": 4.9164884289816115e-05, + "loss": 0.4077, + "num_input_tokens_seen": 13569896, + "step": 23400 + }, + { + "epoch": 3.4859994042299673, + "grad_norm": 14.614887237548828, + "learning_rate": 4.916405123553971e-05, + "loss": 0.4265, + "num_input_tokens_seen": 13572840, + "step": 23405 + }, + { + "epoch": 3.4867441167709265, + "grad_norm": 12.843442916870117, + "learning_rate": 4.9163217773036214e-05, + "loss": 0.3984, + "num_input_tokens_seen": 13575752, + "step": 23410 + }, + { + "epoch": 3.4874888293118858, + "grad_norm": 7.724523544311523, + "learning_rate": 4.916238390231971e-05, + "loss": 0.444, + "num_input_tokens_seen": 13578600, + "step": 23415 + }, + { + "epoch": 3.488233541852845, + "grad_norm": 3.9042458534240723, + "learning_rate": 4.916154962340429e-05, + "loss": 0.6016, + "num_input_tokens_seen": 13581288, + "step": 23420 + }, + { + "epoch": 3.488978254393804, + "grad_norm": 5.423035144805908, + "learning_rate": 4.916071493630405e-05, + "loss": 0.3152, + "num_input_tokens_seen": 13584072, + "step": 23425 + }, + { + "epoch": 3.4897229669347634, + "grad_norm": 13.94538688659668, + "learning_rate": 4.915987984103309e-05, + "loss": 0.5388, + "num_input_tokens_seen": 13587080, + "step": 23430 + }, + { + "epoch": 3.4904676794757226, + "grad_norm": 8.245246887207031, + "learning_rate": 4.9159044337605495e-05, + "loss": 0.3824, + "num_input_tokens_seen": 13589896, + "step": 23435 + }, + { + "epoch": 3.4912123920166813, + "grad_norm": 9.827716827392578, + "learning_rate": 4.915820842603542e-05, + "loss": 0.3332, + "num_input_tokens_seen": 13592712, + "step": 23440 + }, + { + "epoch": 3.491957104557641, + "grad_norm": 11.743494987487793, + "learning_rate": 4.9157372106336965e-05, + "loss": 0.5509, + "num_input_tokens_seen": 13595752, + "step": 23445 + }, + { + "epoch": 3.4927018170985997, + "grad_norm": 23.015579223632812, + "learning_rate": 4.915653537852425e-05, + "loss": 0.5139, + "num_input_tokens_seen": 13598312, + "step": 23450 + }, + { + "epoch": 3.4934465296395594, + "grad_norm": 12.563140869140625, + "learning_rate": 4.915569824261143e-05, + "loss": 0.2728, + "num_input_tokens_seen": 13601128, + "step": 23455 + }, + { + "epoch": 3.494191242180518, + "grad_norm": 3.766937732696533, + "learning_rate": 4.915486069861264e-05, + "loss": 0.4275, + "num_input_tokens_seen": 13603944, + "step": 23460 + }, + { + "epoch": 3.4949359547214773, + "grad_norm": 20.20008659362793, + "learning_rate": 4.915402274654202e-05, + "loss": 0.3887, + "num_input_tokens_seen": 13606568, + "step": 23465 + }, + { + "epoch": 3.4956806672624365, + "grad_norm": 9.793127059936523, + "learning_rate": 4.915318438641374e-05, + "loss": 0.3269, + "num_input_tokens_seen": 13609608, + "step": 23470 + }, + { + "epoch": 3.4964253798033957, + "grad_norm": 8.057459831237793, + "learning_rate": 4.915234561824196e-05, + "loss": 0.2903, + "num_input_tokens_seen": 13612552, + "step": 23475 + }, + { + "epoch": 3.497170092344355, + "grad_norm": 14.457361221313477, + "learning_rate": 4.915150644204084e-05, + "loss": 0.4976, + "num_input_tokens_seen": 13615560, + "step": 23480 + }, + { + "epoch": 3.497914804885314, + "grad_norm": 25.24042510986328, + "learning_rate": 4.915066685782457e-05, + "loss": 0.1128, + "num_input_tokens_seen": 13618312, + "step": 23485 + }, + { + "epoch": 3.4986595174262733, + "grad_norm": 11.082783699035645, + "learning_rate": 4.914982686560733e-05, + "loss": 0.558, + "num_input_tokens_seen": 13621064, + "step": 23490 + }, + { + "epoch": 3.4994042299672325, + "grad_norm": 9.094414710998535, + "learning_rate": 4.914898646540331e-05, + "loss": 0.3804, + "num_input_tokens_seen": 13624136, + "step": 23495 + }, + { + "epoch": 3.5001489425081918, + "grad_norm": 6.4736433029174805, + "learning_rate": 4.914814565722671e-05, + "loss": 0.4625, + "num_input_tokens_seen": 13627112, + "step": 23500 + }, + { + "epoch": 3.500893655049151, + "grad_norm": 2.484621047973633, + "learning_rate": 4.914730444109173e-05, + "loss": 0.3731, + "num_input_tokens_seen": 13629928, + "step": 23505 + }, + { + "epoch": 3.50163836759011, + "grad_norm": 11.128022193908691, + "learning_rate": 4.9146462817012586e-05, + "loss": 0.3917, + "num_input_tokens_seen": 13632904, + "step": 23510 + }, + { + "epoch": 3.5023830801310694, + "grad_norm": 18.938684463500977, + "learning_rate": 4.9145620785003485e-05, + "loss": 0.3212, + "num_input_tokens_seen": 13635656, + "step": 23515 + }, + { + "epoch": 3.5031277926720286, + "grad_norm": 4.072774887084961, + "learning_rate": 4.9144778345078665e-05, + "loss": 0.7142, + "num_input_tokens_seen": 13638440, + "step": 23520 + }, + { + "epoch": 3.5038725052129878, + "grad_norm": 13.18824577331543, + "learning_rate": 4.914393549725236e-05, + "loss": 0.504, + "num_input_tokens_seen": 13641352, + "step": 23525 + }, + { + "epoch": 3.504617217753947, + "grad_norm": 21.7359676361084, + "learning_rate": 4.91430922415388e-05, + "loss": 0.3768, + "num_input_tokens_seen": 13644264, + "step": 23530 + }, + { + "epoch": 3.505361930294906, + "grad_norm": 4.0368804931640625, + "learning_rate": 4.914224857795224e-05, + "loss": 0.3241, + "num_input_tokens_seen": 13647080, + "step": 23535 + }, + { + "epoch": 3.5061066428358654, + "grad_norm": 12.451681137084961, + "learning_rate": 4.914140450650692e-05, + "loss": 0.5235, + "num_input_tokens_seen": 13650216, + "step": 23540 + }, + { + "epoch": 3.5068513553768246, + "grad_norm": 11.565773963928223, + "learning_rate": 4.9140560027217106e-05, + "loss": 0.4255, + "num_input_tokens_seen": 13653352, + "step": 23545 + }, + { + "epoch": 3.5075960679177838, + "grad_norm": 5.595832824707031, + "learning_rate": 4.9139715140097075e-05, + "loss": 0.655, + "num_input_tokens_seen": 13656072, + "step": 23550 + }, + { + "epoch": 3.508340780458743, + "grad_norm": 6.59311056137085, + "learning_rate": 4.9138869845161086e-05, + "loss": 0.382, + "num_input_tokens_seen": 13658824, + "step": 23555 + }, + { + "epoch": 3.509085492999702, + "grad_norm": 6.519649028778076, + "learning_rate": 4.913802414242342e-05, + "loss": 0.3617, + "num_input_tokens_seen": 13661992, + "step": 23560 + }, + { + "epoch": 3.5098302055406614, + "grad_norm": 23.200681686401367, + "learning_rate": 4.913717803189838e-05, + "loss": 0.4462, + "num_input_tokens_seen": 13664904, + "step": 23565 + }, + { + "epoch": 3.5105749180816206, + "grad_norm": 8.421856880187988, + "learning_rate": 4.913633151360024e-05, + "loss": 0.4177, + "num_input_tokens_seen": 13667688, + "step": 23570 + }, + { + "epoch": 3.51131963062258, + "grad_norm": 13.96364974975586, + "learning_rate": 4.913548458754331e-05, + "loss": 0.39, + "num_input_tokens_seen": 13670440, + "step": 23575 + }, + { + "epoch": 3.512064343163539, + "grad_norm": 5.0734028816223145, + "learning_rate": 4.91346372537419e-05, + "loss": 0.2768, + "num_input_tokens_seen": 13673192, + "step": 23580 + }, + { + "epoch": 3.512809055704498, + "grad_norm": 8.15803337097168, + "learning_rate": 4.913378951221033e-05, + "loss": 0.4999, + "num_input_tokens_seen": 13675976, + "step": 23585 + }, + { + "epoch": 3.5135537682454574, + "grad_norm": 19.03618812561035, + "learning_rate": 4.9132941362962905e-05, + "loss": 0.5576, + "num_input_tokens_seen": 13678728, + "step": 23590 + }, + { + "epoch": 3.5142984807864166, + "grad_norm": 7.760671138763428, + "learning_rate": 4.913209280601396e-05, + "loss": 0.4714, + "num_input_tokens_seen": 13681544, + "step": 23595 + }, + { + "epoch": 3.515043193327376, + "grad_norm": 3.658720016479492, + "learning_rate": 4.913124384137784e-05, + "loss": 0.4078, + "num_input_tokens_seen": 13684392, + "step": 23600 + }, + { + "epoch": 3.5157879058683346, + "grad_norm": 6.749122619628906, + "learning_rate": 4.9130394469068886e-05, + "loss": 0.3109, + "num_input_tokens_seen": 13687144, + "step": 23605 + }, + { + "epoch": 3.516532618409294, + "grad_norm": 11.613603591918945, + "learning_rate": 4.9129544689101437e-05, + "loss": 0.2289, + "num_input_tokens_seen": 13690056, + "step": 23610 + }, + { + "epoch": 3.517277330950253, + "grad_norm": 3.601436138153076, + "learning_rate": 4.912869450148986e-05, + "loss": 0.2511, + "num_input_tokens_seen": 13692872, + "step": 23615 + }, + { + "epoch": 3.5180220434912126, + "grad_norm": 6.6598639488220215, + "learning_rate": 4.9127843906248504e-05, + "loss": 0.3306, + "num_input_tokens_seen": 13695720, + "step": 23620 + }, + { + "epoch": 3.5187667560321714, + "grad_norm": 1.200453281402588, + "learning_rate": 4.912699290339175e-05, + "loss": 0.3242, + "num_input_tokens_seen": 13698728, + "step": 23625 + }, + { + "epoch": 3.519511468573131, + "grad_norm": 8.923880577087402, + "learning_rate": 4.912614149293398e-05, + "loss": 0.6878, + "num_input_tokens_seen": 13701288, + "step": 23630 + }, + { + "epoch": 3.5202561811140898, + "grad_norm": 4.250435829162598, + "learning_rate": 4.9125289674889566e-05, + "loss": 0.3396, + "num_input_tokens_seen": 13704200, + "step": 23635 + }, + { + "epoch": 3.5210008936550494, + "grad_norm": 7.926689624786377, + "learning_rate": 4.91244374492729e-05, + "loss": 0.2868, + "num_input_tokens_seen": 13707208, + "step": 23640 + }, + { + "epoch": 3.521745606196008, + "grad_norm": 24.619935989379883, + "learning_rate": 4.912358481609838e-05, + "loss": 0.605, + "num_input_tokens_seen": 13710248, + "step": 23645 + }, + { + "epoch": 3.5224903187369674, + "grad_norm": 1.7618004083633423, + "learning_rate": 4.912273177538041e-05, + "loss": 0.1518, + "num_input_tokens_seen": 13713416, + "step": 23650 + }, + { + "epoch": 3.5232350312779266, + "grad_norm": 18.1259765625, + "learning_rate": 4.912187832713342e-05, + "loss": 0.6071, + "num_input_tokens_seen": 13716104, + "step": 23655 + }, + { + "epoch": 3.523979743818886, + "grad_norm": 22.59305763244629, + "learning_rate": 4.91210244713718e-05, + "loss": 0.5995, + "num_input_tokens_seen": 13718984, + "step": 23660 + }, + { + "epoch": 3.524724456359845, + "grad_norm": 5.966433525085449, + "learning_rate": 4.912017020810999e-05, + "loss": 0.6009, + "num_input_tokens_seen": 13721864, + "step": 23665 + }, + { + "epoch": 3.525469168900804, + "grad_norm": 11.623407363891602, + "learning_rate": 4.911931553736242e-05, + "loss": 0.2978, + "num_input_tokens_seen": 13724648, + "step": 23670 + }, + { + "epoch": 3.5262138814417634, + "grad_norm": 8.83418083190918, + "learning_rate": 4.9118460459143524e-05, + "loss": 0.483, + "num_input_tokens_seen": 13728104, + "step": 23675 + }, + { + "epoch": 3.5269585939827226, + "grad_norm": 11.944294929504395, + "learning_rate": 4.9117604973467756e-05, + "loss": 0.3277, + "num_input_tokens_seen": 13731080, + "step": 23680 + }, + { + "epoch": 3.527703306523682, + "grad_norm": 3.4239048957824707, + "learning_rate": 4.9116749080349556e-05, + "loss": 0.3164, + "num_input_tokens_seen": 13733800, + "step": 23685 + }, + { + "epoch": 3.528448019064641, + "grad_norm": 7.30644416809082, + "learning_rate": 4.911589277980339e-05, + "loss": 0.4657, + "num_input_tokens_seen": 13736776, + "step": 23690 + }, + { + "epoch": 3.5291927316056, + "grad_norm": 14.357654571533203, + "learning_rate": 4.911503607184375e-05, + "loss": 0.4163, + "num_input_tokens_seen": 13739912, + "step": 23695 + }, + { + "epoch": 3.5299374441465594, + "grad_norm": 24.438396453857422, + "learning_rate": 4.911417895648506e-05, + "loss": 0.3672, + "num_input_tokens_seen": 13743048, + "step": 23700 + }, + { + "epoch": 3.5306821566875186, + "grad_norm": 8.14791488647461, + "learning_rate": 4.9113321433741835e-05, + "loss": 0.3203, + "num_input_tokens_seen": 13746120, + "step": 23705 + }, + { + "epoch": 3.531426869228478, + "grad_norm": 17.341426849365234, + "learning_rate": 4.9112463503628545e-05, + "loss": 0.3569, + "num_input_tokens_seen": 13748808, + "step": 23710 + }, + { + "epoch": 3.532171581769437, + "grad_norm": 5.461435317993164, + "learning_rate": 4.91116051661597e-05, + "loss": 0.5111, + "num_input_tokens_seen": 13751720, + "step": 23715 + }, + { + "epoch": 3.532916294310396, + "grad_norm": 20.292329788208008, + "learning_rate": 4.911074642134979e-05, + "loss": 0.7546, + "num_input_tokens_seen": 13754376, + "step": 23720 + }, + { + "epoch": 3.5336610068513554, + "grad_norm": 10.986601829528809, + "learning_rate": 4.9109887269213315e-05, + "loss": 0.3383, + "num_input_tokens_seen": 13757544, + "step": 23725 + }, + { + "epoch": 3.5344057193923146, + "grad_norm": 3.1641390323638916, + "learning_rate": 4.910902770976481e-05, + "loss": 0.2549, + "num_input_tokens_seen": 13760296, + "step": 23730 + }, + { + "epoch": 3.535150431933274, + "grad_norm": 6.49847936630249, + "learning_rate": 4.910816774301878e-05, + "loss": 0.4959, + "num_input_tokens_seen": 13763144, + "step": 23735 + }, + { + "epoch": 3.535895144474233, + "grad_norm": 19.038776397705078, + "learning_rate": 4.910730736898976e-05, + "loss": 0.2996, + "num_input_tokens_seen": 13766024, + "step": 23740 + }, + { + "epoch": 3.5366398570151922, + "grad_norm": 7.186094284057617, + "learning_rate": 4.9106446587692276e-05, + "loss": 0.4983, + "num_input_tokens_seen": 13768872, + "step": 23745 + }, + { + "epoch": 3.5373845695561514, + "grad_norm": 3.1381287574768066, + "learning_rate": 4.910558539914088e-05, + "loss": 0.3046, + "num_input_tokens_seen": 13771784, + "step": 23750 + }, + { + "epoch": 3.5381292820971106, + "grad_norm": 19.22724151611328, + "learning_rate": 4.910472380335013e-05, + "loss": 0.6045, + "num_input_tokens_seen": 13774760, + "step": 23755 + }, + { + "epoch": 3.53887399463807, + "grad_norm": 2.598426342010498, + "learning_rate": 4.9103861800334567e-05, + "loss": 0.2805, + "num_input_tokens_seen": 13777832, + "step": 23760 + }, + { + "epoch": 3.539618707179029, + "grad_norm": 8.410475730895996, + "learning_rate": 4.9102999390108753e-05, + "loss": 0.253, + "num_input_tokens_seen": 13780616, + "step": 23765 + }, + { + "epoch": 3.5403634197199882, + "grad_norm": 6.352437496185303, + "learning_rate": 4.910213657268726e-05, + "loss": 0.5891, + "num_input_tokens_seen": 13783272, + "step": 23770 + }, + { + "epoch": 3.5411081322609474, + "grad_norm": 27.38022232055664, + "learning_rate": 4.910127334808466e-05, + "loss": 0.57, + "num_input_tokens_seen": 13785960, + "step": 23775 + }, + { + "epoch": 3.541852844801906, + "grad_norm": 8.030917167663574, + "learning_rate": 4.9100409716315556e-05, + "loss": 0.383, + "num_input_tokens_seen": 13788680, + "step": 23780 + }, + { + "epoch": 3.542597557342866, + "grad_norm": 13.795207977294922, + "learning_rate": 4.909954567739452e-05, + "loss": 0.2323, + "num_input_tokens_seen": 13791592, + "step": 23785 + }, + { + "epoch": 3.5433422698838246, + "grad_norm": 10.330897331237793, + "learning_rate": 4.909868123133615e-05, + "loss": 0.3301, + "num_input_tokens_seen": 13794344, + "step": 23790 + }, + { + "epoch": 3.5440869824247843, + "grad_norm": 2.6441261768341064, + "learning_rate": 4.909781637815506e-05, + "loss": 0.2741, + "num_input_tokens_seen": 13797288, + "step": 23795 + }, + { + "epoch": 3.544831694965743, + "grad_norm": 17.48191261291504, + "learning_rate": 4.909695111786584e-05, + "loss": 0.3842, + "num_input_tokens_seen": 13800168, + "step": 23800 + }, + { + "epoch": 3.5455764075067027, + "grad_norm": 6.444976329803467, + "learning_rate": 4.9096085450483134e-05, + "loss": 0.2925, + "num_input_tokens_seen": 13802984, + "step": 23805 + }, + { + "epoch": 3.5463211200476614, + "grad_norm": 13.128652572631836, + "learning_rate": 4.909521937602155e-05, + "loss": 0.3502, + "num_input_tokens_seen": 13805928, + "step": 23810 + }, + { + "epoch": 3.5470658325886206, + "grad_norm": 2.82637882232666, + "learning_rate": 4.909435289449573e-05, + "loss": 0.3926, + "num_input_tokens_seen": 13808936, + "step": 23815 + }, + { + "epoch": 3.54781054512958, + "grad_norm": 13.877197265625, + "learning_rate": 4.90934860059203e-05, + "loss": 0.4154, + "num_input_tokens_seen": 13811976, + "step": 23820 + }, + { + "epoch": 3.548555257670539, + "grad_norm": 23.36635398864746, + "learning_rate": 4.909261871030991e-05, + "loss": 0.4886, + "num_input_tokens_seen": 13814632, + "step": 23825 + }, + { + "epoch": 3.5492999702114982, + "grad_norm": 8.962882041931152, + "learning_rate": 4.9091751007679224e-05, + "loss": 0.3277, + "num_input_tokens_seen": 13817704, + "step": 23830 + }, + { + "epoch": 3.5500446827524574, + "grad_norm": 22.079261779785156, + "learning_rate": 4.9090882898042876e-05, + "loss": 0.4351, + "num_input_tokens_seen": 13821000, + "step": 23835 + }, + { + "epoch": 3.5507893952934166, + "grad_norm": 7.044853687286377, + "learning_rate": 4.909001438141556e-05, + "loss": 0.337, + "num_input_tokens_seen": 13823592, + "step": 23840 + }, + { + "epoch": 3.551534107834376, + "grad_norm": 9.029059410095215, + "learning_rate": 4.908914545781192e-05, + "loss": 0.3205, + "num_input_tokens_seen": 13826280, + "step": 23845 + }, + { + "epoch": 3.552278820375335, + "grad_norm": 11.551807403564453, + "learning_rate": 4.9088276127246666e-05, + "loss": 0.4399, + "num_input_tokens_seen": 13828840, + "step": 23850 + }, + { + "epoch": 3.5530235329162942, + "grad_norm": 11.766858100891113, + "learning_rate": 4.9087406389734465e-05, + "loss": 0.4502, + "num_input_tokens_seen": 13831592, + "step": 23855 + }, + { + "epoch": 3.5537682454572534, + "grad_norm": 15.149577140808105, + "learning_rate": 4.908653624529001e-05, + "loss": 0.3304, + "num_input_tokens_seen": 13834344, + "step": 23860 + }, + { + "epoch": 3.5545129579982127, + "grad_norm": 16.564353942871094, + "learning_rate": 4.908566569392801e-05, + "loss": 0.3638, + "num_input_tokens_seen": 13837384, + "step": 23865 + }, + { + "epoch": 3.555257670539172, + "grad_norm": 7.201262950897217, + "learning_rate": 4.908479473566316e-05, + "loss": 0.4129, + "num_input_tokens_seen": 13840072, + "step": 23870 + }, + { + "epoch": 3.556002383080131, + "grad_norm": 3.2164547443389893, + "learning_rate": 4.9083923370510184e-05, + "loss": 0.2267, + "num_input_tokens_seen": 13842728, + "step": 23875 + }, + { + "epoch": 3.5567470956210903, + "grad_norm": 12.226296424865723, + "learning_rate": 4.908305159848381e-05, + "loss": 0.361, + "num_input_tokens_seen": 13845864, + "step": 23880 + }, + { + "epoch": 3.5574918081620495, + "grad_norm": 12.942992210388184, + "learning_rate": 4.908217941959875e-05, + "loss": 0.6051, + "num_input_tokens_seen": 13848648, + "step": 23885 + }, + { + "epoch": 3.5582365207030087, + "grad_norm": 13.55399227142334, + "learning_rate": 4.908130683386974e-05, + "loss": 0.4009, + "num_input_tokens_seen": 13851688, + "step": 23890 + }, + { + "epoch": 3.558981233243968, + "grad_norm": 24.71256446838379, + "learning_rate": 4.9080433841311526e-05, + "loss": 0.3053, + "num_input_tokens_seen": 13854664, + "step": 23895 + }, + { + "epoch": 3.559725945784927, + "grad_norm": 0.5736576914787292, + "learning_rate": 4.9079560441938865e-05, + "loss": 0.2535, + "num_input_tokens_seen": 13857512, + "step": 23900 + }, + { + "epoch": 3.5604706583258863, + "grad_norm": 13.803542137145996, + "learning_rate": 4.90786866357665e-05, + "loss": 0.4202, + "num_input_tokens_seen": 13860776, + "step": 23905 + }, + { + "epoch": 3.5612153708668455, + "grad_norm": 34.155941009521484, + "learning_rate": 4.90778124228092e-05, + "loss": 0.2982, + "num_input_tokens_seen": 13863816, + "step": 23910 + }, + { + "epoch": 3.5619600834078047, + "grad_norm": 14.180377006530762, + "learning_rate": 4.907693780308172e-05, + "loss": 0.3513, + "num_input_tokens_seen": 13866568, + "step": 23915 + }, + { + "epoch": 3.562704795948764, + "grad_norm": 11.338881492614746, + "learning_rate": 4.907606277659885e-05, + "loss": 0.5366, + "num_input_tokens_seen": 13869384, + "step": 23920 + }, + { + "epoch": 3.563449508489723, + "grad_norm": 3.93795108795166, + "learning_rate": 4.907518734337538e-05, + "loss": 0.4379, + "num_input_tokens_seen": 13871912, + "step": 23925 + }, + { + "epoch": 3.5641942210306823, + "grad_norm": 2.7322704792022705, + "learning_rate": 4.907431150342608e-05, + "loss": 0.4969, + "num_input_tokens_seen": 13874888, + "step": 23930 + }, + { + "epoch": 3.5649389335716415, + "grad_norm": 9.373147010803223, + "learning_rate": 4.907343525676575e-05, + "loss": 0.2387, + "num_input_tokens_seen": 13877960, + "step": 23935 + }, + { + "epoch": 3.5656836461126007, + "grad_norm": 20.487939834594727, + "learning_rate": 4.9072558603409216e-05, + "loss": 0.2828, + "num_input_tokens_seen": 13880744, + "step": 23940 + }, + { + "epoch": 3.5664283586535594, + "grad_norm": 13.593388557434082, + "learning_rate": 4.907168154337125e-05, + "loss": 0.3615, + "num_input_tokens_seen": 13883624, + "step": 23945 + }, + { + "epoch": 3.567173071194519, + "grad_norm": 8.054205894470215, + "learning_rate": 4.90708040766667e-05, + "loss": 0.6698, + "num_input_tokens_seen": 13887080, + "step": 23950 + }, + { + "epoch": 3.567917783735478, + "grad_norm": 15.92448902130127, + "learning_rate": 4.906992620331038e-05, + "loss": 0.5271, + "num_input_tokens_seen": 13890184, + "step": 23955 + }, + { + "epoch": 3.5686624962764375, + "grad_norm": 21.241701126098633, + "learning_rate": 4.906904792331712e-05, + "loss": 0.4586, + "num_input_tokens_seen": 13893032, + "step": 23960 + }, + { + "epoch": 3.5694072088173963, + "grad_norm": 9.436457633972168, + "learning_rate": 4.906816923670176e-05, + "loss": 0.217, + "num_input_tokens_seen": 13896488, + "step": 23965 + }, + { + "epoch": 3.570151921358356, + "grad_norm": 5.621677875518799, + "learning_rate": 4.906729014347914e-05, + "loss": 0.4308, + "num_input_tokens_seen": 13899272, + "step": 23970 + }, + { + "epoch": 3.5708966338993147, + "grad_norm": 8.985726356506348, + "learning_rate": 4.9066410643664113e-05, + "loss": 0.543, + "num_input_tokens_seen": 13902088, + "step": 23975 + }, + { + "epoch": 3.5716413464402743, + "grad_norm": 4.5002923011779785, + "learning_rate": 4.906553073727154e-05, + "loss": 0.313, + "num_input_tokens_seen": 13904744, + "step": 23980 + }, + { + "epoch": 3.572386058981233, + "grad_norm": 4.809476375579834, + "learning_rate": 4.9064650424316284e-05, + "loss": 0.4466, + "num_input_tokens_seen": 13907528, + "step": 23985 + }, + { + "epoch": 3.5731307715221923, + "grad_norm": 5.773748874664307, + "learning_rate": 4.906376970481321e-05, + "loss": 0.1912, + "num_input_tokens_seen": 13910632, + "step": 23990 + }, + { + "epoch": 3.5738754840631515, + "grad_norm": 12.628750801086426, + "learning_rate": 4.9062888578777214e-05, + "loss": 0.3473, + "num_input_tokens_seen": 13913480, + "step": 23995 + }, + { + "epoch": 3.5746201966041107, + "grad_norm": 16.44523811340332, + "learning_rate": 4.906200704622317e-05, + "loss": 0.6579, + "num_input_tokens_seen": 13916456, + "step": 24000 + }, + { + "epoch": 3.57536490914507, + "grad_norm": 4.8737030029296875, + "learning_rate": 4.906112510716597e-05, + "loss": 0.5905, + "num_input_tokens_seen": 13919240, + "step": 24005 + }, + { + "epoch": 3.576109621686029, + "grad_norm": 10.341569900512695, + "learning_rate": 4.906024276162052e-05, + "loss": 0.33, + "num_input_tokens_seen": 13922248, + "step": 24010 + }, + { + "epoch": 3.5768543342269883, + "grad_norm": 12.692519187927246, + "learning_rate": 4.905936000960172e-05, + "loss": 0.3201, + "num_input_tokens_seen": 13924968, + "step": 24015 + }, + { + "epoch": 3.5775990467679475, + "grad_norm": 17.844484329223633, + "learning_rate": 4.905847685112448e-05, + "loss": 0.2374, + "num_input_tokens_seen": 13927816, + "step": 24020 + }, + { + "epoch": 3.5783437593089067, + "grad_norm": 16.96192741394043, + "learning_rate": 4.905759328620373e-05, + "loss": 0.3274, + "num_input_tokens_seen": 13930632, + "step": 24025 + }, + { + "epoch": 3.579088471849866, + "grad_norm": 37.585296630859375, + "learning_rate": 4.90567093148544e-05, + "loss": 0.5234, + "num_input_tokens_seen": 13933480, + "step": 24030 + }, + { + "epoch": 3.579833184390825, + "grad_norm": 24.938392639160156, + "learning_rate": 4.9055824937091406e-05, + "loss": 0.2818, + "num_input_tokens_seen": 13936232, + "step": 24035 + }, + { + "epoch": 3.5805778969317843, + "grad_norm": 12.519414901733398, + "learning_rate": 4.9054940152929704e-05, + "loss": 0.5224, + "num_input_tokens_seen": 13939368, + "step": 24040 + }, + { + "epoch": 3.5813226094727435, + "grad_norm": 15.996833801269531, + "learning_rate": 4.9054054962384235e-05, + "loss": 0.5002, + "num_input_tokens_seen": 13942152, + "step": 24045 + }, + { + "epoch": 3.5820673220137027, + "grad_norm": 10.726949691772461, + "learning_rate": 4.905316936546995e-05, + "loss": 0.3896, + "num_input_tokens_seen": 13945032, + "step": 24050 + }, + { + "epoch": 3.582812034554662, + "grad_norm": 13.3328218460083, + "learning_rate": 4.9052283362201823e-05, + "loss": 0.4864, + "num_input_tokens_seen": 13947816, + "step": 24055 + }, + { + "epoch": 3.583556747095621, + "grad_norm": 22.891971588134766, + "learning_rate": 4.9051396952594806e-05, + "loss": 0.4487, + "num_input_tokens_seen": 13950760, + "step": 24060 + }, + { + "epoch": 3.5843014596365803, + "grad_norm": 7.057608604431152, + "learning_rate": 4.905051013666389e-05, + "loss": 0.3549, + "num_input_tokens_seen": 13953608, + "step": 24065 + }, + { + "epoch": 3.5850461721775395, + "grad_norm": 30.980716705322266, + "learning_rate": 4.904962291442404e-05, + "loss": 0.4036, + "num_input_tokens_seen": 13956488, + "step": 24070 + }, + { + "epoch": 3.5857908847184987, + "grad_norm": 11.350682258605957, + "learning_rate": 4.904873528589027e-05, + "loss": 0.2883, + "num_input_tokens_seen": 13959080, + "step": 24075 + }, + { + "epoch": 3.586535597259458, + "grad_norm": 6.4165568351745605, + "learning_rate": 4.9047847251077544e-05, + "loss": 0.704, + "num_input_tokens_seen": 13961800, + "step": 24080 + }, + { + "epoch": 3.587280309800417, + "grad_norm": 3.125576972961426, + "learning_rate": 4.904695881000089e-05, + "loss": 0.2856, + "num_input_tokens_seen": 13964648, + "step": 24085 + }, + { + "epoch": 3.5880250223413763, + "grad_norm": 10.126800537109375, + "learning_rate": 4.90460699626753e-05, + "loss": 0.4376, + "num_input_tokens_seen": 13967528, + "step": 24090 + }, + { + "epoch": 3.5887697348823355, + "grad_norm": 8.911678314208984, + "learning_rate": 4.90451807091158e-05, + "loss": 0.4166, + "num_input_tokens_seen": 13970696, + "step": 24095 + }, + { + "epoch": 3.5895144474232947, + "grad_norm": 10.306991577148438, + "learning_rate": 4.904429104933741e-05, + "loss": 0.6338, + "num_input_tokens_seen": 13973736, + "step": 24100 + }, + { + "epoch": 3.590259159964254, + "grad_norm": 16.04872703552246, + "learning_rate": 4.904340098335516e-05, + "loss": 0.3015, + "num_input_tokens_seen": 13976840, + "step": 24105 + }, + { + "epoch": 3.591003872505213, + "grad_norm": 8.266858100891113, + "learning_rate": 4.904251051118408e-05, + "loss": 0.553, + "num_input_tokens_seen": 13979688, + "step": 24110 + }, + { + "epoch": 3.5917485850461723, + "grad_norm": 13.733492851257324, + "learning_rate": 4.904161963283923e-05, + "loss": 0.6262, + "num_input_tokens_seen": 13982504, + "step": 24115 + }, + { + "epoch": 3.592493297587131, + "grad_norm": 6.441036224365234, + "learning_rate": 4.9040728348335655e-05, + "loss": 0.4641, + "num_input_tokens_seen": 13985320, + "step": 24120 + }, + { + "epoch": 3.5932380101280907, + "grad_norm": 11.169591903686523, + "learning_rate": 4.90398366576884e-05, + "loss": 0.2345, + "num_input_tokens_seen": 13988040, + "step": 24125 + }, + { + "epoch": 3.5939827226690495, + "grad_norm": 1.854432225227356, + "learning_rate": 4.903894456091254e-05, + "loss": 0.4362, + "num_input_tokens_seen": 13991176, + "step": 24130 + }, + { + "epoch": 3.594727435210009, + "grad_norm": 37.10995864868164, + "learning_rate": 4.903805205802314e-05, + "loss": 0.2728, + "num_input_tokens_seen": 13994088, + "step": 24135 + }, + { + "epoch": 3.595472147750968, + "grad_norm": 6.258188724517822, + "learning_rate": 4.903715914903529e-05, + "loss": 0.454, + "num_input_tokens_seen": 13997096, + "step": 24140 + }, + { + "epoch": 3.5962168602919276, + "grad_norm": 14.597258567810059, + "learning_rate": 4.9036265833964057e-05, + "loss": 0.5606, + "num_input_tokens_seen": 14000072, + "step": 24145 + }, + { + "epoch": 3.5969615728328863, + "grad_norm": 1.5388518571853638, + "learning_rate": 4.903537211282455e-05, + "loss": 0.2531, + "num_input_tokens_seen": 14002728, + "step": 24150 + }, + { + "epoch": 3.597706285373846, + "grad_norm": 8.114012718200684, + "learning_rate": 4.9034477985631854e-05, + "loss": 0.3349, + "num_input_tokens_seen": 14005640, + "step": 24155 + }, + { + "epoch": 3.5984509979148047, + "grad_norm": 0.9489611387252808, + "learning_rate": 4.903358345240109e-05, + "loss": 0.4013, + "num_input_tokens_seen": 14008488, + "step": 24160 + }, + { + "epoch": 3.599195710455764, + "grad_norm": 18.65876007080078, + "learning_rate": 4.903268851314735e-05, + "loss": 0.5635, + "num_input_tokens_seen": 14011432, + "step": 24165 + }, + { + "epoch": 3.599940422996723, + "grad_norm": 23.23194122314453, + "learning_rate": 4.903179316788577e-05, + "loss": 0.4665, + "num_input_tokens_seen": 14014280, + "step": 24170 + }, + { + "epoch": 3.6006851355376823, + "grad_norm": 15.989681243896484, + "learning_rate": 4.903089741663146e-05, + "loss": 0.3246, + "num_input_tokens_seen": 14017128, + "step": 24175 + }, + { + "epoch": 3.6014298480786415, + "grad_norm": 10.762147903442383, + "learning_rate": 4.9030001259399563e-05, + "loss": 0.5158, + "num_input_tokens_seen": 14020200, + "step": 24180 + }, + { + "epoch": 3.6021745606196007, + "grad_norm": 6.843639850616455, + "learning_rate": 4.9029104696205225e-05, + "loss": 0.3267, + "num_input_tokens_seen": 14023304, + "step": 24185 + }, + { + "epoch": 3.60291927316056, + "grad_norm": 8.095023155212402, + "learning_rate": 4.9028207727063576e-05, + "loss": 0.1915, + "num_input_tokens_seen": 14026120, + "step": 24190 + }, + { + "epoch": 3.603663985701519, + "grad_norm": 12.40335464477539, + "learning_rate": 4.902731035198979e-05, + "loss": 0.2847, + "num_input_tokens_seen": 14029256, + "step": 24195 + }, + { + "epoch": 3.6044086982424783, + "grad_norm": 7.890224933624268, + "learning_rate": 4.902641257099901e-05, + "loss": 0.2143, + "num_input_tokens_seen": 14032040, + "step": 24200 + }, + { + "epoch": 3.6051534107834375, + "grad_norm": 23.866825103759766, + "learning_rate": 4.9025514384106414e-05, + "loss": 0.4269, + "num_input_tokens_seen": 14034888, + "step": 24205 + }, + { + "epoch": 3.6058981233243967, + "grad_norm": 0.19933012127876282, + "learning_rate": 4.902461579132717e-05, + "loss": 0.5412, + "num_input_tokens_seen": 14037768, + "step": 24210 + }, + { + "epoch": 3.606642835865356, + "grad_norm": 5.828395843505859, + "learning_rate": 4.902371679267646e-05, + "loss": 0.4376, + "num_input_tokens_seen": 14040392, + "step": 24215 + }, + { + "epoch": 3.607387548406315, + "grad_norm": 4.321510314941406, + "learning_rate": 4.9022817388169464e-05, + "loss": 0.3394, + "num_input_tokens_seen": 14043208, + "step": 24220 + }, + { + "epoch": 3.6081322609472744, + "grad_norm": 16.85309410095215, + "learning_rate": 4.9021917577821386e-05, + "loss": 0.5574, + "num_input_tokens_seen": 14046024, + "step": 24225 + }, + { + "epoch": 3.6088769734882336, + "grad_norm": 9.50383186340332, + "learning_rate": 4.902101736164742e-05, + "loss": 0.3973, + "num_input_tokens_seen": 14048904, + "step": 24230 + }, + { + "epoch": 3.6096216860291928, + "grad_norm": 3.8921239376068115, + "learning_rate": 4.902011673966279e-05, + "loss": 0.3469, + "num_input_tokens_seen": 14052296, + "step": 24235 + }, + { + "epoch": 3.610366398570152, + "grad_norm": 4.041768550872803, + "learning_rate": 4.90192157118827e-05, + "loss": 0.2337, + "num_input_tokens_seen": 14055016, + "step": 24240 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 14.113059997558594, + "learning_rate": 4.901831427832237e-05, + "loss": 0.4506, + "num_input_tokens_seen": 14057800, + "step": 24245 + }, + { + "epoch": 3.6118558236520704, + "grad_norm": 21.94554328918457, + "learning_rate": 4.9017412438997026e-05, + "loss": 0.2238, + "num_input_tokens_seen": 14060680, + "step": 24250 + }, + { + "epoch": 3.6126005361930296, + "grad_norm": 15.818258285522461, + "learning_rate": 4.901651019392191e-05, + "loss": 0.5511, + "num_input_tokens_seen": 14063304, + "step": 24255 + }, + { + "epoch": 3.6133452487339888, + "grad_norm": 15.0120210647583, + "learning_rate": 4.901560754311227e-05, + "loss": 0.4503, + "num_input_tokens_seen": 14066408, + "step": 24260 + }, + { + "epoch": 3.614089961274948, + "grad_norm": 18.512714385986328, + "learning_rate": 4.901470448658335e-05, + "loss": 0.3103, + "num_input_tokens_seen": 14069288, + "step": 24265 + }, + { + "epoch": 3.614834673815907, + "grad_norm": 1.9315931797027588, + "learning_rate": 4.9013801024350406e-05, + "loss": 0.2042, + "num_input_tokens_seen": 14072104, + "step": 24270 + }, + { + "epoch": 3.6155793863568664, + "grad_norm": 29.451358795166016, + "learning_rate": 4.9012897156428694e-05, + "loss": 0.2582, + "num_input_tokens_seen": 14074920, + "step": 24275 + }, + { + "epoch": 3.6163240988978256, + "grad_norm": 24.56781578063965, + "learning_rate": 4.901199288283349e-05, + "loss": 0.3097, + "num_input_tokens_seen": 14077800, + "step": 24280 + }, + { + "epoch": 3.617068811438785, + "grad_norm": 19.882686614990234, + "learning_rate": 4.901108820358008e-05, + "loss": 0.7283, + "num_input_tokens_seen": 14080456, + "step": 24285 + }, + { + "epoch": 3.617813523979744, + "grad_norm": 0.1423032432794571, + "learning_rate": 4.901018311868373e-05, + "loss": 0.3987, + "num_input_tokens_seen": 14083176, + "step": 24290 + }, + { + "epoch": 3.6185582365207027, + "grad_norm": 15.891279220581055, + "learning_rate": 4.9009277628159744e-05, + "loss": 0.692, + "num_input_tokens_seen": 14085832, + "step": 24295 + }, + { + "epoch": 3.6193029490616624, + "grad_norm": 4.792185306549072, + "learning_rate": 4.900837173202341e-05, + "loss": 0.4291, + "num_input_tokens_seen": 14088616, + "step": 24300 + }, + { + "epoch": 3.620047661602621, + "grad_norm": 5.872025966644287, + "learning_rate": 4.900746543029003e-05, + "loss": 0.3759, + "num_input_tokens_seen": 14091240, + "step": 24305 + }, + { + "epoch": 3.620792374143581, + "grad_norm": 6.419299602508545, + "learning_rate": 4.900655872297494e-05, + "loss": 0.3154, + "num_input_tokens_seen": 14094152, + "step": 24310 + }, + { + "epoch": 3.6215370866845396, + "grad_norm": 18.039953231811523, + "learning_rate": 4.900565161009343e-05, + "loss": 0.6705, + "num_input_tokens_seen": 14097192, + "step": 24315 + }, + { + "epoch": 3.622281799225499, + "grad_norm": 7.981554985046387, + "learning_rate": 4.9004744091660826e-05, + "loss": 0.352, + "num_input_tokens_seen": 14099912, + "step": 24320 + }, + { + "epoch": 3.623026511766458, + "grad_norm": 32.98520278930664, + "learning_rate": 4.900383616769247e-05, + "loss": 0.3423, + "num_input_tokens_seen": 14103112, + "step": 24325 + }, + { + "epoch": 3.6237712243074176, + "grad_norm": 10.820027351379395, + "learning_rate": 4.900292783820371e-05, + "loss": 0.5938, + "num_input_tokens_seen": 14106216, + "step": 24330 + }, + { + "epoch": 3.6245159368483764, + "grad_norm": 8.412002563476562, + "learning_rate": 4.9002019103209875e-05, + "loss": 0.4943, + "num_input_tokens_seen": 14109128, + "step": 24335 + }, + { + "epoch": 3.6252606493893356, + "grad_norm": 9.02131462097168, + "learning_rate": 4.9001109962726323e-05, + "loss": 0.6252, + "num_input_tokens_seen": 14111592, + "step": 24340 + }, + { + "epoch": 3.6260053619302948, + "grad_norm": 2.9446475505828857, + "learning_rate": 4.9000200416768405e-05, + "loss": 0.2626, + "num_input_tokens_seen": 14113992, + "step": 24345 + }, + { + "epoch": 3.626750074471254, + "grad_norm": 13.066225051879883, + "learning_rate": 4.89992904653515e-05, + "loss": 0.6267, + "num_input_tokens_seen": 14116776, + "step": 24350 + }, + { + "epoch": 3.627494787012213, + "grad_norm": 5.035510063171387, + "learning_rate": 4.899838010849097e-05, + "loss": 0.5208, + "num_input_tokens_seen": 14119592, + "step": 24355 + }, + { + "epoch": 3.6282394995531724, + "grad_norm": 11.141050338745117, + "learning_rate": 4.89974693462022e-05, + "loss": 0.4338, + "num_input_tokens_seen": 14122600, + "step": 24360 + }, + { + "epoch": 3.6289842120941316, + "grad_norm": 8.07302188873291, + "learning_rate": 4.899655817850058e-05, + "loss": 0.3117, + "num_input_tokens_seen": 14125416, + "step": 24365 + }, + { + "epoch": 3.629728924635091, + "grad_norm": 11.471197128295898, + "learning_rate": 4.899564660540149e-05, + "loss": 0.2808, + "num_input_tokens_seen": 14128424, + "step": 24370 + }, + { + "epoch": 3.63047363717605, + "grad_norm": 10.302956581115723, + "learning_rate": 4.899473462692035e-05, + "loss": 0.5419, + "num_input_tokens_seen": 14131368, + "step": 24375 + }, + { + "epoch": 3.631218349717009, + "grad_norm": 14.93570327758789, + "learning_rate": 4.899382224307255e-05, + "loss": 0.3508, + "num_input_tokens_seen": 14134312, + "step": 24380 + }, + { + "epoch": 3.6319630622579684, + "grad_norm": 14.972525596618652, + "learning_rate": 4.8992909453873505e-05, + "loss": 0.3401, + "num_input_tokens_seen": 14137224, + "step": 24385 + }, + { + "epoch": 3.6327077747989276, + "grad_norm": 3.6557204723358154, + "learning_rate": 4.899199625933865e-05, + "loss": 0.5186, + "num_input_tokens_seen": 14140264, + "step": 24390 + }, + { + "epoch": 3.633452487339887, + "grad_norm": 6.663401126861572, + "learning_rate": 4.899108265948339e-05, + "loss": 0.4078, + "num_input_tokens_seen": 14143048, + "step": 24395 + }, + { + "epoch": 3.634197199880846, + "grad_norm": 5.4431281089782715, + "learning_rate": 4.899016865432318e-05, + "loss": 0.5532, + "num_input_tokens_seen": 14146088, + "step": 24400 + }, + { + "epoch": 3.634941912421805, + "grad_norm": 8.836599349975586, + "learning_rate": 4.898925424387345e-05, + "loss": 0.4202, + "num_input_tokens_seen": 14149000, + "step": 24405 + }, + { + "epoch": 3.6356866249627644, + "grad_norm": 5.046600341796875, + "learning_rate": 4.8988339428149656e-05, + "loss": 0.3798, + "num_input_tokens_seen": 14151944, + "step": 24410 + }, + { + "epoch": 3.6364313375037236, + "grad_norm": 10.682574272155762, + "learning_rate": 4.898742420716724e-05, + "loss": 0.4646, + "num_input_tokens_seen": 14155208, + "step": 24415 + }, + { + "epoch": 3.637176050044683, + "grad_norm": 5.767951011657715, + "learning_rate": 4.898650858094168e-05, + "loss": 0.3098, + "num_input_tokens_seen": 14158216, + "step": 24420 + }, + { + "epoch": 3.637920762585642, + "grad_norm": 6.407562732696533, + "learning_rate": 4.898559254948843e-05, + "loss": 0.5216, + "num_input_tokens_seen": 14161064, + "step": 24425 + }, + { + "epoch": 3.638665475126601, + "grad_norm": 9.007087707519531, + "learning_rate": 4.898467611282297e-05, + "loss": 0.4498, + "num_input_tokens_seen": 14163848, + "step": 24430 + }, + { + "epoch": 3.6394101876675604, + "grad_norm": 5.038474082946777, + "learning_rate": 4.8983759270960796e-05, + "loss": 0.2867, + "num_input_tokens_seen": 14166696, + "step": 24435 + }, + { + "epoch": 3.6401549002085196, + "grad_norm": 10.991198539733887, + "learning_rate": 4.8982842023917374e-05, + "loss": 0.402, + "num_input_tokens_seen": 14169512, + "step": 24440 + }, + { + "epoch": 3.640899612749479, + "grad_norm": 7.638543128967285, + "learning_rate": 4.898192437170822e-05, + "loss": 0.2716, + "num_input_tokens_seen": 14172296, + "step": 24445 + }, + { + "epoch": 3.641644325290438, + "grad_norm": 11.904006958007812, + "learning_rate": 4.898100631434882e-05, + "loss": 0.5625, + "num_input_tokens_seen": 14175016, + "step": 24450 + }, + { + "epoch": 3.6423890378313972, + "grad_norm": 9.235105514526367, + "learning_rate": 4.898008785185469e-05, + "loss": 0.432, + "num_input_tokens_seen": 14178568, + "step": 24455 + }, + { + "epoch": 3.6431337503723564, + "grad_norm": 4.282695293426514, + "learning_rate": 4.8979168984241354e-05, + "loss": 0.4642, + "num_input_tokens_seen": 14181512, + "step": 24460 + }, + { + "epoch": 3.6438784629133156, + "grad_norm": 9.822853088378906, + "learning_rate": 4.8978249711524324e-05, + "loss": 0.529, + "num_input_tokens_seen": 14184712, + "step": 24465 + }, + { + "epoch": 3.6446231754542744, + "grad_norm": 11.065465927124023, + "learning_rate": 4.8977330033719147e-05, + "loss": 0.3101, + "num_input_tokens_seen": 14187464, + "step": 24470 + }, + { + "epoch": 3.645367887995234, + "grad_norm": 8.440125465393066, + "learning_rate": 4.897640995084133e-05, + "loss": 0.29, + "num_input_tokens_seen": 14190440, + "step": 24475 + }, + { + "epoch": 3.646112600536193, + "grad_norm": 20.817277908325195, + "learning_rate": 4.8975489462906456e-05, + "loss": 0.5193, + "num_input_tokens_seen": 14193384, + "step": 24480 + }, + { + "epoch": 3.6468573130771524, + "grad_norm": 16.636259078979492, + "learning_rate": 4.897456856993004e-05, + "loss": 0.1726, + "num_input_tokens_seen": 14196520, + "step": 24485 + }, + { + "epoch": 3.647602025618111, + "grad_norm": 3.212015151977539, + "learning_rate": 4.897364727192766e-05, + "loss": 0.2468, + "num_input_tokens_seen": 14199464, + "step": 24490 + }, + { + "epoch": 3.648346738159071, + "grad_norm": 15.396971702575684, + "learning_rate": 4.897272556891487e-05, + "loss": 0.5174, + "num_input_tokens_seen": 14202280, + "step": 24495 + }, + { + "epoch": 3.6490914507000296, + "grad_norm": 26.726974487304688, + "learning_rate": 4.897180346090726e-05, + "loss": 0.6175, + "num_input_tokens_seen": 14205288, + "step": 24500 + }, + { + "epoch": 3.6498361632409893, + "grad_norm": 5.154280662536621, + "learning_rate": 4.8970880947920386e-05, + "loss": 0.3507, + "num_input_tokens_seen": 14208456, + "step": 24505 + }, + { + "epoch": 3.650580875781948, + "grad_norm": 7.632009029388428, + "learning_rate": 4.8969958029969834e-05, + "loss": 0.5044, + "num_input_tokens_seen": 14211432, + "step": 24510 + }, + { + "epoch": 3.651325588322907, + "grad_norm": 3.0382189750671387, + "learning_rate": 4.896903470707121e-05, + "loss": 0.7653, + "num_input_tokens_seen": 14214408, + "step": 24515 + }, + { + "epoch": 3.6520703008638664, + "grad_norm": 15.990300178527832, + "learning_rate": 4.89681109792401e-05, + "loss": 0.5198, + "num_input_tokens_seen": 14217128, + "step": 24520 + }, + { + "epoch": 3.6528150134048256, + "grad_norm": 9.147013664245605, + "learning_rate": 4.896718684649213e-05, + "loss": 0.6693, + "num_input_tokens_seen": 14219976, + "step": 24525 + }, + { + "epoch": 3.653559725945785, + "grad_norm": 5.637096405029297, + "learning_rate": 4.8966262308842885e-05, + "loss": 0.4066, + "num_input_tokens_seen": 14222856, + "step": 24530 + }, + { + "epoch": 3.654304438486744, + "grad_norm": 10.334663391113281, + "learning_rate": 4.8965337366308e-05, + "loss": 0.5653, + "num_input_tokens_seen": 14225704, + "step": 24535 + }, + { + "epoch": 3.6550491510277032, + "grad_norm": 8.793766975402832, + "learning_rate": 4.896441201890309e-05, + "loss": 0.5597, + "num_input_tokens_seen": 14228520, + "step": 24540 + }, + { + "epoch": 3.6557938635686624, + "grad_norm": 6.60293436050415, + "learning_rate": 4.896348626664381e-05, + "loss": 0.3087, + "num_input_tokens_seen": 14231624, + "step": 24545 + }, + { + "epoch": 3.6565385761096216, + "grad_norm": 15.717781066894531, + "learning_rate": 4.896256010954578e-05, + "loss": 0.4246, + "num_input_tokens_seen": 14234568, + "step": 24550 + }, + { + "epoch": 3.657283288650581, + "grad_norm": 18.56627655029297, + "learning_rate": 4.896163354762464e-05, + "loss": 0.6674, + "num_input_tokens_seen": 14237704, + "step": 24555 + }, + { + "epoch": 3.65802800119154, + "grad_norm": 24.280757904052734, + "learning_rate": 4.8960706580896066e-05, + "loss": 0.7133, + "num_input_tokens_seen": 14240616, + "step": 24560 + }, + { + "epoch": 3.6587727137324992, + "grad_norm": 5.087642192840576, + "learning_rate": 4.8959779209375703e-05, + "loss": 0.6149, + "num_input_tokens_seen": 14243656, + "step": 24565 + }, + { + "epoch": 3.6595174262734584, + "grad_norm": 18.8642520904541, + "learning_rate": 4.895885143307922e-05, + "loss": 0.3733, + "num_input_tokens_seen": 14246440, + "step": 24570 + }, + { + "epoch": 3.6602621388144176, + "grad_norm": 6.860620975494385, + "learning_rate": 4.8957923252022304e-05, + "loss": 0.3936, + "num_input_tokens_seen": 14249448, + "step": 24575 + }, + { + "epoch": 3.661006851355377, + "grad_norm": 8.790231704711914, + "learning_rate": 4.8956994666220615e-05, + "loss": 0.4307, + "num_input_tokens_seen": 14252392, + "step": 24580 + }, + { + "epoch": 3.661751563896336, + "grad_norm": 9.526389122009277, + "learning_rate": 4.895606567568985e-05, + "loss": 0.4808, + "num_input_tokens_seen": 14255048, + "step": 24585 + }, + { + "epoch": 3.6624962764372953, + "grad_norm": 16.699264526367188, + "learning_rate": 4.8955136280445704e-05, + "loss": 0.5093, + "num_input_tokens_seen": 14257832, + "step": 24590 + }, + { + "epoch": 3.6632409889782545, + "grad_norm": 9.415952682495117, + "learning_rate": 4.895420648050388e-05, + "loss": 0.3917, + "num_input_tokens_seen": 14260712, + "step": 24595 + }, + { + "epoch": 3.6639857015192137, + "grad_norm": 5.796581268310547, + "learning_rate": 4.895327627588008e-05, + "loss": 0.4318, + "num_input_tokens_seen": 14263496, + "step": 24600 + }, + { + "epoch": 3.664730414060173, + "grad_norm": 8.694246292114258, + "learning_rate": 4.8952345666590025e-05, + "loss": 0.3139, + "num_input_tokens_seen": 14266344, + "step": 24605 + }, + { + "epoch": 3.665475126601132, + "grad_norm": 13.589954376220703, + "learning_rate": 4.895141465264943e-05, + "loss": 0.3338, + "num_input_tokens_seen": 14269096, + "step": 24610 + }, + { + "epoch": 3.6662198391420913, + "grad_norm": 8.812932968139648, + "learning_rate": 4.895048323407403e-05, + "loss": 0.2973, + "num_input_tokens_seen": 14271912, + "step": 24615 + }, + { + "epoch": 3.6669645516830505, + "grad_norm": 14.249136924743652, + "learning_rate": 4.894955141087956e-05, + "loss": 0.4186, + "num_input_tokens_seen": 14274472, + "step": 24620 + }, + { + "epoch": 3.6677092642240097, + "grad_norm": 9.42377758026123, + "learning_rate": 4.894861918308176e-05, + "loss": 0.6283, + "num_input_tokens_seen": 14277768, + "step": 24625 + }, + { + "epoch": 3.668453976764969, + "grad_norm": 6.160719394683838, + "learning_rate": 4.894768655069638e-05, + "loss": 0.5944, + "num_input_tokens_seen": 14280872, + "step": 24630 + }, + { + "epoch": 3.669198689305928, + "grad_norm": 9.625205039978027, + "learning_rate": 4.8946753513739166e-05, + "loss": 0.5744, + "num_input_tokens_seen": 14283752, + "step": 24635 + }, + { + "epoch": 3.6699434018468873, + "grad_norm": 9.342179298400879, + "learning_rate": 4.89458200722259e-05, + "loss": 0.2039, + "num_input_tokens_seen": 14287016, + "step": 24640 + }, + { + "epoch": 3.670688114387846, + "grad_norm": 22.78512191772461, + "learning_rate": 4.894488622617234e-05, + "loss": 0.5772, + "num_input_tokens_seen": 14289864, + "step": 24645 + }, + { + "epoch": 3.6714328269288057, + "grad_norm": 2.2272512912750244, + "learning_rate": 4.894395197559426e-05, + "loss": 0.5062, + "num_input_tokens_seen": 14292776, + "step": 24650 + }, + { + "epoch": 3.6721775394697644, + "grad_norm": 6.635778427124023, + "learning_rate": 4.8943017320507444e-05, + "loss": 0.3582, + "num_input_tokens_seen": 14295784, + "step": 24655 + }, + { + "epoch": 3.672922252010724, + "grad_norm": 4.685443878173828, + "learning_rate": 4.894208226092769e-05, + "loss": 0.555, + "num_input_tokens_seen": 14298632, + "step": 24660 + }, + { + "epoch": 3.673666964551683, + "grad_norm": 33.97140884399414, + "learning_rate": 4.894114679687079e-05, + "loss": 0.2832, + "num_input_tokens_seen": 14301576, + "step": 24665 + }, + { + "epoch": 3.6744116770926425, + "grad_norm": 17.612438201904297, + "learning_rate": 4.8940210928352545e-05, + "loss": 0.3923, + "num_input_tokens_seen": 14304296, + "step": 24670 + }, + { + "epoch": 3.6751563896336013, + "grad_norm": 8.560697555541992, + "learning_rate": 4.893927465538877e-05, + "loss": 0.2626, + "num_input_tokens_seen": 14306952, + "step": 24675 + }, + { + "epoch": 3.675901102174561, + "grad_norm": 22.24138832092285, + "learning_rate": 4.8938337977995286e-05, + "loss": 0.3659, + "num_input_tokens_seen": 14310024, + "step": 24680 + }, + { + "epoch": 3.6766458147155197, + "grad_norm": 11.210317611694336, + "learning_rate": 4.89374008961879e-05, + "loss": 0.5152, + "num_input_tokens_seen": 14312840, + "step": 24685 + }, + { + "epoch": 3.677390527256479, + "grad_norm": 6.026069164276123, + "learning_rate": 4.8936463409982466e-05, + "loss": 0.3955, + "num_input_tokens_seen": 14315560, + "step": 24690 + }, + { + "epoch": 3.678135239797438, + "grad_norm": 18.321239471435547, + "learning_rate": 4.89355255193948e-05, + "loss": 0.3512, + "num_input_tokens_seen": 14318376, + "step": 24695 + }, + { + "epoch": 3.6788799523383973, + "grad_norm": 11.166897773742676, + "learning_rate": 4.893458722444076e-05, + "loss": 0.3545, + "num_input_tokens_seen": 14321448, + "step": 24700 + }, + { + "epoch": 3.6796246648793565, + "grad_norm": 9.269083976745605, + "learning_rate": 4.89336485251362e-05, + "loss": 0.4394, + "num_input_tokens_seen": 14324200, + "step": 24705 + }, + { + "epoch": 3.6803693774203157, + "grad_norm": 17.658843994140625, + "learning_rate": 4.893270942149697e-05, + "loss": 0.5846, + "num_input_tokens_seen": 14326984, + "step": 24710 + }, + { + "epoch": 3.681114089961275, + "grad_norm": 8.0766019821167, + "learning_rate": 4.8931769913538945e-05, + "loss": 0.3192, + "num_input_tokens_seen": 14329768, + "step": 24715 + }, + { + "epoch": 3.681858802502234, + "grad_norm": 26.217252731323242, + "learning_rate": 4.893083000127798e-05, + "loss": 0.256, + "num_input_tokens_seen": 14332424, + "step": 24720 + }, + { + "epoch": 3.6826035150431933, + "grad_norm": 9.303807258605957, + "learning_rate": 4.8929889684729966e-05, + "loss": 0.3772, + "num_input_tokens_seen": 14335400, + "step": 24725 + }, + { + "epoch": 3.6833482275841525, + "grad_norm": 10.207794189453125, + "learning_rate": 4.892894896391079e-05, + "loss": 0.5029, + "num_input_tokens_seen": 14338216, + "step": 24730 + }, + { + "epoch": 3.6840929401251117, + "grad_norm": 18.480106353759766, + "learning_rate": 4.892800783883635e-05, + "loss": 0.3656, + "num_input_tokens_seen": 14341064, + "step": 24735 + }, + { + "epoch": 3.684837652666071, + "grad_norm": 19.211122512817383, + "learning_rate": 4.892706630952253e-05, + "loss": 0.562, + "num_input_tokens_seen": 14343944, + "step": 24740 + }, + { + "epoch": 3.68558236520703, + "grad_norm": 3.1799309253692627, + "learning_rate": 4.892612437598524e-05, + "loss": 0.4392, + "num_input_tokens_seen": 14346824, + "step": 24745 + }, + { + "epoch": 3.6863270777479893, + "grad_norm": 3.856353282928467, + "learning_rate": 4.8925182038240395e-05, + "loss": 0.2453, + "num_input_tokens_seen": 14349736, + "step": 24750 + }, + { + "epoch": 3.6870717902889485, + "grad_norm": 8.46776008605957, + "learning_rate": 4.892423929630392e-05, + "loss": 0.3118, + "num_input_tokens_seen": 14353096, + "step": 24755 + }, + { + "epoch": 3.6878165028299077, + "grad_norm": 13.179179191589355, + "learning_rate": 4.892329615019173e-05, + "loss": 0.4787, + "num_input_tokens_seen": 14356104, + "step": 24760 + }, + { + "epoch": 3.688561215370867, + "grad_norm": 2.440080165863037, + "learning_rate": 4.892235259991977e-05, + "loss": 0.1873, + "num_input_tokens_seen": 14359240, + "step": 24765 + }, + { + "epoch": 3.689305927911826, + "grad_norm": 21.673873901367188, + "learning_rate": 4.8921408645503986e-05, + "loss": 0.6365, + "num_input_tokens_seen": 14362216, + "step": 24770 + }, + { + "epoch": 3.6900506404527853, + "grad_norm": 12.88766098022461, + "learning_rate": 4.892046428696031e-05, + "loss": 0.3857, + "num_input_tokens_seen": 14365096, + "step": 24775 + }, + { + "epoch": 3.6907953529937445, + "grad_norm": 9.970998764038086, + "learning_rate": 4.8919519524304704e-05, + "loss": 0.3504, + "num_input_tokens_seen": 14368072, + "step": 24780 + }, + { + "epoch": 3.6915400655347037, + "grad_norm": 3.894859790802002, + "learning_rate": 4.891857435755312e-05, + "loss": 0.3395, + "num_input_tokens_seen": 14370728, + "step": 24785 + }, + { + "epoch": 3.692284778075663, + "grad_norm": 11.738598823547363, + "learning_rate": 4.891762878672153e-05, + "loss": 0.4089, + "num_input_tokens_seen": 14373960, + "step": 24790 + }, + { + "epoch": 3.693029490616622, + "grad_norm": 3.724177837371826, + "learning_rate": 4.891668281182592e-05, + "loss": 0.4594, + "num_input_tokens_seen": 14376648, + "step": 24795 + }, + { + "epoch": 3.6937742031575813, + "grad_norm": 12.09488582611084, + "learning_rate": 4.8915736432882254e-05, + "loss": 0.3053, + "num_input_tokens_seen": 14380008, + "step": 24800 + }, + { + "epoch": 3.6945189156985405, + "grad_norm": 7.783027648925781, + "learning_rate": 4.891478964990653e-05, + "loss": 0.317, + "num_input_tokens_seen": 14382760, + "step": 24805 + }, + { + "epoch": 3.6952636282394993, + "grad_norm": 2.7936606407165527, + "learning_rate": 4.891384246291474e-05, + "loss": 0.3099, + "num_input_tokens_seen": 14386024, + "step": 24810 + }, + { + "epoch": 3.696008340780459, + "grad_norm": 5.69700813293457, + "learning_rate": 4.891289487192289e-05, + "loss": 0.4549, + "num_input_tokens_seen": 14389000, + "step": 24815 + }, + { + "epoch": 3.6967530533214177, + "grad_norm": 19.643436431884766, + "learning_rate": 4.891194687694698e-05, + "loss": 0.353, + "num_input_tokens_seen": 14391752, + "step": 24820 + }, + { + "epoch": 3.6974977658623773, + "grad_norm": 59.40467834472656, + "learning_rate": 4.8910998478003034e-05, + "loss": 0.2039, + "num_input_tokens_seen": 14394728, + "step": 24825 + }, + { + "epoch": 3.698242478403336, + "grad_norm": 27.9779109954834, + "learning_rate": 4.891004967510707e-05, + "loss": 0.482, + "num_input_tokens_seen": 14397480, + "step": 24830 + }, + { + "epoch": 3.6989871909442957, + "grad_norm": 26.37592315673828, + "learning_rate": 4.890910046827511e-05, + "loss": 0.5722, + "num_input_tokens_seen": 14400520, + "step": 24835 + }, + { + "epoch": 3.6997319034852545, + "grad_norm": 5.271240711212158, + "learning_rate": 4.890815085752322e-05, + "loss": 0.4242, + "num_input_tokens_seen": 14403304, + "step": 24840 + }, + { + "epoch": 3.700476616026214, + "grad_norm": 9.30706787109375, + "learning_rate": 4.890720084286739e-05, + "loss": 0.7162, + "num_input_tokens_seen": 14406504, + "step": 24845 + }, + { + "epoch": 3.701221328567173, + "grad_norm": 9.264243125915527, + "learning_rate": 4.890625042432372e-05, + "loss": 0.3329, + "num_input_tokens_seen": 14409544, + "step": 24850 + }, + { + "epoch": 3.701966041108132, + "grad_norm": 10.221373558044434, + "learning_rate": 4.890529960190825e-05, + "loss": 0.4248, + "num_input_tokens_seen": 14412744, + "step": 24855 + }, + { + "epoch": 3.7027107536490913, + "grad_norm": 6.988046169281006, + "learning_rate": 4.8904348375637025e-05, + "loss": 0.3598, + "num_input_tokens_seen": 14415464, + "step": 24860 + }, + { + "epoch": 3.7034554661900505, + "grad_norm": 4.843865871429443, + "learning_rate": 4.8903396745526144e-05, + "loss": 0.2775, + "num_input_tokens_seen": 14418376, + "step": 24865 + }, + { + "epoch": 3.7042001787310097, + "grad_norm": 8.714752197265625, + "learning_rate": 4.8902444711591656e-05, + "loss": 0.4557, + "num_input_tokens_seen": 14421096, + "step": 24870 + }, + { + "epoch": 3.704944891271969, + "grad_norm": 17.993112564086914, + "learning_rate": 4.8901492273849666e-05, + "loss": 0.2424, + "num_input_tokens_seen": 14423880, + "step": 24875 + }, + { + "epoch": 3.705689603812928, + "grad_norm": 13.755505561828613, + "learning_rate": 4.890053943231625e-05, + "loss": 0.4807, + "num_input_tokens_seen": 14426632, + "step": 24880 + }, + { + "epoch": 3.7064343163538873, + "grad_norm": 25.751359939575195, + "learning_rate": 4.889958618700752e-05, + "loss": 0.6274, + "num_input_tokens_seen": 14429384, + "step": 24885 + }, + { + "epoch": 3.7071790288948465, + "grad_norm": 12.749993324279785, + "learning_rate": 4.8898632537939567e-05, + "loss": 0.5329, + "num_input_tokens_seen": 14432520, + "step": 24890 + }, + { + "epoch": 3.7079237414358057, + "grad_norm": 6.506709098815918, + "learning_rate": 4.889767848512851e-05, + "loss": 0.4183, + "num_input_tokens_seen": 14435624, + "step": 24895 + }, + { + "epoch": 3.708668453976765, + "grad_norm": 16.512487411499023, + "learning_rate": 4.889672402859046e-05, + "loss": 0.4704, + "num_input_tokens_seen": 14438184, + "step": 24900 + }, + { + "epoch": 3.709413166517724, + "grad_norm": 27.78196907043457, + "learning_rate": 4.8895769168341546e-05, + "loss": 0.4301, + "num_input_tokens_seen": 14440808, + "step": 24905 + }, + { + "epoch": 3.7101578790586833, + "grad_norm": 10.001937866210938, + "learning_rate": 4.8894813904397895e-05, + "loss": 0.4815, + "num_input_tokens_seen": 14443720, + "step": 24910 + }, + { + "epoch": 3.7109025915996425, + "grad_norm": 12.58072280883789, + "learning_rate": 4.889385823677565e-05, + "loss": 0.4157, + "num_input_tokens_seen": 14446504, + "step": 24915 + }, + { + "epoch": 3.7116473041406017, + "grad_norm": 20.040498733520508, + "learning_rate": 4.889290216549096e-05, + "loss": 0.4932, + "num_input_tokens_seen": 14449576, + "step": 24920 + }, + { + "epoch": 3.712392016681561, + "grad_norm": 8.815034866333008, + "learning_rate": 4.889194569055996e-05, + "loss": 0.4869, + "num_input_tokens_seen": 14452488, + "step": 24925 + }, + { + "epoch": 3.71313672922252, + "grad_norm": 3.239414930343628, + "learning_rate": 4.8890988811998835e-05, + "loss": 0.347, + "num_input_tokens_seen": 14455272, + "step": 24930 + }, + { + "epoch": 3.7138814417634793, + "grad_norm": 7.546043872833252, + "learning_rate": 4.889003152982373e-05, + "loss": 0.4375, + "num_input_tokens_seen": 14458344, + "step": 24935 + }, + { + "epoch": 3.7146261543044385, + "grad_norm": 19.242013931274414, + "learning_rate": 4.888907384405082e-05, + "loss": 0.1925, + "num_input_tokens_seen": 14461384, + "step": 24940 + }, + { + "epoch": 3.7153708668453977, + "grad_norm": 6.744050025939941, + "learning_rate": 4.888811575469629e-05, + "loss": 0.5305, + "num_input_tokens_seen": 14464232, + "step": 24945 + }, + { + "epoch": 3.716115579386357, + "grad_norm": 9.618995666503906, + "learning_rate": 4.8887157261776316e-05, + "loss": 0.2665, + "num_input_tokens_seen": 14467048, + "step": 24950 + }, + { + "epoch": 3.716860291927316, + "grad_norm": 48.783958435058594, + "learning_rate": 4.888619836530711e-05, + "loss": 0.5508, + "num_input_tokens_seen": 14469832, + "step": 24955 + }, + { + "epoch": 3.7176050044682754, + "grad_norm": 20.71678352355957, + "learning_rate": 4.8885239065304855e-05, + "loss": 0.5508, + "num_input_tokens_seen": 14472520, + "step": 24960 + }, + { + "epoch": 3.7183497170092346, + "grad_norm": 7.147912502288818, + "learning_rate": 4.8884279361785754e-05, + "loss": 0.3957, + "num_input_tokens_seen": 14475496, + "step": 24965 + }, + { + "epoch": 3.7190944295501938, + "grad_norm": 16.719898223876953, + "learning_rate": 4.888331925476604e-05, + "loss": 0.5314, + "num_input_tokens_seen": 14478216, + "step": 24970 + }, + { + "epoch": 3.719839142091153, + "grad_norm": 8.276984214782715, + "learning_rate": 4.8882358744261914e-05, + "loss": 0.3977, + "num_input_tokens_seen": 14481192, + "step": 24975 + }, + { + "epoch": 3.720583854632112, + "grad_norm": 19.555789947509766, + "learning_rate": 4.888139783028961e-05, + "loss": 0.3711, + "num_input_tokens_seen": 14484104, + "step": 24980 + }, + { + "epoch": 3.721328567173071, + "grad_norm": 7.680539608001709, + "learning_rate": 4.888043651286537e-05, + "loss": 0.3452, + "num_input_tokens_seen": 14487304, + "step": 24985 + }, + { + "epoch": 3.7220732797140306, + "grad_norm": 5.496543884277344, + "learning_rate": 4.887947479200542e-05, + "loss": 0.427, + "num_input_tokens_seen": 14490056, + "step": 24990 + }, + { + "epoch": 3.7228179922549893, + "grad_norm": 8.349090576171875, + "learning_rate": 4.887851266772601e-05, + "loss": 0.3087, + "num_input_tokens_seen": 14493384, + "step": 24995 + }, + { + "epoch": 3.723562704795949, + "grad_norm": 14.491366386413574, + "learning_rate": 4.8877550140043404e-05, + "loss": 0.2168, + "num_input_tokens_seen": 14496552, + "step": 25000 + }, + { + "epoch": 3.7243074173369077, + "grad_norm": 10.03173542022705, + "learning_rate": 4.887658720897385e-05, + "loss": 0.2553, + "num_input_tokens_seen": 14499336, + "step": 25005 + }, + { + "epoch": 3.7250521298778674, + "grad_norm": 1.065129280090332, + "learning_rate": 4.8875623874533627e-05, + "loss": 0.2523, + "num_input_tokens_seen": 14502216, + "step": 25010 + }, + { + "epoch": 3.725796842418826, + "grad_norm": 18.408628463745117, + "learning_rate": 4.8874660136739e-05, + "loss": 0.4825, + "num_input_tokens_seen": 14505352, + "step": 25015 + }, + { + "epoch": 3.726541554959786, + "grad_norm": 30.90492820739746, + "learning_rate": 4.887369599560626e-05, + "loss": 0.5203, + "num_input_tokens_seen": 14508456, + "step": 25020 + }, + { + "epoch": 3.7272862675007445, + "grad_norm": 17.25264549255371, + "learning_rate": 4.8872731451151684e-05, + "loss": 0.4461, + "num_input_tokens_seen": 14511336, + "step": 25025 + }, + { + "epoch": 3.7280309800417037, + "grad_norm": 20.305952072143555, + "learning_rate": 4.887176650339158e-05, + "loss": 0.4399, + "num_input_tokens_seen": 14514248, + "step": 25030 + }, + { + "epoch": 3.728775692582663, + "grad_norm": 5.929946422576904, + "learning_rate": 4.887080115234224e-05, + "loss": 0.377, + "num_input_tokens_seen": 14516904, + "step": 25035 + }, + { + "epoch": 3.729520405123622, + "grad_norm": 21.909774780273438, + "learning_rate": 4.886983539801998e-05, + "loss": 0.4491, + "num_input_tokens_seen": 14520200, + "step": 25040 + }, + { + "epoch": 3.7302651176645814, + "grad_norm": 9.360915184020996, + "learning_rate": 4.886886924044111e-05, + "loss": 0.4391, + "num_input_tokens_seen": 14523144, + "step": 25045 + }, + { + "epoch": 3.7310098302055406, + "grad_norm": 9.335952758789062, + "learning_rate": 4.8867902679621946e-05, + "loss": 0.5089, + "num_input_tokens_seen": 14526312, + "step": 25050 + }, + { + "epoch": 3.7317545427464998, + "grad_norm": 7.063935279846191, + "learning_rate": 4.8866935715578835e-05, + "loss": 0.6572, + "num_input_tokens_seen": 14529096, + "step": 25055 + }, + { + "epoch": 3.732499255287459, + "grad_norm": 5.477730751037598, + "learning_rate": 4.88659683483281e-05, + "loss": 0.2032, + "num_input_tokens_seen": 14531752, + "step": 25060 + }, + { + "epoch": 3.733243967828418, + "grad_norm": 20.700681686401367, + "learning_rate": 4.886500057788608e-05, + "loss": 0.4172, + "num_input_tokens_seen": 14534600, + "step": 25065 + }, + { + "epoch": 3.7339886803693774, + "grad_norm": 3.814225673675537, + "learning_rate": 4.8864032404269126e-05, + "loss": 0.2503, + "num_input_tokens_seen": 14537448, + "step": 25070 + }, + { + "epoch": 3.7347333929103366, + "grad_norm": 3.2684426307678223, + "learning_rate": 4.886306382749361e-05, + "loss": 0.3543, + "num_input_tokens_seen": 14540680, + "step": 25075 + }, + { + "epoch": 3.7354781054512958, + "grad_norm": 17.968555450439453, + "learning_rate": 4.886209484757588e-05, + "loss": 0.2102, + "num_input_tokens_seen": 14543848, + "step": 25080 + }, + { + "epoch": 3.736222817992255, + "grad_norm": 7.376684665679932, + "learning_rate": 4.886112546453231e-05, + "loss": 0.5988, + "num_input_tokens_seen": 14546504, + "step": 25085 + }, + { + "epoch": 3.736967530533214, + "grad_norm": 3.9815945625305176, + "learning_rate": 4.886015567837927e-05, + "loss": 0.2679, + "num_input_tokens_seen": 14549480, + "step": 25090 + }, + { + "epoch": 3.7377122430741734, + "grad_norm": 24.73816680908203, + "learning_rate": 4.885918548913316e-05, + "loss": 0.5085, + "num_input_tokens_seen": 14552264, + "step": 25095 + }, + { + "epoch": 3.7384569556151326, + "grad_norm": 1.1169252395629883, + "learning_rate": 4.885821489681036e-05, + "loss": 0.2502, + "num_input_tokens_seen": 14554952, + "step": 25100 + }, + { + "epoch": 3.739201668156092, + "grad_norm": 17.089303970336914, + "learning_rate": 4.885724390142726e-05, + "loss": 0.4169, + "num_input_tokens_seen": 14557992, + "step": 25105 + }, + { + "epoch": 3.739946380697051, + "grad_norm": 28.708091735839844, + "learning_rate": 4.885627250300028e-05, + "loss": 0.6779, + "num_input_tokens_seen": 14560648, + "step": 25110 + }, + { + "epoch": 3.74069109323801, + "grad_norm": 6.1353349685668945, + "learning_rate": 4.885530070154582e-05, + "loss": 0.4999, + "num_input_tokens_seen": 14563336, + "step": 25115 + }, + { + "epoch": 3.7414358057789694, + "grad_norm": 11.312433242797852, + "learning_rate": 4.88543284970803e-05, + "loss": 0.3575, + "num_input_tokens_seen": 14565896, + "step": 25120 + }, + { + "epoch": 3.7421805183199286, + "grad_norm": 10.08250904083252, + "learning_rate": 4.8853355889620143e-05, + "loss": 0.3848, + "num_input_tokens_seen": 14568808, + "step": 25125 + }, + { + "epoch": 3.742925230860888, + "grad_norm": 22.566667556762695, + "learning_rate": 4.885238287918178e-05, + "loss": 0.4393, + "num_input_tokens_seen": 14571656, + "step": 25130 + }, + { + "epoch": 3.743669943401847, + "grad_norm": 35.44300079345703, + "learning_rate": 4.885140946578166e-05, + "loss": 0.3146, + "num_input_tokens_seen": 14574568, + "step": 25135 + }, + { + "epoch": 3.744414655942806, + "grad_norm": 8.34203815460205, + "learning_rate": 4.885043564943621e-05, + "loss": 0.4436, + "num_input_tokens_seen": 14577576, + "step": 25140 + }, + { + "epoch": 3.7451593684837654, + "grad_norm": 19.725351333618164, + "learning_rate": 4.884946143016189e-05, + "loss": 0.3391, + "num_input_tokens_seen": 14580360, + "step": 25145 + }, + { + "epoch": 3.7459040810247246, + "grad_norm": 14.975335121154785, + "learning_rate": 4.884848680797516e-05, + "loss": 0.3102, + "num_input_tokens_seen": 14583368, + "step": 25150 + }, + { + "epoch": 3.746648793565684, + "grad_norm": 16.619718551635742, + "learning_rate": 4.884751178289249e-05, + "loss": 0.8416, + "num_input_tokens_seen": 14586088, + "step": 25155 + }, + { + "epoch": 3.7473935061066426, + "grad_norm": 16.60009765625, + "learning_rate": 4.884653635493034e-05, + "loss": 0.4433, + "num_input_tokens_seen": 14588904, + "step": 25160 + }, + { + "epoch": 3.748138218647602, + "grad_norm": 4.294874668121338, + "learning_rate": 4.8845560524105196e-05, + "loss": 0.4879, + "num_input_tokens_seen": 14591688, + "step": 25165 + }, + { + "epoch": 3.748882931188561, + "grad_norm": 8.476727485656738, + "learning_rate": 4.8844584290433536e-05, + "loss": 0.417, + "num_input_tokens_seen": 14594376, + "step": 25170 + }, + { + "epoch": 3.7496276437295206, + "grad_norm": 12.017607688903809, + "learning_rate": 4.8843607653931865e-05, + "loss": 0.2792, + "num_input_tokens_seen": 14597256, + "step": 25175 + }, + { + "epoch": 3.7503723562704794, + "grad_norm": 3.6118693351745605, + "learning_rate": 4.884263061461668e-05, + "loss": 0.3728, + "num_input_tokens_seen": 14599944, + "step": 25180 + }, + { + "epoch": 3.751117068811439, + "grad_norm": 7.214393138885498, + "learning_rate": 4.884165317250448e-05, + "loss": 0.3371, + "num_input_tokens_seen": 14602888, + "step": 25185 + }, + { + "epoch": 3.751861781352398, + "grad_norm": 15.22295093536377, + "learning_rate": 4.8840675327611785e-05, + "loss": 0.6922, + "num_input_tokens_seen": 14605704, + "step": 25190 + }, + { + "epoch": 3.7526064938933574, + "grad_norm": 6.264024257659912, + "learning_rate": 4.8839697079955104e-05, + "loss": 0.3642, + "num_input_tokens_seen": 14608744, + "step": 25195 + }, + { + "epoch": 3.753351206434316, + "grad_norm": 18.331146240234375, + "learning_rate": 4.883871842955097e-05, + "loss": 0.525, + "num_input_tokens_seen": 14611432, + "step": 25200 + }, + { + "epoch": 3.7540959189752754, + "grad_norm": 12.2254056930542, + "learning_rate": 4.883773937641593e-05, + "loss": 0.6043, + "num_input_tokens_seen": 14614536, + "step": 25205 + }, + { + "epoch": 3.7548406315162346, + "grad_norm": 9.411108016967773, + "learning_rate": 4.8836759920566494e-05, + "loss": 0.3993, + "num_input_tokens_seen": 14617224, + "step": 25210 + }, + { + "epoch": 3.755585344057194, + "grad_norm": 7.772242546081543, + "learning_rate": 4.8835780062019234e-05, + "loss": 0.2374, + "num_input_tokens_seen": 14619912, + "step": 25215 + }, + { + "epoch": 3.756330056598153, + "grad_norm": 9.262166023254395, + "learning_rate": 4.8834799800790694e-05, + "loss": 0.325, + "num_input_tokens_seen": 14622664, + "step": 25220 + }, + { + "epoch": 3.757074769139112, + "grad_norm": 13.908349990844727, + "learning_rate": 4.8833819136897436e-05, + "loss": 0.3118, + "num_input_tokens_seen": 14625480, + "step": 25225 + }, + { + "epoch": 3.7578194816800714, + "grad_norm": 8.393411636352539, + "learning_rate": 4.883283807035602e-05, + "loss": 0.3354, + "num_input_tokens_seen": 14628424, + "step": 25230 + }, + { + "epoch": 3.7585641942210306, + "grad_norm": 12.399382591247559, + "learning_rate": 4.883185660118304e-05, + "loss": 0.4028, + "num_input_tokens_seen": 14631176, + "step": 25235 + }, + { + "epoch": 3.75930890676199, + "grad_norm": 8.131918907165527, + "learning_rate": 4.883087472939506e-05, + "loss": 0.3362, + "num_input_tokens_seen": 14633928, + "step": 25240 + }, + { + "epoch": 3.760053619302949, + "grad_norm": 1.4856929779052734, + "learning_rate": 4.882989245500867e-05, + "loss": 0.224, + "num_input_tokens_seen": 14637096, + "step": 25245 + }, + { + "epoch": 3.760798331843908, + "grad_norm": 0.21168099343776703, + "learning_rate": 4.882890977804047e-05, + "loss": 0.3566, + "num_input_tokens_seen": 14639976, + "step": 25250 + }, + { + "epoch": 3.7615430443848674, + "grad_norm": 10.312244415283203, + "learning_rate": 4.882792669850705e-05, + "loss": 0.4601, + "num_input_tokens_seen": 14642792, + "step": 25255 + }, + { + "epoch": 3.7622877569258266, + "grad_norm": 2.3684146404266357, + "learning_rate": 4.882694321642504e-05, + "loss": 0.3357, + "num_input_tokens_seen": 14645704, + "step": 25260 + }, + { + "epoch": 3.763032469466786, + "grad_norm": 9.446898460388184, + "learning_rate": 4.8825959331811026e-05, + "loss": 0.4356, + "num_input_tokens_seen": 14648680, + "step": 25265 + }, + { + "epoch": 3.763777182007745, + "grad_norm": 18.483312606811523, + "learning_rate": 4.882497504468165e-05, + "loss": 0.6702, + "num_input_tokens_seen": 14651720, + "step": 25270 + }, + { + "epoch": 3.7645218945487042, + "grad_norm": 9.79218578338623, + "learning_rate": 4.8823990355053536e-05, + "loss": 0.4974, + "num_input_tokens_seen": 14654344, + "step": 25275 + }, + { + "epoch": 3.7652666070896634, + "grad_norm": 8.355977058410645, + "learning_rate": 4.8823005262943323e-05, + "loss": 0.1716, + "num_input_tokens_seen": 14657224, + "step": 25280 + }, + { + "epoch": 3.7660113196306226, + "grad_norm": 19.52881622314453, + "learning_rate": 4.882201976836764e-05, + "loss": 0.4263, + "num_input_tokens_seen": 14660200, + "step": 25285 + }, + { + "epoch": 3.766756032171582, + "grad_norm": 15.942508697509766, + "learning_rate": 4.8821033871343155e-05, + "loss": 0.4814, + "num_input_tokens_seen": 14662728, + "step": 25290 + }, + { + "epoch": 3.767500744712541, + "grad_norm": 17.18822479248047, + "learning_rate": 4.8820047571886504e-05, + "loss": 0.2897, + "num_input_tokens_seen": 14665800, + "step": 25295 + }, + { + "epoch": 3.7682454572535002, + "grad_norm": 0.6733365058898926, + "learning_rate": 4.8819060870014366e-05, + "loss": 0.5069, + "num_input_tokens_seen": 14668680, + "step": 25300 + }, + { + "epoch": 3.7689901697944594, + "grad_norm": 5.40199089050293, + "learning_rate": 4.88180737657434e-05, + "loss": 0.3269, + "num_input_tokens_seen": 14671656, + "step": 25305 + }, + { + "epoch": 3.7697348823354186, + "grad_norm": 19.527191162109375, + "learning_rate": 4.881708625909028e-05, + "loss": 0.3653, + "num_input_tokens_seen": 14674952, + "step": 25310 + }, + { + "epoch": 3.770479594876378, + "grad_norm": 10.87052059173584, + "learning_rate": 4.881609835007171e-05, + "loss": 0.4239, + "num_input_tokens_seen": 14677800, + "step": 25315 + }, + { + "epoch": 3.771224307417337, + "grad_norm": 7.892388820648193, + "learning_rate": 4.881511003870435e-05, + "loss": 0.4906, + "num_input_tokens_seen": 14680552, + "step": 25320 + }, + { + "epoch": 3.7719690199582963, + "grad_norm": 9.132307052612305, + "learning_rate": 4.881412132500491e-05, + "loss": 0.5757, + "num_input_tokens_seen": 14683368, + "step": 25325 + }, + { + "epoch": 3.7727137324992555, + "grad_norm": 3.6892192363739014, + "learning_rate": 4.8813132208990095e-05, + "loss": 0.55, + "num_input_tokens_seen": 14686312, + "step": 25330 + }, + { + "epoch": 3.773458445040214, + "grad_norm": 21.306013107299805, + "learning_rate": 4.881214269067662e-05, + "loss": 0.3644, + "num_input_tokens_seen": 14689544, + "step": 25335 + }, + { + "epoch": 3.774203157581174, + "grad_norm": 26.005970001220703, + "learning_rate": 4.881115277008119e-05, + "loss": 0.6505, + "num_input_tokens_seen": 14692488, + "step": 25340 + }, + { + "epoch": 3.7749478701221326, + "grad_norm": 5.673755645751953, + "learning_rate": 4.881016244722054e-05, + "loss": 0.353, + "num_input_tokens_seen": 14695368, + "step": 25345 + }, + { + "epoch": 3.7756925826630923, + "grad_norm": 14.320941925048828, + "learning_rate": 4.880917172211139e-05, + "loss": 0.2315, + "num_input_tokens_seen": 14698056, + "step": 25350 + }, + { + "epoch": 3.776437295204051, + "grad_norm": 38.254180908203125, + "learning_rate": 4.8808180594770486e-05, + "loss": 0.4274, + "num_input_tokens_seen": 14700872, + "step": 25355 + }, + { + "epoch": 3.7771820077450107, + "grad_norm": 0.3219538927078247, + "learning_rate": 4.880718906521456e-05, + "loss": 0.2679, + "num_input_tokens_seen": 14703464, + "step": 25360 + }, + { + "epoch": 3.7779267202859694, + "grad_norm": 10.447708129882812, + "learning_rate": 4.880619713346039e-05, + "loss": 0.6629, + "num_input_tokens_seen": 14706248, + "step": 25365 + }, + { + "epoch": 3.778671432826929, + "grad_norm": 9.697296142578125, + "learning_rate": 4.8805204799524695e-05, + "loss": 0.4213, + "num_input_tokens_seen": 14709288, + "step": 25370 + }, + { + "epoch": 3.779416145367888, + "grad_norm": 9.842754364013672, + "learning_rate": 4.880421206342427e-05, + "loss": 0.3325, + "num_input_tokens_seen": 14712104, + "step": 25375 + }, + { + "epoch": 3.780160857908847, + "grad_norm": 12.600984573364258, + "learning_rate": 4.880321892517587e-05, + "loss": 0.9435, + "num_input_tokens_seen": 14715144, + "step": 25380 + }, + { + "epoch": 3.7809055704498062, + "grad_norm": 3.892561435699463, + "learning_rate": 4.880222538479629e-05, + "loss": 0.3796, + "num_input_tokens_seen": 14717928, + "step": 25385 + }, + { + "epoch": 3.7816502829907654, + "grad_norm": 13.903931617736816, + "learning_rate": 4.880123144230229e-05, + "loss": 0.2831, + "num_input_tokens_seen": 14720968, + "step": 25390 + }, + { + "epoch": 3.7823949955317246, + "grad_norm": 3.744253635406494, + "learning_rate": 4.880023709771068e-05, + "loss": 0.3536, + "num_input_tokens_seen": 14724072, + "step": 25395 + }, + { + "epoch": 3.783139708072684, + "grad_norm": 12.224231719970703, + "learning_rate": 4.8799242351038257e-05, + "loss": 0.4278, + "num_input_tokens_seen": 14726984, + "step": 25400 + }, + { + "epoch": 3.783884420613643, + "grad_norm": 6.04528284072876, + "learning_rate": 4.8798247202301824e-05, + "loss": 0.463, + "num_input_tokens_seen": 14729832, + "step": 25405 + }, + { + "epoch": 3.7846291331546023, + "grad_norm": 22.1705379486084, + "learning_rate": 4.879725165151818e-05, + "loss": 0.4278, + "num_input_tokens_seen": 14732552, + "step": 25410 + }, + { + "epoch": 3.7853738456955615, + "grad_norm": 4.221048355102539, + "learning_rate": 4.8796255698704165e-05, + "loss": 0.6083, + "num_input_tokens_seen": 14735496, + "step": 25415 + }, + { + "epoch": 3.7861185582365207, + "grad_norm": 10.66069221496582, + "learning_rate": 4.87952593438766e-05, + "loss": 0.3963, + "num_input_tokens_seen": 14738344, + "step": 25420 + }, + { + "epoch": 3.78686327077748, + "grad_norm": 28.97307014465332, + "learning_rate": 4.879426258705231e-05, + "loss": 0.4096, + "num_input_tokens_seen": 14741448, + "step": 25425 + }, + { + "epoch": 3.787607983318439, + "grad_norm": 4.627959251403809, + "learning_rate": 4.879326542824813e-05, + "loss": 0.2107, + "num_input_tokens_seen": 14744264, + "step": 25430 + }, + { + "epoch": 3.7883526958593983, + "grad_norm": 6.1031389236450195, + "learning_rate": 4.8792267867480926e-05, + "loss": 0.49, + "num_input_tokens_seen": 14747304, + "step": 25435 + }, + { + "epoch": 3.7890974084003575, + "grad_norm": 20.913450241088867, + "learning_rate": 4.879126990476752e-05, + "loss": 0.819, + "num_input_tokens_seen": 14750344, + "step": 25440 + }, + { + "epoch": 3.7898421209413167, + "grad_norm": 13.304285049438477, + "learning_rate": 4.8790271540124796e-05, + "loss": 0.3427, + "num_input_tokens_seen": 14753576, + "step": 25445 + }, + { + "epoch": 3.790586833482276, + "grad_norm": 9.093703269958496, + "learning_rate": 4.8789272773569625e-05, + "loss": 0.3898, + "num_input_tokens_seen": 14756200, + "step": 25450 + }, + { + "epoch": 3.791331546023235, + "grad_norm": 18.553110122680664, + "learning_rate": 4.8788273605118855e-05, + "loss": 0.3948, + "num_input_tokens_seen": 14759176, + "step": 25455 + }, + { + "epoch": 3.7920762585641943, + "grad_norm": 13.500348091125488, + "learning_rate": 4.8787274034789386e-05, + "loss": 0.3316, + "num_input_tokens_seen": 14762280, + "step": 25460 + }, + { + "epoch": 3.7928209711051535, + "grad_norm": 34.62146759033203, + "learning_rate": 4.87862740625981e-05, + "loss": 0.4329, + "num_input_tokens_seen": 14765032, + "step": 25465 + }, + { + "epoch": 3.7935656836461127, + "grad_norm": 19.451589584350586, + "learning_rate": 4.878527368856189e-05, + "loss": 0.6651, + "num_input_tokens_seen": 14767752, + "step": 25470 + }, + { + "epoch": 3.794310396187072, + "grad_norm": 2.13606595993042, + "learning_rate": 4.878427291269765e-05, + "loss": 0.4776, + "num_input_tokens_seen": 14770536, + "step": 25475 + }, + { + "epoch": 3.795055108728031, + "grad_norm": 12.366264343261719, + "learning_rate": 4.878327173502229e-05, + "loss": 0.5199, + "num_input_tokens_seen": 14773384, + "step": 25480 + }, + { + "epoch": 3.7957998212689903, + "grad_norm": 26.112783432006836, + "learning_rate": 4.8782270155552735e-05, + "loss": 0.5294, + "num_input_tokens_seen": 14776328, + "step": 25485 + }, + { + "epoch": 3.7965445338099495, + "grad_norm": 5.0277910232543945, + "learning_rate": 4.878126817430588e-05, + "loss": 0.2596, + "num_input_tokens_seen": 14779176, + "step": 25490 + }, + { + "epoch": 3.7972892463509087, + "grad_norm": 10.255221366882324, + "learning_rate": 4.878026579129868e-05, + "loss": 0.4044, + "num_input_tokens_seen": 14782312, + "step": 25495 + }, + { + "epoch": 3.798033958891868, + "grad_norm": 26.02570152282715, + "learning_rate": 4.877926300654807e-05, + "loss": 0.4939, + "num_input_tokens_seen": 14785576, + "step": 25500 + }, + { + "epoch": 3.798778671432827, + "grad_norm": 1.9959425926208496, + "learning_rate": 4.877825982007097e-05, + "loss": 0.3819, + "num_input_tokens_seen": 14788456, + "step": 25505 + }, + { + "epoch": 3.799523383973786, + "grad_norm": 23.414710998535156, + "learning_rate": 4.877725623188434e-05, + "loss": 0.5772, + "num_input_tokens_seen": 14791272, + "step": 25510 + }, + { + "epoch": 3.8002680965147455, + "grad_norm": 10.882683753967285, + "learning_rate": 4.8776252242005124e-05, + "loss": 0.5465, + "num_input_tokens_seen": 14794056, + "step": 25515 + }, + { + "epoch": 3.8010128090557043, + "grad_norm": 6.499012470245361, + "learning_rate": 4.87752478504503e-05, + "loss": 0.3619, + "num_input_tokens_seen": 14797224, + "step": 25520 + }, + { + "epoch": 3.801757521596664, + "grad_norm": 13.756539344787598, + "learning_rate": 4.8774243057236824e-05, + "loss": 0.4509, + "num_input_tokens_seen": 14800136, + "step": 25525 + }, + { + "epoch": 3.8025022341376227, + "grad_norm": 4.441187381744385, + "learning_rate": 4.877323786238167e-05, + "loss": 0.3271, + "num_input_tokens_seen": 14803016, + "step": 25530 + }, + { + "epoch": 3.8032469466785823, + "grad_norm": 46.60797119140625, + "learning_rate": 4.877223226590184e-05, + "loss": 0.4649, + "num_input_tokens_seen": 14805800, + "step": 25535 + }, + { + "epoch": 3.803991659219541, + "grad_norm": 6.720878601074219, + "learning_rate": 4.877122626781429e-05, + "loss": 0.3122, + "num_input_tokens_seen": 14808936, + "step": 25540 + }, + { + "epoch": 3.8047363717605007, + "grad_norm": 12.088582992553711, + "learning_rate": 4.8770219868136036e-05, + "loss": 0.4471, + "num_input_tokens_seen": 14811496, + "step": 25545 + }, + { + "epoch": 3.8054810843014595, + "grad_norm": 7.043824672698975, + "learning_rate": 4.876921306688408e-05, + "loss": 0.3781, + "num_input_tokens_seen": 14814568, + "step": 25550 + }, + { + "epoch": 3.8062257968424187, + "grad_norm": 13.708864212036133, + "learning_rate": 4.8768205864075425e-05, + "loss": 0.4108, + "num_input_tokens_seen": 14817544, + "step": 25555 + }, + { + "epoch": 3.806970509383378, + "grad_norm": 5.36560583114624, + "learning_rate": 4.876719825972709e-05, + "loss": 0.5471, + "num_input_tokens_seen": 14820776, + "step": 25560 + }, + { + "epoch": 3.807715221924337, + "grad_norm": 9.27157211303711, + "learning_rate": 4.8766190253856106e-05, + "loss": 0.4355, + "num_input_tokens_seen": 14823976, + "step": 25565 + }, + { + "epoch": 3.8084599344652963, + "grad_norm": 11.277152061462402, + "learning_rate": 4.876518184647948e-05, + "loss": 0.3275, + "num_input_tokens_seen": 14826568, + "step": 25570 + }, + { + "epoch": 3.8092046470062555, + "grad_norm": 22.428003311157227, + "learning_rate": 4.8764173037614256e-05, + "loss": 0.4372, + "num_input_tokens_seen": 14829224, + "step": 25575 + }, + { + "epoch": 3.8099493595472147, + "grad_norm": 14.689688682556152, + "learning_rate": 4.876316382727749e-05, + "loss": 0.3694, + "num_input_tokens_seen": 14832232, + "step": 25580 + }, + { + "epoch": 3.810694072088174, + "grad_norm": 15.106724739074707, + "learning_rate": 4.8762154215486225e-05, + "loss": 0.4957, + "num_input_tokens_seen": 14835048, + "step": 25585 + }, + { + "epoch": 3.811438784629133, + "grad_norm": 27.76924705505371, + "learning_rate": 4.876114420225751e-05, + "loss": 0.6151, + "num_input_tokens_seen": 14838280, + "step": 25590 + }, + { + "epoch": 3.8121834971700923, + "grad_norm": 5.056285381317139, + "learning_rate": 4.876013378760842e-05, + "loss": 0.2723, + "num_input_tokens_seen": 14841096, + "step": 25595 + }, + { + "epoch": 3.8129282097110515, + "grad_norm": 13.316076278686523, + "learning_rate": 4.875912297155601e-05, + "loss": 0.6113, + "num_input_tokens_seen": 14844136, + "step": 25600 + }, + { + "epoch": 3.8136729222520107, + "grad_norm": 5.784060001373291, + "learning_rate": 4.875811175411737e-05, + "loss": 0.3701, + "num_input_tokens_seen": 14847112, + "step": 25605 + }, + { + "epoch": 3.81441763479297, + "grad_norm": 8.432548522949219, + "learning_rate": 4.875710013530958e-05, + "loss": 0.2982, + "num_input_tokens_seen": 14850088, + "step": 25610 + }, + { + "epoch": 3.815162347333929, + "grad_norm": 11.75053596496582, + "learning_rate": 4.8756088115149724e-05, + "loss": 0.6392, + "num_input_tokens_seen": 14852744, + "step": 25615 + }, + { + "epoch": 3.8159070598748883, + "grad_norm": 1.186359167098999, + "learning_rate": 4.8755075693654906e-05, + "loss": 0.2026, + "num_input_tokens_seen": 14855688, + "step": 25620 + }, + { + "epoch": 3.8166517724158475, + "grad_norm": 18.889896392822266, + "learning_rate": 4.8754062870842234e-05, + "loss": 0.387, + "num_input_tokens_seen": 14858728, + "step": 25625 + }, + { + "epoch": 3.8173964849568067, + "grad_norm": 14.506684303283691, + "learning_rate": 4.87530496467288e-05, + "loss": 0.6135, + "num_input_tokens_seen": 14861544, + "step": 25630 + }, + { + "epoch": 3.818141197497766, + "grad_norm": 10.59233570098877, + "learning_rate": 4.875203602133174e-05, + "loss": 0.4547, + "num_input_tokens_seen": 14864392, + "step": 25635 + }, + { + "epoch": 3.818885910038725, + "grad_norm": 6.740523338317871, + "learning_rate": 4.875102199466817e-05, + "loss": 0.4308, + "num_input_tokens_seen": 14867208, + "step": 25640 + }, + { + "epoch": 3.8196306225796843, + "grad_norm": 22.46809196472168, + "learning_rate": 4.875000756675523e-05, + "loss": 0.6238, + "num_input_tokens_seen": 14869800, + "step": 25645 + }, + { + "epoch": 3.8203753351206435, + "grad_norm": 7.390871047973633, + "learning_rate": 4.874899273761004e-05, + "loss": 0.4022, + "num_input_tokens_seen": 14872552, + "step": 25650 + }, + { + "epoch": 3.8211200476616027, + "grad_norm": 5.925538063049316, + "learning_rate": 4.8747977507249765e-05, + "loss": 0.3989, + "num_input_tokens_seen": 14875784, + "step": 25655 + }, + { + "epoch": 3.821864760202562, + "grad_norm": 4.278029441833496, + "learning_rate": 4.874696187569154e-05, + "loss": 0.2932, + "num_input_tokens_seen": 14878824, + "step": 25660 + }, + { + "epoch": 3.822609472743521, + "grad_norm": 10.228463172912598, + "learning_rate": 4.874594584295253e-05, + "loss": 0.2806, + "num_input_tokens_seen": 14881992, + "step": 25665 + }, + { + "epoch": 3.8233541852844803, + "grad_norm": 1.1899477243423462, + "learning_rate": 4.87449294090499e-05, + "loss": 0.3671, + "num_input_tokens_seen": 14884872, + "step": 25670 + }, + { + "epoch": 3.824098897825439, + "grad_norm": 14.321659088134766, + "learning_rate": 4.874391257400083e-05, + "loss": 0.4687, + "num_input_tokens_seen": 14887688, + "step": 25675 + }, + { + "epoch": 3.8248436103663987, + "grad_norm": 0.9097030758857727, + "learning_rate": 4.874289533782247e-05, + "loss": 0.3623, + "num_input_tokens_seen": 14890600, + "step": 25680 + }, + { + "epoch": 3.8255883229073575, + "grad_norm": 12.926660537719727, + "learning_rate": 4.874187770053204e-05, + "loss": 0.474, + "num_input_tokens_seen": 14893416, + "step": 25685 + }, + { + "epoch": 3.826333035448317, + "grad_norm": 13.808107376098633, + "learning_rate": 4.874085966214671e-05, + "loss": 0.6326, + "num_input_tokens_seen": 14896360, + "step": 25690 + }, + { + "epoch": 3.827077747989276, + "grad_norm": 11.381433486938477, + "learning_rate": 4.873984122268369e-05, + "loss": 0.435, + "num_input_tokens_seen": 14899368, + "step": 25695 + }, + { + "epoch": 3.8278224605302356, + "grad_norm": 12.140795707702637, + "learning_rate": 4.873882238216017e-05, + "loss": 0.5101, + "num_input_tokens_seen": 14902184, + "step": 25700 + }, + { + "epoch": 3.8285671730711943, + "grad_norm": 13.138545989990234, + "learning_rate": 4.873780314059338e-05, + "loss": 0.6801, + "num_input_tokens_seen": 14905224, + "step": 25705 + }, + { + "epoch": 3.829311885612154, + "grad_norm": 10.613889694213867, + "learning_rate": 4.873678349800054e-05, + "loss": 0.8036, + "num_input_tokens_seen": 14908200, + "step": 25710 + }, + { + "epoch": 3.8300565981531127, + "grad_norm": 13.632935523986816, + "learning_rate": 4.873576345439886e-05, + "loss": 0.3637, + "num_input_tokens_seen": 14910888, + "step": 25715 + }, + { + "epoch": 3.830801310694072, + "grad_norm": 2.8459362983703613, + "learning_rate": 4.873474300980558e-05, + "loss": 0.3466, + "num_input_tokens_seen": 14913704, + "step": 25720 + }, + { + "epoch": 3.831546023235031, + "grad_norm": 6.447680950164795, + "learning_rate": 4.873372216423794e-05, + "loss": 0.2747, + "num_input_tokens_seen": 14916584, + "step": 25725 + }, + { + "epoch": 3.8322907357759903, + "grad_norm": 15.598708152770996, + "learning_rate": 4.8732700917713186e-05, + "loss": 0.6609, + "num_input_tokens_seen": 14919240, + "step": 25730 + }, + { + "epoch": 3.8330354483169495, + "grad_norm": 11.598516464233398, + "learning_rate": 4.8731679270248575e-05, + "loss": 0.5027, + "num_input_tokens_seen": 14922120, + "step": 25735 + }, + { + "epoch": 3.8337801608579087, + "grad_norm": 8.709067344665527, + "learning_rate": 4.8730657221861354e-05, + "loss": 0.4621, + "num_input_tokens_seen": 14925192, + "step": 25740 + }, + { + "epoch": 3.834524873398868, + "grad_norm": 4.959622859954834, + "learning_rate": 4.8729634772568805e-05, + "loss": 0.4569, + "num_input_tokens_seen": 14928424, + "step": 25745 + }, + { + "epoch": 3.835269585939827, + "grad_norm": 12.843873977661133, + "learning_rate": 4.872861192238819e-05, + "loss": 0.2837, + "num_input_tokens_seen": 14931432, + "step": 25750 + }, + { + "epoch": 3.8360142984807863, + "grad_norm": 15.500310897827148, + "learning_rate": 4.87275886713368e-05, + "loss": 0.6637, + "num_input_tokens_seen": 14934376, + "step": 25755 + }, + { + "epoch": 3.8367590110217455, + "grad_norm": 11.241668701171875, + "learning_rate": 4.872656501943191e-05, + "loss": 0.331, + "num_input_tokens_seen": 14937352, + "step": 25760 + }, + { + "epoch": 3.8375037235627047, + "grad_norm": 10.667028427124023, + "learning_rate": 4.872554096669082e-05, + "loss": 0.3183, + "num_input_tokens_seen": 14940200, + "step": 25765 + }, + { + "epoch": 3.838248436103664, + "grad_norm": 16.007280349731445, + "learning_rate": 4.8724516513130826e-05, + "loss": 0.6983, + "num_input_tokens_seen": 14943432, + "step": 25770 + }, + { + "epoch": 3.838993148644623, + "grad_norm": 6.5326080322265625, + "learning_rate": 4.872349165876924e-05, + "loss": 0.4347, + "num_input_tokens_seen": 14946120, + "step": 25775 + }, + { + "epoch": 3.8397378611855824, + "grad_norm": 14.250378608703613, + "learning_rate": 4.872246640362337e-05, + "loss": 0.3654, + "num_input_tokens_seen": 14949224, + "step": 25780 + }, + { + "epoch": 3.8404825737265416, + "grad_norm": 5.743088722229004, + "learning_rate": 4.872144074771054e-05, + "loss": 0.2818, + "num_input_tokens_seen": 14951912, + "step": 25785 + }, + { + "epoch": 3.8412272862675008, + "grad_norm": 11.902946472167969, + "learning_rate": 4.872041469104809e-05, + "loss": 0.3555, + "num_input_tokens_seen": 14954792, + "step": 25790 + }, + { + "epoch": 3.84197199880846, + "grad_norm": 24.75139617919922, + "learning_rate": 4.871938823365333e-05, + "loss": 0.657, + "num_input_tokens_seen": 14957928, + "step": 25795 + }, + { + "epoch": 3.842716711349419, + "grad_norm": 5.431997776031494, + "learning_rate": 4.871836137554362e-05, + "loss": 0.3296, + "num_input_tokens_seen": 14960904, + "step": 25800 + }, + { + "epoch": 3.8434614238903784, + "grad_norm": 6.392638206481934, + "learning_rate": 4.8717334116736293e-05, + "loss": 0.6437, + "num_input_tokens_seen": 14963944, + "step": 25805 + }, + { + "epoch": 3.8442061364313376, + "grad_norm": 24.799394607543945, + "learning_rate": 4.8716306457248717e-05, + "loss": 0.5693, + "num_input_tokens_seen": 14966728, + "step": 25810 + }, + { + "epoch": 3.8449508489722968, + "grad_norm": 9.728927612304688, + "learning_rate": 4.871527839709825e-05, + "loss": 0.3982, + "num_input_tokens_seen": 14969320, + "step": 25815 + }, + { + "epoch": 3.845695561513256, + "grad_norm": 5.526030540466309, + "learning_rate": 4.871424993630226e-05, + "loss": 0.1903, + "num_input_tokens_seen": 14973576, + "step": 25820 + }, + { + "epoch": 3.846440274054215, + "grad_norm": 29.003849029541016, + "learning_rate": 4.871322107487811e-05, + "loss": 0.1628, + "num_input_tokens_seen": 14976488, + "step": 25825 + }, + { + "epoch": 3.8471849865951744, + "grad_norm": 4.498303413391113, + "learning_rate": 4.8712191812843194e-05, + "loss": 0.578, + "num_input_tokens_seen": 14979528, + "step": 25830 + }, + { + "epoch": 3.8479296991361336, + "grad_norm": 17.807373046875, + "learning_rate": 4.87111621502149e-05, + "loss": 0.7756, + "num_input_tokens_seen": 14982536, + "step": 25835 + }, + { + "epoch": 3.848674411677093, + "grad_norm": 19.39900779724121, + "learning_rate": 4.871013208701062e-05, + "loss": 0.6415, + "num_input_tokens_seen": 14985640, + "step": 25840 + }, + { + "epoch": 3.849419124218052, + "grad_norm": 7.9176483154296875, + "learning_rate": 4.870910162324776e-05, + "loss": 0.5564, + "num_input_tokens_seen": 14988584, + "step": 25845 + }, + { + "epoch": 3.8501638367590107, + "grad_norm": 27.239513397216797, + "learning_rate": 4.8708070758943716e-05, + "loss": 0.3849, + "num_input_tokens_seen": 14991432, + "step": 25850 + }, + { + "epoch": 3.8509085492999704, + "grad_norm": 8.764863967895508, + "learning_rate": 4.870703949411591e-05, + "loss": 0.3792, + "num_input_tokens_seen": 14994536, + "step": 25855 + }, + { + "epoch": 3.851653261840929, + "grad_norm": 6.130487442016602, + "learning_rate": 4.8706007828781776e-05, + "loss": 0.2404, + "num_input_tokens_seen": 14997512, + "step": 25860 + }, + { + "epoch": 3.852397974381889, + "grad_norm": 20.864274978637695, + "learning_rate": 4.8704975762958734e-05, + "loss": 0.5366, + "num_input_tokens_seen": 15000264, + "step": 25865 + }, + { + "epoch": 3.8531426869228476, + "grad_norm": 15.868950843811035, + "learning_rate": 4.8703943296664214e-05, + "loss": 0.3985, + "num_input_tokens_seen": 15003272, + "step": 25870 + }, + { + "epoch": 3.853887399463807, + "grad_norm": 17.75833511352539, + "learning_rate": 4.8702910429915663e-05, + "loss": 0.3495, + "num_input_tokens_seen": 15006024, + "step": 25875 + }, + { + "epoch": 3.854632112004766, + "grad_norm": 2.188629150390625, + "learning_rate": 4.870187716273054e-05, + "loss": 0.3363, + "num_input_tokens_seen": 15008808, + "step": 25880 + }, + { + "epoch": 3.8553768245457256, + "grad_norm": 11.794265747070312, + "learning_rate": 4.870084349512628e-05, + "loss": 0.6307, + "num_input_tokens_seen": 15011752, + "step": 25885 + }, + { + "epoch": 3.8561215370866844, + "grad_norm": 8.162751197814941, + "learning_rate": 4.8699809427120364e-05, + "loss": 0.3356, + "num_input_tokens_seen": 15014536, + "step": 25890 + }, + { + "epoch": 3.8568662496276436, + "grad_norm": 5.646932125091553, + "learning_rate": 4.869877495873025e-05, + "loss": 0.4388, + "num_input_tokens_seen": 15017736, + "step": 25895 + }, + { + "epoch": 3.8576109621686028, + "grad_norm": 4.558351993560791, + "learning_rate": 4.869774008997343e-05, + "loss": 0.3928, + "num_input_tokens_seen": 15020744, + "step": 25900 + }, + { + "epoch": 3.858355674709562, + "grad_norm": 6.344761371612549, + "learning_rate": 4.869670482086737e-05, + "loss": 0.5769, + "num_input_tokens_seen": 15023688, + "step": 25905 + }, + { + "epoch": 3.859100387250521, + "grad_norm": 21.08473777770996, + "learning_rate": 4.869566915142956e-05, + "loss": 0.6703, + "num_input_tokens_seen": 15026664, + "step": 25910 + }, + { + "epoch": 3.8598450997914804, + "grad_norm": 5.317467212677002, + "learning_rate": 4.8694633081677507e-05, + "loss": 0.4163, + "num_input_tokens_seen": 15029512, + "step": 25915 + }, + { + "epoch": 3.8605898123324396, + "grad_norm": 5.33976411819458, + "learning_rate": 4.869359661162871e-05, + "loss": 0.4248, + "num_input_tokens_seen": 15032456, + "step": 25920 + }, + { + "epoch": 3.861334524873399, + "grad_norm": 14.429243087768555, + "learning_rate": 4.869255974130068e-05, + "loss": 0.4718, + "num_input_tokens_seen": 15035720, + "step": 25925 + }, + { + "epoch": 3.862079237414358, + "grad_norm": 13.008408546447754, + "learning_rate": 4.869152247071094e-05, + "loss": 0.2758, + "num_input_tokens_seen": 15038536, + "step": 25930 + }, + { + "epoch": 3.862823949955317, + "grad_norm": 6.653022289276123, + "learning_rate": 4.8690484799877004e-05, + "loss": 0.491, + "num_input_tokens_seen": 15041352, + "step": 25935 + }, + { + "epoch": 3.8635686624962764, + "grad_norm": 9.519402503967285, + "learning_rate": 4.86894467288164e-05, + "loss": 0.3745, + "num_input_tokens_seen": 15044616, + "step": 25940 + }, + { + "epoch": 3.8643133750372356, + "grad_norm": 14.596129417419434, + "learning_rate": 4.868840825754667e-05, + "loss": 0.3437, + "num_input_tokens_seen": 15047496, + "step": 25945 + }, + { + "epoch": 3.865058087578195, + "grad_norm": 12.514973640441895, + "learning_rate": 4.868736938608536e-05, + "loss": 0.2823, + "num_input_tokens_seen": 15050376, + "step": 25950 + }, + { + "epoch": 3.865802800119154, + "grad_norm": 6.808320045471191, + "learning_rate": 4.8686330114450025e-05, + "loss": 0.4265, + "num_input_tokens_seen": 15053480, + "step": 25955 + }, + { + "epoch": 3.866547512660113, + "grad_norm": 17.688764572143555, + "learning_rate": 4.868529044265821e-05, + "loss": 0.6215, + "num_input_tokens_seen": 15056328, + "step": 25960 + }, + { + "epoch": 3.8672922252010724, + "grad_norm": 13.195158958435059, + "learning_rate": 4.868425037072749e-05, + "loss": 0.2831, + "num_input_tokens_seen": 15059080, + "step": 25965 + }, + { + "epoch": 3.8680369377420316, + "grad_norm": 7.793754577636719, + "learning_rate": 4.868320989867543e-05, + "loss": 0.2304, + "num_input_tokens_seen": 15061992, + "step": 25970 + }, + { + "epoch": 3.868781650282991, + "grad_norm": 13.05263614654541, + "learning_rate": 4.868216902651961e-05, + "loss": 0.4346, + "num_input_tokens_seen": 15064968, + "step": 25975 + }, + { + "epoch": 3.86952636282395, + "grad_norm": 5.137773513793945, + "learning_rate": 4.8681127754277606e-05, + "loss": 0.3723, + "num_input_tokens_seen": 15068040, + "step": 25980 + }, + { + "epoch": 3.870271075364909, + "grad_norm": 6.5425848960876465, + "learning_rate": 4.868008608196702e-05, + "loss": 0.4856, + "num_input_tokens_seen": 15070792, + "step": 25985 + }, + { + "epoch": 3.8710157879058684, + "grad_norm": 5.584671497344971, + "learning_rate": 4.8679044009605455e-05, + "loss": 0.5853, + "num_input_tokens_seen": 15073672, + "step": 25990 + }, + { + "epoch": 3.8717605004468276, + "grad_norm": 29.260377883911133, + "learning_rate": 4.867800153721051e-05, + "loss": 0.3781, + "num_input_tokens_seen": 15076872, + "step": 25995 + }, + { + "epoch": 3.872505212987787, + "grad_norm": 3.782252788543701, + "learning_rate": 4.867695866479978e-05, + "loss": 0.2959, + "num_input_tokens_seen": 15079880, + "step": 26000 + }, + { + "epoch": 3.873249925528746, + "grad_norm": 16.319133758544922, + "learning_rate": 4.86759153923909e-05, + "loss": 0.6427, + "num_input_tokens_seen": 15082696, + "step": 26005 + }, + { + "epoch": 3.8739946380697052, + "grad_norm": 0.7012577652931213, + "learning_rate": 4.86748717200015e-05, + "loss": 0.419, + "num_input_tokens_seen": 15085608, + "step": 26010 + }, + { + "epoch": 3.8747393506106644, + "grad_norm": 15.677506446838379, + "learning_rate": 4.8673827647649206e-05, + "loss": 0.5738, + "num_input_tokens_seen": 15088328, + "step": 26015 + }, + { + "epoch": 3.8754840631516236, + "grad_norm": 6.710115909576416, + "learning_rate": 4.867278317535164e-05, + "loss": 0.4534, + "num_input_tokens_seen": 15091240, + "step": 26020 + }, + { + "epoch": 3.8762287756925824, + "grad_norm": 9.624505043029785, + "learning_rate": 4.867173830312648e-05, + "loss": 0.3749, + "num_input_tokens_seen": 15094088, + "step": 26025 + }, + { + "epoch": 3.876973488233542, + "grad_norm": 11.415849685668945, + "learning_rate": 4.867069303099135e-05, + "loss": 0.3668, + "num_input_tokens_seen": 15097096, + "step": 26030 + }, + { + "epoch": 3.877718200774501, + "grad_norm": 3.432267427444458, + "learning_rate": 4.8669647358963924e-05, + "loss": 0.2861, + "num_input_tokens_seen": 15099784, + "step": 26035 + }, + { + "epoch": 3.8784629133154604, + "grad_norm": 11.16938304901123, + "learning_rate": 4.866860128706186e-05, + "loss": 0.4068, + "num_input_tokens_seen": 15102888, + "step": 26040 + }, + { + "epoch": 3.879207625856419, + "grad_norm": 5.203248977661133, + "learning_rate": 4.866755481530284e-05, + "loss": 0.6683, + "num_input_tokens_seen": 15105640, + "step": 26045 + }, + { + "epoch": 3.879952338397379, + "grad_norm": 13.64001178741455, + "learning_rate": 4.866650794370452e-05, + "loss": 0.2983, + "num_input_tokens_seen": 15108584, + "step": 26050 + }, + { + "epoch": 3.8806970509383376, + "grad_norm": 7.482985496520996, + "learning_rate": 4.866546067228461e-05, + "loss": 0.4324, + "num_input_tokens_seen": 15111496, + "step": 26055 + }, + { + "epoch": 3.8814417634792973, + "grad_norm": 9.155741691589355, + "learning_rate": 4.866441300106081e-05, + "loss": 0.3149, + "num_input_tokens_seen": 15114728, + "step": 26060 + }, + { + "epoch": 3.882186476020256, + "grad_norm": 19.941265106201172, + "learning_rate": 4.866336493005078e-05, + "loss": 0.618, + "num_input_tokens_seen": 15117608, + "step": 26065 + }, + { + "epoch": 3.882931188561215, + "grad_norm": 11.461898803710938, + "learning_rate": 4.866231645927226e-05, + "loss": 0.7007, + "num_input_tokens_seen": 15120456, + "step": 26070 + }, + { + "epoch": 3.8836759011021744, + "grad_norm": 1.7923320531845093, + "learning_rate": 4.866126758874295e-05, + "loss": 0.5388, + "num_input_tokens_seen": 15123272, + "step": 26075 + }, + { + "epoch": 3.8844206136431336, + "grad_norm": 4.52647590637207, + "learning_rate": 4.8660218318480574e-05, + "loss": 0.2961, + "num_input_tokens_seen": 15126216, + "step": 26080 + }, + { + "epoch": 3.885165326184093, + "grad_norm": 4.274024486541748, + "learning_rate": 4.865916864850286e-05, + "loss": 0.4021, + "num_input_tokens_seen": 15128936, + "step": 26085 + }, + { + "epoch": 3.885910038725052, + "grad_norm": 6.002283573150635, + "learning_rate": 4.865811857882754e-05, + "loss": 0.4387, + "num_input_tokens_seen": 15131432, + "step": 26090 + }, + { + "epoch": 3.8866547512660112, + "grad_norm": 13.605327606201172, + "learning_rate": 4.8657068109472345e-05, + "loss": 0.508, + "num_input_tokens_seen": 15134536, + "step": 26095 + }, + { + "epoch": 3.8873994638069704, + "grad_norm": 1.1962512731552124, + "learning_rate": 4.8656017240455025e-05, + "loss": 0.3725, + "num_input_tokens_seen": 15137320, + "step": 26100 + }, + { + "epoch": 3.8881441763479296, + "grad_norm": 4.9241042137146, + "learning_rate": 4.865496597179334e-05, + "loss": 0.5103, + "num_input_tokens_seen": 15140200, + "step": 26105 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 10.600468635559082, + "learning_rate": 4.8653914303505054e-05, + "loss": 0.4634, + "num_input_tokens_seen": 15143272, + "step": 26110 + }, + { + "epoch": 3.889633601429848, + "grad_norm": 21.982677459716797, + "learning_rate": 4.865286223560792e-05, + "loss": 0.5871, + "num_input_tokens_seen": 15146088, + "step": 26115 + }, + { + "epoch": 3.8903783139708072, + "grad_norm": 10.510473251342773, + "learning_rate": 4.865180976811972e-05, + "loss": 0.3164, + "num_input_tokens_seen": 15148936, + "step": 26120 + }, + { + "epoch": 3.8911230265117664, + "grad_norm": 18.036865234375, + "learning_rate": 4.8650756901058225e-05, + "loss": 0.5174, + "num_input_tokens_seen": 15152040, + "step": 26125 + }, + { + "epoch": 3.8918677390527256, + "grad_norm": 10.684965133666992, + "learning_rate": 4.864970363444124e-05, + "loss": 0.2717, + "num_input_tokens_seen": 15155080, + "step": 26130 + }, + { + "epoch": 3.892612451593685, + "grad_norm": 0.9743322730064392, + "learning_rate": 4.864864996828654e-05, + "loss": 0.2943, + "num_input_tokens_seen": 15157768, + "step": 26135 + }, + { + "epoch": 3.893357164134644, + "grad_norm": 2.27021861076355, + "learning_rate": 4.864759590261194e-05, + "loss": 0.4112, + "num_input_tokens_seen": 15160488, + "step": 26140 + }, + { + "epoch": 3.8941018766756033, + "grad_norm": 8.408170700073242, + "learning_rate": 4.8646541437435246e-05, + "loss": 0.4038, + "num_input_tokens_seen": 15163336, + "step": 26145 + }, + { + "epoch": 3.8948465892165625, + "grad_norm": 10.58103084564209, + "learning_rate": 4.8645486572774266e-05, + "loss": 0.4596, + "num_input_tokens_seen": 15166376, + "step": 26150 + }, + { + "epoch": 3.8955913017575217, + "grad_norm": 3.547830820083618, + "learning_rate": 4.8644431308646815e-05, + "loss": 0.4426, + "num_input_tokens_seen": 15169448, + "step": 26155 + }, + { + "epoch": 3.896336014298481, + "grad_norm": 11.639758110046387, + "learning_rate": 4.8643375645070735e-05, + "loss": 0.3991, + "num_input_tokens_seen": 15172392, + "step": 26160 + }, + { + "epoch": 3.89708072683944, + "grad_norm": 13.834491729736328, + "learning_rate": 4.864231958206384e-05, + "loss": 0.5334, + "num_input_tokens_seen": 15176008, + "step": 26165 + }, + { + "epoch": 3.8978254393803993, + "grad_norm": 3.4569873809814453, + "learning_rate": 4.8641263119644004e-05, + "loss": 0.288, + "num_input_tokens_seen": 15178856, + "step": 26170 + }, + { + "epoch": 3.8985701519213585, + "grad_norm": 10.434660911560059, + "learning_rate": 4.864020625782905e-05, + "loss": 0.7516, + "num_input_tokens_seen": 15181512, + "step": 26175 + }, + { + "epoch": 3.8993148644623177, + "grad_norm": 15.874046325683594, + "learning_rate": 4.863914899663683e-05, + "loss": 0.2832, + "num_input_tokens_seen": 15184392, + "step": 26180 + }, + { + "epoch": 3.900059577003277, + "grad_norm": 5.40805196762085, + "learning_rate": 4.8638091336085224e-05, + "loss": 0.2381, + "num_input_tokens_seen": 15187272, + "step": 26185 + }, + { + "epoch": 3.900804289544236, + "grad_norm": 7.776261329650879, + "learning_rate": 4.863703327619208e-05, + "loss": 0.389, + "num_input_tokens_seen": 15189960, + "step": 26190 + }, + { + "epoch": 3.9015490020851953, + "grad_norm": 13.23540210723877, + "learning_rate": 4.863597481697528e-05, + "loss": 0.3587, + "num_input_tokens_seen": 15192648, + "step": 26195 + }, + { + "epoch": 3.902293714626154, + "grad_norm": 6.829612731933594, + "learning_rate": 4.8634915958452724e-05, + "loss": 0.4901, + "num_input_tokens_seen": 15195368, + "step": 26200 + }, + { + "epoch": 3.9030384271671137, + "grad_norm": 12.180083274841309, + "learning_rate": 4.863385670064227e-05, + "loss": 0.4749, + "num_input_tokens_seen": 15198472, + "step": 26205 + }, + { + "epoch": 3.9037831397080724, + "grad_norm": 12.980423927307129, + "learning_rate": 4.863279704356183e-05, + "loss": 0.3395, + "num_input_tokens_seen": 15201256, + "step": 26210 + }, + { + "epoch": 3.904527852249032, + "grad_norm": 29.948041915893555, + "learning_rate": 4.863173698722931e-05, + "loss": 0.5017, + "num_input_tokens_seen": 15204328, + "step": 26215 + }, + { + "epoch": 3.905272564789991, + "grad_norm": 0.13477498292922974, + "learning_rate": 4.863067653166261e-05, + "loss": 0.1997, + "num_input_tokens_seen": 15207272, + "step": 26220 + }, + { + "epoch": 3.9060172773309505, + "grad_norm": 19.20974349975586, + "learning_rate": 4.8629615676879634e-05, + "loss": 0.4941, + "num_input_tokens_seen": 15210376, + "step": 26225 + }, + { + "epoch": 3.9067619898719093, + "grad_norm": 13.974281311035156, + "learning_rate": 4.8628554422898334e-05, + "loss": 0.4551, + "num_input_tokens_seen": 15213256, + "step": 26230 + }, + { + "epoch": 3.907506702412869, + "grad_norm": 13.076735496520996, + "learning_rate": 4.8627492769736616e-05, + "loss": 0.4339, + "num_input_tokens_seen": 15216072, + "step": 26235 + }, + { + "epoch": 3.9082514149538277, + "grad_norm": 11.037123680114746, + "learning_rate": 4.862643071741242e-05, + "loss": 0.4263, + "num_input_tokens_seen": 15219176, + "step": 26240 + }, + { + "epoch": 3.908996127494787, + "grad_norm": 12.176637649536133, + "learning_rate": 4.8625368265943696e-05, + "loss": 0.3255, + "num_input_tokens_seen": 15222408, + "step": 26245 + }, + { + "epoch": 3.909740840035746, + "grad_norm": 5.939245223999023, + "learning_rate": 4.8624305415348374e-05, + "loss": 0.4127, + "num_input_tokens_seen": 15225384, + "step": 26250 + }, + { + "epoch": 3.9104855525767053, + "grad_norm": 9.089250564575195, + "learning_rate": 4.8623242165644436e-05, + "loss": 0.4611, + "num_input_tokens_seen": 15228264, + "step": 26255 + }, + { + "epoch": 3.9112302651176645, + "grad_norm": 10.06890869140625, + "learning_rate": 4.8622178516849824e-05, + "loss": 0.5269, + "num_input_tokens_seen": 15231176, + "step": 26260 + }, + { + "epoch": 3.9119749776586237, + "grad_norm": 13.459753036499023, + "learning_rate": 4.862111446898252e-05, + "loss": 0.4363, + "num_input_tokens_seen": 15233960, + "step": 26265 + }, + { + "epoch": 3.912719690199583, + "grad_norm": 14.311978340148926, + "learning_rate": 4.862005002206049e-05, + "loss": 0.294, + "num_input_tokens_seen": 15236584, + "step": 26270 + }, + { + "epoch": 3.913464402740542, + "grad_norm": 7.164458751678467, + "learning_rate": 4.8618985176101716e-05, + "loss": 0.4925, + "num_input_tokens_seen": 15239528, + "step": 26275 + }, + { + "epoch": 3.9142091152815013, + "grad_norm": 7.174775123596191, + "learning_rate": 4.86179199311242e-05, + "loss": 0.3412, + "num_input_tokens_seen": 15242728, + "step": 26280 + }, + { + "epoch": 3.9149538278224605, + "grad_norm": 8.011117935180664, + "learning_rate": 4.861685428714593e-05, + "loss": 0.2787, + "num_input_tokens_seen": 15245352, + "step": 26285 + }, + { + "epoch": 3.9156985403634197, + "grad_norm": 13.648384094238281, + "learning_rate": 4.861578824418491e-05, + "loss": 0.2793, + "num_input_tokens_seen": 15248360, + "step": 26290 + }, + { + "epoch": 3.916443252904379, + "grad_norm": 12.60913372039795, + "learning_rate": 4.861472180225915e-05, + "loss": 0.4555, + "num_input_tokens_seen": 15251112, + "step": 26295 + }, + { + "epoch": 3.917187965445338, + "grad_norm": 11.446046829223633, + "learning_rate": 4.861365496138667e-05, + "loss": 0.2791, + "num_input_tokens_seen": 15253736, + "step": 26300 + }, + { + "epoch": 3.9179326779862973, + "grad_norm": 34.21271514892578, + "learning_rate": 4.861258772158548e-05, + "loss": 0.5215, + "num_input_tokens_seen": 15256648, + "step": 26305 + }, + { + "epoch": 3.9186773905272565, + "grad_norm": 12.136170387268066, + "learning_rate": 4.861152008287362e-05, + "loss": 0.5952, + "num_input_tokens_seen": 15259624, + "step": 26310 + }, + { + "epoch": 3.9194221030682157, + "grad_norm": 17.790401458740234, + "learning_rate": 4.861045204526913e-05, + "loss": 0.4814, + "num_input_tokens_seen": 15262696, + "step": 26315 + }, + { + "epoch": 3.920166815609175, + "grad_norm": 2.7647290229797363, + "learning_rate": 4.8609383608790046e-05, + "loss": 0.2098, + "num_input_tokens_seen": 15265352, + "step": 26320 + }, + { + "epoch": 3.920911528150134, + "grad_norm": 14.02996826171875, + "learning_rate": 4.860831477345443e-05, + "loss": 0.5911, + "num_input_tokens_seen": 15268200, + "step": 26325 + }, + { + "epoch": 3.9216562406910933, + "grad_norm": 5.9926438331604, + "learning_rate": 4.860724553928032e-05, + "loss": 0.2447, + "num_input_tokens_seen": 15271240, + "step": 26330 + }, + { + "epoch": 3.9224009532320525, + "grad_norm": 32.242034912109375, + "learning_rate": 4.86061759062858e-05, + "loss": 0.4577, + "num_input_tokens_seen": 15274472, + "step": 26335 + }, + { + "epoch": 3.9231456657730117, + "grad_norm": 15.123550415039062, + "learning_rate": 4.8605105874488924e-05, + "loss": 0.512, + "num_input_tokens_seen": 15277544, + "step": 26340 + }, + { + "epoch": 3.923890378313971, + "grad_norm": 24.2602481842041, + "learning_rate": 4.8604035443907775e-05, + "loss": 0.4729, + "num_input_tokens_seen": 15280456, + "step": 26345 + }, + { + "epoch": 3.92463509085493, + "grad_norm": 29.573820114135742, + "learning_rate": 4.860296461456044e-05, + "loss": 0.5872, + "num_input_tokens_seen": 15283560, + "step": 26350 + }, + { + "epoch": 3.9253798033958893, + "grad_norm": 15.577838897705078, + "learning_rate": 4.8601893386465e-05, + "loss": 0.4142, + "num_input_tokens_seen": 15286568, + "step": 26355 + }, + { + "epoch": 3.9261245159368485, + "grad_norm": 13.556657791137695, + "learning_rate": 4.860082175963957e-05, + "loss": 0.3357, + "num_input_tokens_seen": 15289160, + "step": 26360 + }, + { + "epoch": 3.9268692284778077, + "grad_norm": 6.033516883850098, + "learning_rate": 4.859974973410224e-05, + "loss": 0.332, + "num_input_tokens_seen": 15292296, + "step": 26365 + }, + { + "epoch": 3.927613941018767, + "grad_norm": 12.41026782989502, + "learning_rate": 4.8598677309871123e-05, + "loss": 0.35, + "num_input_tokens_seen": 15295304, + "step": 26370 + }, + { + "epoch": 3.9283586535597257, + "grad_norm": 8.209663391113281, + "learning_rate": 4.859760448696433e-05, + "loss": 0.364, + "num_input_tokens_seen": 15297992, + "step": 26375 + }, + { + "epoch": 3.9291033661006853, + "grad_norm": 11.462959289550781, + "learning_rate": 4.85965312654e-05, + "loss": 0.4681, + "num_input_tokens_seen": 15301128, + "step": 26380 + }, + { + "epoch": 3.929848078641644, + "grad_norm": 16.12947654724121, + "learning_rate": 4.859545764519625e-05, + "loss": 0.4051, + "num_input_tokens_seen": 15304072, + "step": 26385 + }, + { + "epoch": 3.9305927911826037, + "grad_norm": 4.250662326812744, + "learning_rate": 4.859438362637123e-05, + "loss": 0.5064, + "num_input_tokens_seen": 15306920, + "step": 26390 + }, + { + "epoch": 3.9313375037235625, + "grad_norm": 7.391462802886963, + "learning_rate": 4.8593309208943085e-05, + "loss": 0.2493, + "num_input_tokens_seen": 15309960, + "step": 26395 + }, + { + "epoch": 3.932082216264522, + "grad_norm": 8.574335098266602, + "learning_rate": 4.859223439292995e-05, + "loss": 0.5453, + "num_input_tokens_seen": 15313032, + "step": 26400 + }, + { + "epoch": 3.932826928805481, + "grad_norm": 5.783840179443359, + "learning_rate": 4.859115917835e-05, + "loss": 0.3669, + "num_input_tokens_seen": 15315816, + "step": 26405 + }, + { + "epoch": 3.9335716413464406, + "grad_norm": 10.44391918182373, + "learning_rate": 4.859008356522139e-05, + "loss": 0.3716, + "num_input_tokens_seen": 15319080, + "step": 26410 + }, + { + "epoch": 3.9343163538873993, + "grad_norm": 4.080263137817383, + "learning_rate": 4.8589007553562293e-05, + "loss": 0.3621, + "num_input_tokens_seen": 15322088, + "step": 26415 + }, + { + "epoch": 3.9350610664283585, + "grad_norm": 12.785852432250977, + "learning_rate": 4.858793114339089e-05, + "loss": 0.5079, + "num_input_tokens_seen": 15325256, + "step": 26420 + }, + { + "epoch": 3.9358057789693177, + "grad_norm": 13.71063232421875, + "learning_rate": 4.8586854334725365e-05, + "loss": 0.455, + "num_input_tokens_seen": 15328232, + "step": 26425 + }, + { + "epoch": 3.936550491510277, + "grad_norm": 30.184518814086914, + "learning_rate": 4.8585777127583906e-05, + "loss": 0.4156, + "num_input_tokens_seen": 15330984, + "step": 26430 + }, + { + "epoch": 3.937295204051236, + "grad_norm": 9.415228843688965, + "learning_rate": 4.858469952198471e-05, + "loss": 0.3925, + "num_input_tokens_seen": 15333768, + "step": 26435 + }, + { + "epoch": 3.9380399165921953, + "grad_norm": 25.50408172607422, + "learning_rate": 4.8583621517945995e-05, + "loss": 0.6653, + "num_input_tokens_seen": 15336712, + "step": 26440 + }, + { + "epoch": 3.9387846291331545, + "grad_norm": 10.61294937133789, + "learning_rate": 4.858254311548596e-05, + "loss": 0.4678, + "num_input_tokens_seen": 15339432, + "step": 26445 + }, + { + "epoch": 3.9395293416741137, + "grad_norm": 13.006932258605957, + "learning_rate": 4.858146431462283e-05, + "loss": 0.2961, + "num_input_tokens_seen": 15342024, + "step": 26450 + }, + { + "epoch": 3.940274054215073, + "grad_norm": 11.704257011413574, + "learning_rate": 4.858038511537482e-05, + "loss": 0.3744, + "num_input_tokens_seen": 15345096, + "step": 26455 + }, + { + "epoch": 3.941018766756032, + "grad_norm": 8.968038558959961, + "learning_rate": 4.857930551776017e-05, + "loss": 0.1526, + "num_input_tokens_seen": 15347848, + "step": 26460 + }, + { + "epoch": 3.9417634792969913, + "grad_norm": 0.6313516497612, + "learning_rate": 4.857822552179713e-05, + "loss": 0.4259, + "num_input_tokens_seen": 15350792, + "step": 26465 + }, + { + "epoch": 3.9425081918379505, + "grad_norm": 33.3843879699707, + "learning_rate": 4.857714512750392e-05, + "loss": 0.598, + "num_input_tokens_seen": 15354056, + "step": 26470 + }, + { + "epoch": 3.9432529043789097, + "grad_norm": 6.686774730682373, + "learning_rate": 4.857606433489881e-05, + "loss": 0.3243, + "num_input_tokens_seen": 15357384, + "step": 26475 + }, + { + "epoch": 3.943997616919869, + "grad_norm": 22.16776466369629, + "learning_rate": 4.8574983144000055e-05, + "loss": 0.5344, + "num_input_tokens_seen": 15360392, + "step": 26480 + }, + { + "epoch": 3.944742329460828, + "grad_norm": 5.969074249267578, + "learning_rate": 4.8573901554825915e-05, + "loss": 0.2744, + "num_input_tokens_seen": 15363496, + "step": 26485 + }, + { + "epoch": 3.9454870420017873, + "grad_norm": 17.939550399780273, + "learning_rate": 4.857281956739468e-05, + "loss": 0.4014, + "num_input_tokens_seen": 15366248, + "step": 26490 + }, + { + "epoch": 3.9462317545427466, + "grad_norm": 4.337984561920166, + "learning_rate": 4.8571737181724606e-05, + "loss": 0.4254, + "num_input_tokens_seen": 15369160, + "step": 26495 + }, + { + "epoch": 3.9469764670837058, + "grad_norm": 19.775293350219727, + "learning_rate": 4.8570654397834e-05, + "loss": 0.6325, + "num_input_tokens_seen": 15372040, + "step": 26500 + }, + { + "epoch": 3.947721179624665, + "grad_norm": 23.191720962524414, + "learning_rate": 4.856957121574114e-05, + "loss": 0.6168, + "num_input_tokens_seen": 15375048, + "step": 26505 + }, + { + "epoch": 3.948465892165624, + "grad_norm": 9.100627899169922, + "learning_rate": 4.856848763546433e-05, + "loss": 0.4985, + "num_input_tokens_seen": 15378056, + "step": 26510 + }, + { + "epoch": 3.9492106047065834, + "grad_norm": 12.246919631958008, + "learning_rate": 4.856740365702187e-05, + "loss": 0.4059, + "num_input_tokens_seen": 15381000, + "step": 26515 + }, + { + "epoch": 3.9499553172475426, + "grad_norm": 5.83845329284668, + "learning_rate": 4.8566319280432085e-05, + "loss": 0.5131, + "num_input_tokens_seen": 15384072, + "step": 26520 + }, + { + "epoch": 3.9507000297885018, + "grad_norm": 12.214409828186035, + "learning_rate": 4.8565234505713276e-05, + "loss": 0.259, + "num_input_tokens_seen": 15387144, + "step": 26525 + }, + { + "epoch": 3.951444742329461, + "grad_norm": 13.87551212310791, + "learning_rate": 4.856414933288379e-05, + "loss": 0.4117, + "num_input_tokens_seen": 15390184, + "step": 26530 + }, + { + "epoch": 3.95218945487042, + "grad_norm": 5.487908363342285, + "learning_rate": 4.856306376196195e-05, + "loss": 0.431, + "num_input_tokens_seen": 15393416, + "step": 26535 + }, + { + "epoch": 3.9529341674113794, + "grad_norm": 46.90226364135742, + "learning_rate": 4.856197779296609e-05, + "loss": 0.3992, + "num_input_tokens_seen": 15396552, + "step": 26540 + }, + { + "epoch": 3.9536788799523386, + "grad_norm": 1.1309226751327515, + "learning_rate": 4.856089142591457e-05, + "loss": 0.1127, + "num_input_tokens_seen": 15399176, + "step": 26545 + }, + { + "epoch": 3.9544235924932973, + "grad_norm": 5.576254367828369, + "learning_rate": 4.855980466082574e-05, + "loss": 0.2565, + "num_input_tokens_seen": 15402184, + "step": 26550 + }, + { + "epoch": 3.955168305034257, + "grad_norm": 60.22212600708008, + "learning_rate": 4.855871749771794e-05, + "loss": 0.7338, + "num_input_tokens_seen": 15404840, + "step": 26555 + }, + { + "epoch": 3.9559130175752157, + "grad_norm": 29.270910263061523, + "learning_rate": 4.855762993660956e-05, + "loss": 0.4872, + "num_input_tokens_seen": 15407560, + "step": 26560 + }, + { + "epoch": 3.9566577301161754, + "grad_norm": 19.706134796142578, + "learning_rate": 4.855654197751896e-05, + "loss": 0.7649, + "num_input_tokens_seen": 15410408, + "step": 26565 + }, + { + "epoch": 3.957402442657134, + "grad_norm": 6.548169136047363, + "learning_rate": 4.855545362046454e-05, + "loss": 0.2486, + "num_input_tokens_seen": 15413192, + "step": 26570 + }, + { + "epoch": 3.958147155198094, + "grad_norm": 17.423208236694336, + "learning_rate": 4.855436486546466e-05, + "loss": 0.5201, + "num_input_tokens_seen": 15415944, + "step": 26575 + }, + { + "epoch": 3.9588918677390526, + "grad_norm": 12.365945816040039, + "learning_rate": 4.855327571253773e-05, + "loss": 0.652, + "num_input_tokens_seen": 15418600, + "step": 26580 + }, + { + "epoch": 3.9596365802800118, + "grad_norm": 2.5490317344665527, + "learning_rate": 4.855218616170214e-05, + "loss": 0.3574, + "num_input_tokens_seen": 15421416, + "step": 26585 + }, + { + "epoch": 3.960381292820971, + "grad_norm": 5.106264591217041, + "learning_rate": 4.855109621297631e-05, + "loss": 0.4191, + "num_input_tokens_seen": 15424392, + "step": 26590 + }, + { + "epoch": 3.96112600536193, + "grad_norm": 13.033598899841309, + "learning_rate": 4.855000586637864e-05, + "loss": 0.3165, + "num_input_tokens_seen": 15427336, + "step": 26595 + }, + { + "epoch": 3.9618707179028894, + "grad_norm": 2.6522626876831055, + "learning_rate": 4.854891512192755e-05, + "loss": 0.3816, + "num_input_tokens_seen": 15430536, + "step": 26600 + }, + { + "epoch": 3.9626154304438486, + "grad_norm": 13.815646171569824, + "learning_rate": 4.8547823979641484e-05, + "loss": 0.468, + "num_input_tokens_seen": 15433480, + "step": 26605 + }, + { + "epoch": 3.9633601429848078, + "grad_norm": 15.012660026550293, + "learning_rate": 4.854673243953886e-05, + "loss": 0.5238, + "num_input_tokens_seen": 15436648, + "step": 26610 + }, + { + "epoch": 3.964104855525767, + "grad_norm": 12.959755897521973, + "learning_rate": 4.854564050163812e-05, + "loss": 0.4581, + "num_input_tokens_seen": 15439592, + "step": 26615 + }, + { + "epoch": 3.964849568066726, + "grad_norm": 1.0236493349075317, + "learning_rate": 4.854454816595773e-05, + "loss": 0.2788, + "num_input_tokens_seen": 15442408, + "step": 26620 + }, + { + "epoch": 3.9655942806076854, + "grad_norm": 18.220605850219727, + "learning_rate": 4.854345543251611e-05, + "loss": 0.3733, + "num_input_tokens_seen": 15445160, + "step": 26625 + }, + { + "epoch": 3.9663389931486446, + "grad_norm": 10.508269309997559, + "learning_rate": 4.854236230133175e-05, + "loss": 0.4608, + "num_input_tokens_seen": 15447944, + "step": 26630 + }, + { + "epoch": 3.967083705689604, + "grad_norm": 27.930408477783203, + "learning_rate": 4.85412687724231e-05, + "loss": 0.5938, + "num_input_tokens_seen": 15450824, + "step": 26635 + }, + { + "epoch": 3.967828418230563, + "grad_norm": 9.11245346069336, + "learning_rate": 4.854017484580864e-05, + "loss": 0.4235, + "num_input_tokens_seen": 15453640, + "step": 26640 + }, + { + "epoch": 3.968573130771522, + "grad_norm": 7.354598045349121, + "learning_rate": 4.853908052150685e-05, + "loss": 0.4927, + "num_input_tokens_seen": 15456552, + "step": 26645 + }, + { + "epoch": 3.9693178433124814, + "grad_norm": 17.461292266845703, + "learning_rate": 4.8537985799536226e-05, + "loss": 0.7345, + "num_input_tokens_seen": 15459496, + "step": 26650 + }, + { + "epoch": 3.9700625558534406, + "grad_norm": 7.1842570304870605, + "learning_rate": 4.853689067991525e-05, + "loss": 0.5005, + "num_input_tokens_seen": 15462568, + "step": 26655 + }, + { + "epoch": 3.9708072683944, + "grad_norm": 20.628328323364258, + "learning_rate": 4.853579516266243e-05, + "loss": 0.5296, + "num_input_tokens_seen": 15465672, + "step": 26660 + }, + { + "epoch": 3.971551980935359, + "grad_norm": 19.300006866455078, + "learning_rate": 4.853469924779627e-05, + "loss": 0.4329, + "num_input_tokens_seen": 15468488, + "step": 26665 + }, + { + "epoch": 3.972296693476318, + "grad_norm": 22.74579429626465, + "learning_rate": 4.853360293533529e-05, + "loss": 0.5419, + "num_input_tokens_seen": 15471272, + "step": 26670 + }, + { + "epoch": 3.9730414060172774, + "grad_norm": 6.100747108459473, + "learning_rate": 4.8532506225298004e-05, + "loss": 0.4329, + "num_input_tokens_seen": 15474024, + "step": 26675 + }, + { + "epoch": 3.9737861185582366, + "grad_norm": 4.247342109680176, + "learning_rate": 4.853140911770294e-05, + "loss": 0.2893, + "num_input_tokens_seen": 15476872, + "step": 26680 + }, + { + "epoch": 3.974530831099196, + "grad_norm": 5.184132099151611, + "learning_rate": 4.853031161256863e-05, + "loss": 0.5198, + "num_input_tokens_seen": 15479496, + "step": 26685 + }, + { + "epoch": 3.975275543640155, + "grad_norm": 1.018433928489685, + "learning_rate": 4.8529213709913626e-05, + "loss": 0.1479, + "num_input_tokens_seen": 15482664, + "step": 26690 + }, + { + "epoch": 3.976020256181114, + "grad_norm": 12.856128692626953, + "learning_rate": 4.852811540975647e-05, + "loss": 0.3187, + "num_input_tokens_seen": 15485512, + "step": 26695 + }, + { + "epoch": 3.9767649687220734, + "grad_norm": 14.538322448730469, + "learning_rate": 4.8527016712115725e-05, + "loss": 0.3502, + "num_input_tokens_seen": 15488552, + "step": 26700 + }, + { + "epoch": 3.9775096812630326, + "grad_norm": 15.8223295211792, + "learning_rate": 4.8525917617009945e-05, + "loss": 0.4848, + "num_input_tokens_seen": 15491240, + "step": 26705 + }, + { + "epoch": 3.978254393803992, + "grad_norm": 6.416890621185303, + "learning_rate": 4.8524818124457684e-05, + "loss": 0.4235, + "num_input_tokens_seen": 15494184, + "step": 26710 + }, + { + "epoch": 3.9789991063449506, + "grad_norm": 21.94428825378418, + "learning_rate": 4.852371823447753e-05, + "loss": 0.631, + "num_input_tokens_seen": 15497064, + "step": 26715 + }, + { + "epoch": 3.9797438188859102, + "grad_norm": 29.24120330810547, + "learning_rate": 4.852261794708808e-05, + "loss": 0.5484, + "num_input_tokens_seen": 15499880, + "step": 26720 + }, + { + "epoch": 3.980488531426869, + "grad_norm": 5.439812183380127, + "learning_rate": 4.8521517262307895e-05, + "loss": 0.2288, + "num_input_tokens_seen": 15502824, + "step": 26725 + }, + { + "epoch": 3.9812332439678286, + "grad_norm": 21.78496551513672, + "learning_rate": 4.8520416180155594e-05, + "loss": 0.3158, + "num_input_tokens_seen": 15505864, + "step": 26730 + }, + { + "epoch": 3.9819779565087874, + "grad_norm": 5.004377841949463, + "learning_rate": 4.8519314700649757e-05, + "loss": 0.2598, + "num_input_tokens_seen": 15508840, + "step": 26735 + }, + { + "epoch": 3.982722669049747, + "grad_norm": 20.812952041625977, + "learning_rate": 4.8518212823809e-05, + "loss": 0.7653, + "num_input_tokens_seen": 15511592, + "step": 26740 + }, + { + "epoch": 3.983467381590706, + "grad_norm": 18.357929229736328, + "learning_rate": 4.851711054965194e-05, + "loss": 0.6425, + "num_input_tokens_seen": 15514536, + "step": 26745 + }, + { + "epoch": 3.9842120941316654, + "grad_norm": 17.09514808654785, + "learning_rate": 4.851600787819721e-05, + "loss": 0.6707, + "num_input_tokens_seen": 15517416, + "step": 26750 + }, + { + "epoch": 3.984956806672624, + "grad_norm": 10.607458114624023, + "learning_rate": 4.851490480946342e-05, + "loss": 0.5006, + "num_input_tokens_seen": 15520776, + "step": 26755 + }, + { + "epoch": 3.9857015192135834, + "grad_norm": 24.125062942504883, + "learning_rate": 4.851380134346921e-05, + "loss": 0.267, + "num_input_tokens_seen": 15523464, + "step": 26760 + }, + { + "epoch": 3.9864462317545426, + "grad_norm": 21.0236873626709, + "learning_rate": 4.851269748023323e-05, + "loss": 0.2585, + "num_input_tokens_seen": 15526312, + "step": 26765 + }, + { + "epoch": 3.987190944295502, + "grad_norm": 12.422174453735352, + "learning_rate": 4.851159321977412e-05, + "loss": 0.474, + "num_input_tokens_seen": 15529352, + "step": 26770 + }, + { + "epoch": 3.987935656836461, + "grad_norm": 38.26484298706055, + "learning_rate": 4.851048856211054e-05, + "loss": 0.4909, + "num_input_tokens_seen": 15532104, + "step": 26775 + }, + { + "epoch": 3.98868036937742, + "grad_norm": 5.840790748596191, + "learning_rate": 4.850938350726115e-05, + "loss": 0.3699, + "num_input_tokens_seen": 15534952, + "step": 26780 + }, + { + "epoch": 3.9894250819183794, + "grad_norm": 5.626121520996094, + "learning_rate": 4.8508278055244625e-05, + "loss": 0.7603, + "num_input_tokens_seen": 15537672, + "step": 26785 + }, + { + "epoch": 3.9901697944593386, + "grad_norm": 12.285517692565918, + "learning_rate": 4.8507172206079625e-05, + "loss": 0.5288, + "num_input_tokens_seen": 15541032, + "step": 26790 + }, + { + "epoch": 3.990914507000298, + "grad_norm": 10.372867584228516, + "learning_rate": 4.8506065959784854e-05, + "loss": 0.4772, + "num_input_tokens_seen": 15544008, + "step": 26795 + }, + { + "epoch": 3.991659219541257, + "grad_norm": 12.99700927734375, + "learning_rate": 4.8504959316378974e-05, + "loss": 0.5621, + "num_input_tokens_seen": 15547048, + "step": 26800 + }, + { + "epoch": 3.9924039320822162, + "grad_norm": 9.913450241088867, + "learning_rate": 4.850385227588071e-05, + "loss": 0.4639, + "num_input_tokens_seen": 15549928, + "step": 26805 + }, + { + "epoch": 3.9931486446231754, + "grad_norm": 8.139845848083496, + "learning_rate": 4.8502744838308744e-05, + "loss": 0.3323, + "num_input_tokens_seen": 15552808, + "step": 26810 + }, + { + "epoch": 3.9938933571641346, + "grad_norm": 14.891562461853027, + "learning_rate": 4.8501637003681786e-05, + "loss": 0.3153, + "num_input_tokens_seen": 15555560, + "step": 26815 + }, + { + "epoch": 3.994638069705094, + "grad_norm": 19.39789390563965, + "learning_rate": 4.850052877201857e-05, + "loss": 0.3888, + "num_input_tokens_seen": 15558312, + "step": 26820 + }, + { + "epoch": 3.995382782246053, + "grad_norm": 8.740579605102539, + "learning_rate": 4.8499420143337795e-05, + "loss": 0.5573, + "num_input_tokens_seen": 15561384, + "step": 26825 + }, + { + "epoch": 3.9961274947870122, + "grad_norm": 5.309429168701172, + "learning_rate": 4.84983111176582e-05, + "loss": 0.5368, + "num_input_tokens_seen": 15564296, + "step": 26830 + }, + { + "epoch": 3.9968722073279714, + "grad_norm": 14.711877822875977, + "learning_rate": 4.8497201694998526e-05, + "loss": 0.5567, + "num_input_tokens_seen": 15567240, + "step": 26835 + }, + { + "epoch": 3.9976169198689306, + "grad_norm": 5.8442254066467285, + "learning_rate": 4.849609187537751e-05, + "loss": 0.2883, + "num_input_tokens_seen": 15570184, + "step": 26840 + }, + { + "epoch": 3.99836163240989, + "grad_norm": 8.912090301513672, + "learning_rate": 4.8494981658813895e-05, + "loss": 0.6175, + "num_input_tokens_seen": 15573320, + "step": 26845 + }, + { + "epoch": 3.999106344950849, + "grad_norm": 9.485164642333984, + "learning_rate": 4.8493871045326455e-05, + "loss": 0.2469, + "num_input_tokens_seen": 15576104, + "step": 26850 + }, + { + "epoch": 3.9998510574918082, + "grad_norm": 21.908693313598633, + "learning_rate": 4.849276003493394e-05, + "loss": 0.5139, + "num_input_tokens_seen": 15578728, + "step": 26855 + }, + { + "epoch": 4.0, + "eval_loss": 0.7030687928199768, + "eval_runtime": 51.2449, + "eval_samples_per_second": 58.23, + "eval_steps_per_second": 14.558, + "num_input_tokens_seen": 15578848, + "step": 26856 + }, + { + "epoch": 4.000595770032767, + "grad_norm": 4.7098493576049805, + "learning_rate": 4.849164862765512e-05, + "loss": 0.2694, + "num_input_tokens_seen": 15581088, + "step": 26860 + }, + { + "epoch": 4.001340482573727, + "grad_norm": 16.698318481445312, + "learning_rate": 4.8490536823508767e-05, + "loss": 0.2156, + "num_input_tokens_seen": 15584096, + "step": 26865 + }, + { + "epoch": 4.002085195114685, + "grad_norm": 18.01613998413086, + "learning_rate": 4.848942462251367e-05, + "loss": 0.1679, + "num_input_tokens_seen": 15586848, + "step": 26870 + }, + { + "epoch": 4.002829907655645, + "grad_norm": 17.321083068847656, + "learning_rate": 4.848831202468862e-05, + "loss": 0.2021, + "num_input_tokens_seen": 15589696, + "step": 26875 + }, + { + "epoch": 4.003574620196604, + "grad_norm": 0.8553655743598938, + "learning_rate": 4.848719903005241e-05, + "loss": 0.1183, + "num_input_tokens_seen": 15592512, + "step": 26880 + }, + { + "epoch": 4.0043193327375635, + "grad_norm": 18.658159255981445, + "learning_rate": 4.848608563862385e-05, + "loss": 0.3157, + "num_input_tokens_seen": 15595296, + "step": 26885 + }, + { + "epoch": 4.005064045278522, + "grad_norm": 6.643109321594238, + "learning_rate": 4.848497185042173e-05, + "loss": 0.2801, + "num_input_tokens_seen": 15598176, + "step": 26890 + }, + { + "epoch": 4.005808757819482, + "grad_norm": 1.366552710533142, + "learning_rate": 4.84838576654649e-05, + "loss": 0.2122, + "num_input_tokens_seen": 15601120, + "step": 26895 + }, + { + "epoch": 4.006553470360441, + "grad_norm": 13.974163055419922, + "learning_rate": 4.848274308377214e-05, + "loss": 0.1029, + "num_input_tokens_seen": 15604064, + "step": 26900 + }, + { + "epoch": 4.0072981829014, + "grad_norm": 30.352985382080078, + "learning_rate": 4.8481628105362317e-05, + "loss": 0.2151, + "num_input_tokens_seen": 15607264, + "step": 26905 + }, + { + "epoch": 4.008042895442359, + "grad_norm": 0.06216701865196228, + "learning_rate": 4.848051273025425e-05, + "loss": 0.1147, + "num_input_tokens_seen": 15609920, + "step": 26910 + }, + { + "epoch": 4.008787607983319, + "grad_norm": 7.9658026695251465, + "learning_rate": 4.8479396958466783e-05, + "loss": 0.3277, + "num_input_tokens_seen": 15612800, + "step": 26915 + }, + { + "epoch": 4.009532320524277, + "grad_norm": 53.27970886230469, + "learning_rate": 4.8478280790018765e-05, + "loss": 0.3063, + "num_input_tokens_seen": 15615616, + "step": 26920 + }, + { + "epoch": 4.010277033065237, + "grad_norm": 19.78545379638672, + "learning_rate": 4.847716422492906e-05, + "loss": 0.5216, + "num_input_tokens_seen": 15618528, + "step": 26925 + }, + { + "epoch": 4.011021745606196, + "grad_norm": 8.71235179901123, + "learning_rate": 4.847604726321652e-05, + "loss": 0.2352, + "num_input_tokens_seen": 15621408, + "step": 26930 + }, + { + "epoch": 4.0117664581471555, + "grad_norm": 16.97102165222168, + "learning_rate": 4.847492990490003e-05, + "loss": 0.3477, + "num_input_tokens_seen": 15624064, + "step": 26935 + }, + { + "epoch": 4.012511170688114, + "grad_norm": 3.2110838890075684, + "learning_rate": 4.847381214999845e-05, + "loss": 0.3807, + "num_input_tokens_seen": 15627008, + "step": 26940 + }, + { + "epoch": 4.013255883229074, + "grad_norm": 13.341882705688477, + "learning_rate": 4.847269399853068e-05, + "loss": 0.0867, + "num_input_tokens_seen": 15629888, + "step": 26945 + }, + { + "epoch": 4.014000595770033, + "grad_norm": 29.357349395751953, + "learning_rate": 4.84715754505156e-05, + "loss": 0.3077, + "num_input_tokens_seen": 15632736, + "step": 26950 + }, + { + "epoch": 4.014745308310992, + "grad_norm": 0.10789491981267929, + "learning_rate": 4.8470456505972105e-05, + "loss": 0.0641, + "num_input_tokens_seen": 15635680, + "step": 26955 + }, + { + "epoch": 4.015490020851951, + "grad_norm": 0.3638428747653961, + "learning_rate": 4.8469337164919105e-05, + "loss": 0.1219, + "num_input_tokens_seen": 15638656, + "step": 26960 + }, + { + "epoch": 4.016234733392911, + "grad_norm": 66.97663116455078, + "learning_rate": 4.84682174273755e-05, + "loss": 0.293, + "num_input_tokens_seen": 15641568, + "step": 26965 + }, + { + "epoch": 4.0169794459338695, + "grad_norm": 0.007793432101607323, + "learning_rate": 4.846709729336022e-05, + "loss": 0.2983, + "num_input_tokens_seen": 15644768, + "step": 26970 + }, + { + "epoch": 4.017724158474829, + "grad_norm": 24.634437561035156, + "learning_rate": 4.846597676289218e-05, + "loss": 0.3034, + "num_input_tokens_seen": 15647904, + "step": 26975 + }, + { + "epoch": 4.018468871015788, + "grad_norm": 24.877416610717773, + "learning_rate": 4.846485583599031e-05, + "loss": 0.3101, + "num_input_tokens_seen": 15650688, + "step": 26980 + }, + { + "epoch": 4.0192135835567475, + "grad_norm": 7.892526149749756, + "learning_rate": 4.846373451267355e-05, + "loss": 0.2856, + "num_input_tokens_seen": 15653280, + "step": 26985 + }, + { + "epoch": 4.019958296097706, + "grad_norm": 0.03136107698082924, + "learning_rate": 4.846261279296085e-05, + "loss": 0.4486, + "num_input_tokens_seen": 15656640, + "step": 26990 + }, + { + "epoch": 4.020703008638666, + "grad_norm": 0.21841539442539215, + "learning_rate": 4.8461490676871146e-05, + "loss": 0.5341, + "num_input_tokens_seen": 15659296, + "step": 26995 + }, + { + "epoch": 4.021447721179625, + "grad_norm": 17.886075973510742, + "learning_rate": 4.84603681644234e-05, + "loss": 0.3158, + "num_input_tokens_seen": 15662080, + "step": 27000 + }, + { + "epoch": 4.022192433720583, + "grad_norm": 0.3499857187271118, + "learning_rate": 4.8459245255636585e-05, + "loss": 0.2947, + "num_input_tokens_seen": 15664704, + "step": 27005 + }, + { + "epoch": 4.022937146261543, + "grad_norm": 0.018281854689121246, + "learning_rate": 4.8458121950529654e-05, + "loss": 0.2666, + "num_input_tokens_seen": 15667392, + "step": 27010 + }, + { + "epoch": 4.023681858802502, + "grad_norm": 8.498516082763672, + "learning_rate": 4.845699824912161e-05, + "loss": 0.2002, + "num_input_tokens_seen": 15670272, + "step": 27015 + }, + { + "epoch": 4.0244265713434615, + "grad_norm": 0.7291115522384644, + "learning_rate": 4.845587415143141e-05, + "loss": 0.3085, + "num_input_tokens_seen": 15673248, + "step": 27020 + }, + { + "epoch": 4.02517128388442, + "grad_norm": 42.24843215942383, + "learning_rate": 4.845474965747806e-05, + "loss": 0.3924, + "num_input_tokens_seen": 15676032, + "step": 27025 + }, + { + "epoch": 4.02591599642538, + "grad_norm": 14.244879722595215, + "learning_rate": 4.8453624767280545e-05, + "loss": 0.3532, + "num_input_tokens_seen": 15679168, + "step": 27030 + }, + { + "epoch": 4.026660708966339, + "grad_norm": 19.58888053894043, + "learning_rate": 4.845249948085789e-05, + "loss": 0.1317, + "num_input_tokens_seen": 15681952, + "step": 27035 + }, + { + "epoch": 4.027405421507298, + "grad_norm": 6.90444803237915, + "learning_rate": 4.8451373798229085e-05, + "loss": 0.2599, + "num_input_tokens_seen": 15684928, + "step": 27040 + }, + { + "epoch": 4.028150134048257, + "grad_norm": 33.274662017822266, + "learning_rate": 4.845024771941316e-05, + "loss": 0.317, + "num_input_tokens_seen": 15687808, + "step": 27045 + }, + { + "epoch": 4.028894846589217, + "grad_norm": 0.12328913062810898, + "learning_rate": 4.844912124442912e-05, + "loss": 0.1846, + "num_input_tokens_seen": 15690816, + "step": 27050 + }, + { + "epoch": 4.0296395591301755, + "grad_norm": 6.310104846954346, + "learning_rate": 4.844799437329602e-05, + "loss": 0.4284, + "num_input_tokens_seen": 15693664, + "step": 27055 + }, + { + "epoch": 4.030384271671135, + "grad_norm": 0.625235915184021, + "learning_rate": 4.844686710603289e-05, + "loss": 0.3277, + "num_input_tokens_seen": 15696512, + "step": 27060 + }, + { + "epoch": 4.031128984212094, + "grad_norm": 14.040131568908691, + "learning_rate": 4.844573944265876e-05, + "loss": 0.4414, + "num_input_tokens_seen": 15699392, + "step": 27065 + }, + { + "epoch": 4.0318736967530535, + "grad_norm": 14.123116493225098, + "learning_rate": 4.8444611383192695e-05, + "loss": 0.2088, + "num_input_tokens_seen": 15702528, + "step": 27070 + }, + { + "epoch": 4.032618409294012, + "grad_norm": 1.2350318431854248, + "learning_rate": 4.844348292765375e-05, + "loss": 0.392, + "num_input_tokens_seen": 15705536, + "step": 27075 + }, + { + "epoch": 4.033363121834972, + "grad_norm": 25.492982864379883, + "learning_rate": 4.844235407606099e-05, + "loss": 0.2863, + "num_input_tokens_seen": 15708192, + "step": 27080 + }, + { + "epoch": 4.034107834375931, + "grad_norm": 6.014194965362549, + "learning_rate": 4.844122482843347e-05, + "loss": 0.0422, + "num_input_tokens_seen": 15711136, + "step": 27085 + }, + { + "epoch": 4.03485254691689, + "grad_norm": 23.787443161010742, + "learning_rate": 4.8440095184790304e-05, + "loss": 0.3798, + "num_input_tokens_seen": 15713920, + "step": 27090 + }, + { + "epoch": 4.035597259457849, + "grad_norm": 19.584115982055664, + "learning_rate": 4.843896514515054e-05, + "loss": 0.5502, + "num_input_tokens_seen": 15716576, + "step": 27095 + }, + { + "epoch": 4.036341971998809, + "grad_norm": 5.284081935882568, + "learning_rate": 4.843783470953328e-05, + "loss": 0.3671, + "num_input_tokens_seen": 15719392, + "step": 27100 + }, + { + "epoch": 4.0370866845397675, + "grad_norm": 11.621091842651367, + "learning_rate": 4.843670387795763e-05, + "loss": 0.4449, + "num_input_tokens_seen": 15722368, + "step": 27105 + }, + { + "epoch": 4.037831397080727, + "grad_norm": 21.021453857421875, + "learning_rate": 4.843557265044268e-05, + "loss": 0.2431, + "num_input_tokens_seen": 15725568, + "step": 27110 + }, + { + "epoch": 4.038576109621686, + "grad_norm": 19.3635196685791, + "learning_rate": 4.843444102700756e-05, + "loss": 0.3377, + "num_input_tokens_seen": 15728384, + "step": 27115 + }, + { + "epoch": 4.0393208221626455, + "grad_norm": 47.81679153442383, + "learning_rate": 4.843330900767137e-05, + "loss": 0.3272, + "num_input_tokens_seen": 15731104, + "step": 27120 + }, + { + "epoch": 4.040065534703604, + "grad_norm": 12.446748733520508, + "learning_rate": 4.843217659245324e-05, + "loss": 0.3477, + "num_input_tokens_seen": 15733856, + "step": 27125 + }, + { + "epoch": 4.040810247244564, + "grad_norm": 4.204812049865723, + "learning_rate": 4.843104378137231e-05, + "loss": 0.2667, + "num_input_tokens_seen": 15736704, + "step": 27130 + }, + { + "epoch": 4.041554959785523, + "grad_norm": 4.352664470672607, + "learning_rate": 4.84299105744477e-05, + "loss": 0.257, + "num_input_tokens_seen": 15739584, + "step": 27135 + }, + { + "epoch": 4.042299672326482, + "grad_norm": 4.822329521179199, + "learning_rate": 4.8428776971698566e-05, + "loss": 0.1032, + "num_input_tokens_seen": 15742464, + "step": 27140 + }, + { + "epoch": 4.043044384867441, + "grad_norm": 0.7639176845550537, + "learning_rate": 4.842764297314406e-05, + "loss": 0.1781, + "num_input_tokens_seen": 15745568, + "step": 27145 + }, + { + "epoch": 4.043789097408401, + "grad_norm": 9.70367431640625, + "learning_rate": 4.842650857880333e-05, + "loss": 0.434, + "num_input_tokens_seen": 15748512, + "step": 27150 + }, + { + "epoch": 4.0445338099493595, + "grad_norm": 2.7667033672332764, + "learning_rate": 4.842537378869556e-05, + "loss": 0.1973, + "num_input_tokens_seen": 15751328, + "step": 27155 + }, + { + "epoch": 4.045278522490319, + "grad_norm": 0.046423595398664474, + "learning_rate": 4.84242386028399e-05, + "loss": 0.2063, + "num_input_tokens_seen": 15754176, + "step": 27160 + }, + { + "epoch": 4.046023235031278, + "grad_norm": 3.6402294635772705, + "learning_rate": 4.8423103021255535e-05, + "loss": 0.391, + "num_input_tokens_seen": 15756992, + "step": 27165 + }, + { + "epoch": 4.046767947572237, + "grad_norm": 28.18526268005371, + "learning_rate": 4.842196704396165e-05, + "loss": 0.3556, + "num_input_tokens_seen": 15759680, + "step": 27170 + }, + { + "epoch": 4.047512660113196, + "grad_norm": 15.309419631958008, + "learning_rate": 4.842083067097744e-05, + "loss": 0.3121, + "num_input_tokens_seen": 15762848, + "step": 27175 + }, + { + "epoch": 4.048257372654155, + "grad_norm": 11.117042541503906, + "learning_rate": 4.84196939023221e-05, + "loss": 0.4152, + "num_input_tokens_seen": 15765888, + "step": 27180 + }, + { + "epoch": 4.049002085195115, + "grad_norm": 27.778825759887695, + "learning_rate": 4.841855673801483e-05, + "loss": 0.2782, + "num_input_tokens_seen": 15768896, + "step": 27185 + }, + { + "epoch": 4.0497467977360735, + "grad_norm": 19.715961456298828, + "learning_rate": 4.8417419178074854e-05, + "loss": 0.1813, + "num_input_tokens_seen": 15772032, + "step": 27190 + }, + { + "epoch": 4.050491510277033, + "grad_norm": 3.4521048069000244, + "learning_rate": 4.841628122252138e-05, + "loss": 0.5602, + "num_input_tokens_seen": 15774848, + "step": 27195 + }, + { + "epoch": 4.051236222817992, + "grad_norm": 6.392195701599121, + "learning_rate": 4.841514287137362e-05, + "loss": 0.2805, + "num_input_tokens_seen": 15777792, + "step": 27200 + }, + { + "epoch": 4.0519809353589515, + "grad_norm": 5.630928993225098, + "learning_rate": 4.841400412465083e-05, + "loss": 0.1663, + "num_input_tokens_seen": 15780736, + "step": 27205 + }, + { + "epoch": 4.05272564789991, + "grad_norm": 7.0007853507995605, + "learning_rate": 4.8412864982372244e-05, + "loss": 0.2368, + "num_input_tokens_seen": 15783488, + "step": 27210 + }, + { + "epoch": 4.05347036044087, + "grad_norm": 12.07503604888916, + "learning_rate": 4.841172544455709e-05, + "loss": 0.2548, + "num_input_tokens_seen": 15786432, + "step": 27215 + }, + { + "epoch": 4.054215072981829, + "grad_norm": 15.572641372680664, + "learning_rate": 4.841058551122463e-05, + "loss": 0.3149, + "num_input_tokens_seen": 15789184, + "step": 27220 + }, + { + "epoch": 4.054959785522788, + "grad_norm": 11.79189395904541, + "learning_rate": 4.840944518239412e-05, + "loss": 0.4491, + "num_input_tokens_seen": 15792128, + "step": 27225 + }, + { + "epoch": 4.055704498063747, + "grad_norm": 10.097400665283203, + "learning_rate": 4.840830445808483e-05, + "loss": 0.1032, + "num_input_tokens_seen": 15794944, + "step": 27230 + }, + { + "epoch": 4.056449210604707, + "grad_norm": 4.599643707275391, + "learning_rate": 4.840716333831602e-05, + "loss": 0.1291, + "num_input_tokens_seen": 15797408, + "step": 27235 + }, + { + "epoch": 4.0571939231456655, + "grad_norm": 30.633596420288086, + "learning_rate": 4.8406021823106985e-05, + "loss": 0.3318, + "num_input_tokens_seen": 15800544, + "step": 27240 + }, + { + "epoch": 4.057938635686625, + "grad_norm": 10.295047760009766, + "learning_rate": 4.8404879912477e-05, + "loss": 0.1332, + "num_input_tokens_seen": 15803520, + "step": 27245 + }, + { + "epoch": 4.058683348227584, + "grad_norm": 3.1456780433654785, + "learning_rate": 4.8403737606445355e-05, + "loss": 0.1403, + "num_input_tokens_seen": 15806592, + "step": 27250 + }, + { + "epoch": 4.059428060768544, + "grad_norm": 21.413076400756836, + "learning_rate": 4.8402594905031346e-05, + "loss": 0.1557, + "num_input_tokens_seen": 15809408, + "step": 27255 + }, + { + "epoch": 4.060172773309502, + "grad_norm": 34.854522705078125, + "learning_rate": 4.840145180825428e-05, + "loss": 0.4345, + "num_input_tokens_seen": 15812320, + "step": 27260 + }, + { + "epoch": 4.060917485850462, + "grad_norm": 12.199585914611816, + "learning_rate": 4.840030831613347e-05, + "loss": 0.1369, + "num_input_tokens_seen": 15815136, + "step": 27265 + }, + { + "epoch": 4.061662198391421, + "grad_norm": 2.2529194355010986, + "learning_rate": 4.8399164428688244e-05, + "loss": 0.4754, + "num_input_tokens_seen": 15818080, + "step": 27270 + }, + { + "epoch": 4.06240691093238, + "grad_norm": 36.69416427612305, + "learning_rate": 4.83980201459379e-05, + "loss": 0.11, + "num_input_tokens_seen": 15820864, + "step": 27275 + }, + { + "epoch": 4.063151623473339, + "grad_norm": 33.02505111694336, + "learning_rate": 4.83968754679018e-05, + "loss": 0.2938, + "num_input_tokens_seen": 15823616, + "step": 27280 + }, + { + "epoch": 4.063896336014299, + "grad_norm": 17.222274780273438, + "learning_rate": 4.839573039459927e-05, + "loss": 0.405, + "num_input_tokens_seen": 15826496, + "step": 27285 + }, + { + "epoch": 4.0646410485552575, + "grad_norm": 0.053471215069293976, + "learning_rate": 4.8394584926049644e-05, + "loss": 0.3299, + "num_input_tokens_seen": 15829344, + "step": 27290 + }, + { + "epoch": 4.065385761096217, + "grad_norm": 55.3663444519043, + "learning_rate": 4.839343906227229e-05, + "loss": 0.1441, + "num_input_tokens_seen": 15832448, + "step": 27295 + }, + { + "epoch": 4.066130473637176, + "grad_norm": 17.483356475830078, + "learning_rate": 4.8392292803286554e-05, + "loss": 0.1992, + "num_input_tokens_seen": 15835488, + "step": 27300 + }, + { + "epoch": 4.066875186178136, + "grad_norm": 19.193506240844727, + "learning_rate": 4.839114614911181e-05, + "loss": 0.3456, + "num_input_tokens_seen": 15838432, + "step": 27305 + }, + { + "epoch": 4.067619898719094, + "grad_norm": 2.017730951309204, + "learning_rate": 4.838999909976742e-05, + "loss": 0.2432, + "num_input_tokens_seen": 15841504, + "step": 27310 + }, + { + "epoch": 4.068364611260054, + "grad_norm": 7.076565742492676, + "learning_rate": 4.838885165527277e-05, + "loss": 0.1638, + "num_input_tokens_seen": 15844256, + "step": 27315 + }, + { + "epoch": 4.069109323801013, + "grad_norm": 0.09930155426263809, + "learning_rate": 4.8387703815647245e-05, + "loss": 0.2077, + "num_input_tokens_seen": 15847488, + "step": 27320 + }, + { + "epoch": 4.069854036341972, + "grad_norm": 18.006675720214844, + "learning_rate": 4.838655558091024e-05, + "loss": 0.4233, + "num_input_tokens_seen": 15850240, + "step": 27325 + }, + { + "epoch": 4.070598748882931, + "grad_norm": 26.346887588500977, + "learning_rate": 4.8385406951081135e-05, + "loss": 0.3078, + "num_input_tokens_seen": 15853504, + "step": 27330 + }, + { + "epoch": 4.071343461423891, + "grad_norm": 1.2083667516708374, + "learning_rate": 4.838425792617935e-05, + "loss": 0.2328, + "num_input_tokens_seen": 15856512, + "step": 27335 + }, + { + "epoch": 4.07208817396485, + "grad_norm": 16.84954833984375, + "learning_rate": 4.8383108506224304e-05, + "loss": 0.2328, + "num_input_tokens_seen": 15859232, + "step": 27340 + }, + { + "epoch": 4.072832886505808, + "grad_norm": 28.691280364990234, + "learning_rate": 4.8381958691235396e-05, + "loss": 0.427, + "num_input_tokens_seen": 15862336, + "step": 27345 + }, + { + "epoch": 4.073577599046768, + "grad_norm": 0.429085373878479, + "learning_rate": 4.838080848123206e-05, + "loss": 0.0369, + "num_input_tokens_seen": 15865280, + "step": 27350 + }, + { + "epoch": 4.074322311587727, + "grad_norm": 1.8387038707733154, + "learning_rate": 4.837965787623373e-05, + "loss": 0.334, + "num_input_tokens_seen": 15868192, + "step": 27355 + }, + { + "epoch": 4.075067024128686, + "grad_norm": 0.42709043622016907, + "learning_rate": 4.837850687625985e-05, + "loss": 0.1346, + "num_input_tokens_seen": 15871168, + "step": 27360 + }, + { + "epoch": 4.075811736669645, + "grad_norm": 36.081295013427734, + "learning_rate": 4.8377355481329846e-05, + "loss": 0.2507, + "num_input_tokens_seen": 15874080, + "step": 27365 + }, + { + "epoch": 4.076556449210605, + "grad_norm": 7.133817195892334, + "learning_rate": 4.8376203691463184e-05, + "loss": 0.2133, + "num_input_tokens_seen": 15876896, + "step": 27370 + }, + { + "epoch": 4.0773011617515635, + "grad_norm": 24.313020706176758, + "learning_rate": 4.837505150667932e-05, + "loss": 0.948, + "num_input_tokens_seen": 15879776, + "step": 27375 + }, + { + "epoch": 4.078045874292523, + "grad_norm": 22.487070083618164, + "learning_rate": 4.837389892699772e-05, + "loss": 0.4428, + "num_input_tokens_seen": 15882336, + "step": 27380 + }, + { + "epoch": 4.078790586833482, + "grad_norm": 9.228412628173828, + "learning_rate": 4.837274595243785e-05, + "loss": 0.2238, + "num_input_tokens_seen": 15885120, + "step": 27385 + }, + { + "epoch": 4.079535299374442, + "grad_norm": 12.200891494750977, + "learning_rate": 4.8371592583019196e-05, + "loss": 0.1886, + "num_input_tokens_seen": 15887840, + "step": 27390 + }, + { + "epoch": 4.0802800119154, + "grad_norm": 8.231766700744629, + "learning_rate": 4.8370438818761235e-05, + "loss": 0.3296, + "num_input_tokens_seen": 15890912, + "step": 27395 + }, + { + "epoch": 4.08102472445636, + "grad_norm": 18.195165634155273, + "learning_rate": 4.836928465968347e-05, + "loss": 0.2751, + "num_input_tokens_seen": 15893760, + "step": 27400 + }, + { + "epoch": 4.081769436997319, + "grad_norm": 2.1641786098480225, + "learning_rate": 4.836813010580538e-05, + "loss": 0.4012, + "num_input_tokens_seen": 15896352, + "step": 27405 + }, + { + "epoch": 4.082514149538278, + "grad_norm": 39.920413970947266, + "learning_rate": 4.836697515714649e-05, + "loss": 0.2418, + "num_input_tokens_seen": 15899360, + "step": 27410 + }, + { + "epoch": 4.083258862079237, + "grad_norm": 17.46534538269043, + "learning_rate": 4.8365819813726306e-05, + "loss": 0.1919, + "num_input_tokens_seen": 15902048, + "step": 27415 + }, + { + "epoch": 4.084003574620197, + "grad_norm": 9.190884590148926, + "learning_rate": 4.8364664075564334e-05, + "loss": 0.0171, + "num_input_tokens_seen": 15904928, + "step": 27420 + }, + { + "epoch": 4.084748287161156, + "grad_norm": 25.925884246826172, + "learning_rate": 4.836350794268012e-05, + "loss": 0.1777, + "num_input_tokens_seen": 15907872, + "step": 27425 + }, + { + "epoch": 4.085492999702115, + "grad_norm": 16.864013671875, + "learning_rate": 4.836235141509318e-05, + "loss": 0.4068, + "num_input_tokens_seen": 15910816, + "step": 27430 + }, + { + "epoch": 4.086237712243074, + "grad_norm": 15.773993492126465, + "learning_rate": 4.836119449282306e-05, + "loss": 0.1839, + "num_input_tokens_seen": 15913664, + "step": 27435 + }, + { + "epoch": 4.086982424784034, + "grad_norm": 1.7528049945831299, + "learning_rate": 4.8360037175889304e-05, + "loss": 0.229, + "num_input_tokens_seen": 15916512, + "step": 27440 + }, + { + "epoch": 4.087727137324992, + "grad_norm": 1.1369457244873047, + "learning_rate": 4.8358879464311455e-05, + "loss": 0.4422, + "num_input_tokens_seen": 15919552, + "step": 27445 + }, + { + "epoch": 4.088471849865952, + "grad_norm": 35.09120559692383, + "learning_rate": 4.835772135810909e-05, + "loss": 0.3398, + "num_input_tokens_seen": 15922592, + "step": 27450 + }, + { + "epoch": 4.089216562406911, + "grad_norm": 11.743972778320312, + "learning_rate": 4.8356562857301744e-05, + "loss": 0.3459, + "num_input_tokens_seen": 15925536, + "step": 27455 + }, + { + "epoch": 4.08996127494787, + "grad_norm": 12.166817665100098, + "learning_rate": 4.835540396190902e-05, + "loss": 0.4636, + "num_input_tokens_seen": 15928416, + "step": 27460 + }, + { + "epoch": 4.090705987488829, + "grad_norm": 35.00386428833008, + "learning_rate": 4.835424467195049e-05, + "loss": 0.4785, + "num_input_tokens_seen": 15931360, + "step": 27465 + }, + { + "epoch": 4.091450700029789, + "grad_norm": 8.450550079345703, + "learning_rate": 4.835308498744572e-05, + "loss": 0.418, + "num_input_tokens_seen": 15933984, + "step": 27470 + }, + { + "epoch": 4.092195412570748, + "grad_norm": 12.432250022888184, + "learning_rate": 4.8351924908414314e-05, + "loss": 0.0962, + "num_input_tokens_seen": 15936896, + "step": 27475 + }, + { + "epoch": 4.092940125111707, + "grad_norm": 6.962387561798096, + "learning_rate": 4.835076443487587e-05, + "loss": 0.3578, + "num_input_tokens_seen": 15939936, + "step": 27480 + }, + { + "epoch": 4.093684837652666, + "grad_norm": 16.462472915649414, + "learning_rate": 4.8349603566850003e-05, + "loss": 0.296, + "num_input_tokens_seen": 15942784, + "step": 27485 + }, + { + "epoch": 4.094429550193626, + "grad_norm": 10.331208229064941, + "learning_rate": 4.834844230435631e-05, + "loss": 0.2349, + "num_input_tokens_seen": 15945600, + "step": 27490 + }, + { + "epoch": 4.095174262734584, + "grad_norm": 0.015407692641019821, + "learning_rate": 4.8347280647414416e-05, + "loss": 0.1807, + "num_input_tokens_seen": 15948448, + "step": 27495 + }, + { + "epoch": 4.095918975275544, + "grad_norm": 32.41704559326172, + "learning_rate": 4.834611859604394e-05, + "loss": 0.4174, + "num_input_tokens_seen": 15951232, + "step": 27500 + }, + { + "epoch": 4.096663687816503, + "grad_norm": 2.1938555240631104, + "learning_rate": 4.8344956150264524e-05, + "loss": 0.1813, + "num_input_tokens_seen": 15954240, + "step": 27505 + }, + { + "epoch": 4.0974084003574625, + "grad_norm": 24.764019012451172, + "learning_rate": 4.83437933100958e-05, + "loss": 0.039, + "num_input_tokens_seen": 15956864, + "step": 27510 + }, + { + "epoch": 4.098153112898421, + "grad_norm": 8.638675689697266, + "learning_rate": 4.834263007555741e-05, + "loss": 0.1978, + "num_input_tokens_seen": 15959904, + "step": 27515 + }, + { + "epoch": 4.09889782543938, + "grad_norm": 4.988013744354248, + "learning_rate": 4.834146644666901e-05, + "loss": 0.4329, + "num_input_tokens_seen": 15962720, + "step": 27520 + }, + { + "epoch": 4.09964253798034, + "grad_norm": 12.770992279052734, + "learning_rate": 4.834030242345026e-05, + "loss": 0.1791, + "num_input_tokens_seen": 15965408, + "step": 27525 + }, + { + "epoch": 4.100387250521298, + "grad_norm": 4.351633071899414, + "learning_rate": 4.8339138005920825e-05, + "loss": 0.235, + "num_input_tokens_seen": 15968512, + "step": 27530 + }, + { + "epoch": 4.101131963062258, + "grad_norm": 19.69894790649414, + "learning_rate": 4.833797319410037e-05, + "loss": 0.2165, + "num_input_tokens_seen": 15971264, + "step": 27535 + }, + { + "epoch": 4.101876675603217, + "grad_norm": 20.27800750732422, + "learning_rate": 4.833680798800858e-05, + "loss": 0.3823, + "num_input_tokens_seen": 15974144, + "step": 27540 + }, + { + "epoch": 4.102621388144176, + "grad_norm": 15.961993217468262, + "learning_rate": 4.833564238766513e-05, + "loss": 0.4739, + "num_input_tokens_seen": 15977216, + "step": 27545 + }, + { + "epoch": 4.103366100685135, + "grad_norm": 0.2642837166786194, + "learning_rate": 4.8334476393089726e-05, + "loss": 0.2391, + "num_input_tokens_seen": 15980096, + "step": 27550 + }, + { + "epoch": 4.104110813226095, + "grad_norm": 3.7230772972106934, + "learning_rate": 4.8333310004302054e-05, + "loss": 0.1212, + "num_input_tokens_seen": 15982720, + "step": 27555 + }, + { + "epoch": 4.104855525767054, + "grad_norm": 0.04004412889480591, + "learning_rate": 4.833214322132183e-05, + "loss": 0.3666, + "num_input_tokens_seen": 15985344, + "step": 27560 + }, + { + "epoch": 4.105600238308013, + "grad_norm": 22.758804321289062, + "learning_rate": 4.8330976044168766e-05, + "loss": 0.4951, + "num_input_tokens_seen": 15988288, + "step": 27565 + }, + { + "epoch": 4.106344950848972, + "grad_norm": 5.759998321533203, + "learning_rate": 4.832980847286256e-05, + "loss": 0.0347, + "num_input_tokens_seen": 15991616, + "step": 27570 + }, + { + "epoch": 4.107089663389932, + "grad_norm": 0.47135117650032043, + "learning_rate": 4.832864050742296e-05, + "loss": 0.302, + "num_input_tokens_seen": 15994368, + "step": 27575 + }, + { + "epoch": 4.10783437593089, + "grad_norm": 32.361392974853516, + "learning_rate": 4.8327472147869684e-05, + "loss": 0.3016, + "num_input_tokens_seen": 15997184, + "step": 27580 + }, + { + "epoch": 4.10857908847185, + "grad_norm": 17.27287483215332, + "learning_rate": 4.8326303394222476e-05, + "loss": 0.1597, + "num_input_tokens_seen": 16000288, + "step": 27585 + }, + { + "epoch": 4.109323801012809, + "grad_norm": 0.7133654952049255, + "learning_rate": 4.832513424650108e-05, + "loss": 0.1207, + "num_input_tokens_seen": 16003040, + "step": 27590 + }, + { + "epoch": 4.1100685135537685, + "grad_norm": 0.44656112790107727, + "learning_rate": 4.8323964704725254e-05, + "loss": 0.1525, + "num_input_tokens_seen": 16005728, + "step": 27595 + }, + { + "epoch": 4.110813226094727, + "grad_norm": 8.561556816101074, + "learning_rate": 4.8322794768914745e-05, + "loss": 0.2082, + "num_input_tokens_seen": 16008768, + "step": 27600 + }, + { + "epoch": 4.111557938635687, + "grad_norm": 4.032522201538086, + "learning_rate": 4.832162443908932e-05, + "loss": 0.1259, + "num_input_tokens_seen": 16011680, + "step": 27605 + }, + { + "epoch": 4.112302651176646, + "grad_norm": 21.715042114257812, + "learning_rate": 4.832045371526876e-05, + "loss": 0.1542, + "num_input_tokens_seen": 16014368, + "step": 27610 + }, + { + "epoch": 4.113047363717605, + "grad_norm": 0.09694839268922806, + "learning_rate": 4.8319282597472823e-05, + "loss": 0.1403, + "num_input_tokens_seen": 16017056, + "step": 27615 + }, + { + "epoch": 4.113792076258564, + "grad_norm": 0.3768385946750641, + "learning_rate": 4.8318111085721324e-05, + "loss": 0.5772, + "num_input_tokens_seen": 16020256, + "step": 27620 + }, + { + "epoch": 4.114536788799524, + "grad_norm": 0.03471757471561432, + "learning_rate": 4.8316939180034025e-05, + "loss": 0.181, + "num_input_tokens_seen": 16023232, + "step": 27625 + }, + { + "epoch": 4.115281501340482, + "grad_norm": 46.428123474121094, + "learning_rate": 4.831576688043075e-05, + "loss": 0.5671, + "num_input_tokens_seen": 16025728, + "step": 27630 + }, + { + "epoch": 4.116026213881442, + "grad_norm": 31.08864402770996, + "learning_rate": 4.831459418693128e-05, + "loss": 0.4429, + "num_input_tokens_seen": 16028384, + "step": 27635 + }, + { + "epoch": 4.116770926422401, + "grad_norm": 14.723160743713379, + "learning_rate": 4.8313421099555436e-05, + "loss": 0.0531, + "num_input_tokens_seen": 16031296, + "step": 27640 + }, + { + "epoch": 4.1175156389633605, + "grad_norm": 0.8645750284194946, + "learning_rate": 4.831224761832304e-05, + "loss": 0.3773, + "num_input_tokens_seen": 16034048, + "step": 27645 + }, + { + "epoch": 4.118260351504319, + "grad_norm": 1.2342873811721802, + "learning_rate": 4.831107374325391e-05, + "loss": 0.628, + "num_input_tokens_seen": 16037280, + "step": 27650 + }, + { + "epoch": 4.119005064045279, + "grad_norm": 34.715736389160156, + "learning_rate": 4.8309899474367894e-05, + "loss": 0.6052, + "num_input_tokens_seen": 16040000, + "step": 27655 + }, + { + "epoch": 4.119749776586238, + "grad_norm": 16.856353759765625, + "learning_rate": 4.8308724811684805e-05, + "loss": 0.3022, + "num_input_tokens_seen": 16042720, + "step": 27660 + }, + { + "epoch": 4.120494489127197, + "grad_norm": 34.50474548339844, + "learning_rate": 4.830754975522451e-05, + "loss": 0.4069, + "num_input_tokens_seen": 16045888, + "step": 27665 + }, + { + "epoch": 4.121239201668156, + "grad_norm": 1.4051374197006226, + "learning_rate": 4.830637430500684e-05, + "loss": 0.6908, + "num_input_tokens_seen": 16048800, + "step": 27670 + }, + { + "epoch": 4.121983914209116, + "grad_norm": 1.2610232830047607, + "learning_rate": 4.830519846105167e-05, + "loss": 0.1253, + "num_input_tokens_seen": 16051872, + "step": 27675 + }, + { + "epoch": 4.1227286267500745, + "grad_norm": 2.811077356338501, + "learning_rate": 4.830402222337886e-05, + "loss": 0.1405, + "num_input_tokens_seen": 16054656, + "step": 27680 + }, + { + "epoch": 4.123473339291033, + "grad_norm": 7.4478230476379395, + "learning_rate": 4.830284559200828e-05, + "loss": 0.2046, + "num_input_tokens_seen": 16057472, + "step": 27685 + }, + { + "epoch": 4.124218051831993, + "grad_norm": 14.415043830871582, + "learning_rate": 4.83016685669598e-05, + "loss": 0.1637, + "num_input_tokens_seen": 16060512, + "step": 27690 + }, + { + "epoch": 4.124962764372952, + "grad_norm": 53.41264724731445, + "learning_rate": 4.8300491148253315e-05, + "loss": 0.2298, + "num_input_tokens_seen": 16063392, + "step": 27695 + }, + { + "epoch": 4.125707476913911, + "grad_norm": 0.31954795122146606, + "learning_rate": 4.829931333590872e-05, + "loss": 0.446, + "num_input_tokens_seen": 16066464, + "step": 27700 + }, + { + "epoch": 4.12645218945487, + "grad_norm": 10.926308631896973, + "learning_rate": 4.82981351299459e-05, + "loss": 0.2209, + "num_input_tokens_seen": 16069152, + "step": 27705 + }, + { + "epoch": 4.12719690199583, + "grad_norm": 12.352554321289062, + "learning_rate": 4.829695653038477e-05, + "loss": 0.2317, + "num_input_tokens_seen": 16071808, + "step": 27710 + }, + { + "epoch": 4.127941614536788, + "grad_norm": 11.972990036010742, + "learning_rate": 4.829577753724523e-05, + "loss": 0.3999, + "num_input_tokens_seen": 16074560, + "step": 27715 + }, + { + "epoch": 4.128686327077748, + "grad_norm": 15.959561347961426, + "learning_rate": 4.829459815054722e-05, + "loss": 0.0444, + "num_input_tokens_seen": 16077280, + "step": 27720 + }, + { + "epoch": 4.129431039618707, + "grad_norm": 1.028469443321228, + "learning_rate": 4.829341837031064e-05, + "loss": 0.202, + "num_input_tokens_seen": 16080320, + "step": 27725 + }, + { + "epoch": 4.1301757521596665, + "grad_norm": 8.664840698242188, + "learning_rate": 4.829223819655543e-05, + "loss": 0.1468, + "num_input_tokens_seen": 16083008, + "step": 27730 + }, + { + "epoch": 4.130920464700625, + "grad_norm": 30.4819278717041, + "learning_rate": 4.829105762930153e-05, + "loss": 0.3579, + "num_input_tokens_seen": 16085888, + "step": 27735 + }, + { + "epoch": 4.131665177241585, + "grad_norm": 1.2637711763381958, + "learning_rate": 4.8289876668568886e-05, + "loss": 0.0999, + "num_input_tokens_seen": 16088800, + "step": 27740 + }, + { + "epoch": 4.132409889782544, + "grad_norm": 74.02582550048828, + "learning_rate": 4.828869531437744e-05, + "loss": 0.2429, + "num_input_tokens_seen": 16091840, + "step": 27745 + }, + { + "epoch": 4.133154602323503, + "grad_norm": 32.580039978027344, + "learning_rate": 4.828751356674717e-05, + "loss": 0.3457, + "num_input_tokens_seen": 16094816, + "step": 27750 + }, + { + "epoch": 4.133899314864462, + "grad_norm": 31.32094955444336, + "learning_rate": 4.8286331425698014e-05, + "loss": 0.2534, + "num_input_tokens_seen": 16097472, + "step": 27755 + }, + { + "epoch": 4.134644027405422, + "grad_norm": 20.28478240966797, + "learning_rate": 4.828514889124995e-05, + "loss": 0.3121, + "num_input_tokens_seen": 16100576, + "step": 27760 + }, + { + "epoch": 4.1353887399463805, + "grad_norm": 17.288888931274414, + "learning_rate": 4.828396596342298e-05, + "loss": 0.4041, + "num_input_tokens_seen": 16103680, + "step": 27765 + }, + { + "epoch": 4.13613345248734, + "grad_norm": 23.858596801757812, + "learning_rate": 4.828278264223706e-05, + "loss": 0.6796, + "num_input_tokens_seen": 16106304, + "step": 27770 + }, + { + "epoch": 4.136878165028299, + "grad_norm": 24.841371536254883, + "learning_rate": 4.828159892771219e-05, + "loss": 0.1822, + "num_input_tokens_seen": 16109376, + "step": 27775 + }, + { + "epoch": 4.1376228775692585, + "grad_norm": 15.013252258300781, + "learning_rate": 4.828041481986837e-05, + "loss": 0.3025, + "num_input_tokens_seen": 16112160, + "step": 27780 + }, + { + "epoch": 4.138367590110217, + "grad_norm": 18.89687728881836, + "learning_rate": 4.82792303187256e-05, + "loss": 0.4909, + "num_input_tokens_seen": 16114880, + "step": 27785 + }, + { + "epoch": 4.139112302651177, + "grad_norm": 1.9619392156600952, + "learning_rate": 4.82780454243039e-05, + "loss": 0.1806, + "num_input_tokens_seen": 16117568, + "step": 27790 + }, + { + "epoch": 4.139857015192136, + "grad_norm": 0.42168059945106506, + "learning_rate": 4.827686013662327e-05, + "loss": 0.0915, + "num_input_tokens_seen": 16120608, + "step": 27795 + }, + { + "epoch": 4.140601727733095, + "grad_norm": 13.533555030822754, + "learning_rate": 4.827567445570376e-05, + "loss": 0.3847, + "num_input_tokens_seen": 16123328, + "step": 27800 + }, + { + "epoch": 4.141346440274054, + "grad_norm": 4.311286449432373, + "learning_rate": 4.827448838156537e-05, + "loss": 0.0832, + "num_input_tokens_seen": 16126464, + "step": 27805 + }, + { + "epoch": 4.142091152815014, + "grad_norm": 12.841445922851562, + "learning_rate": 4.827330191422817e-05, + "loss": 0.3247, + "num_input_tokens_seen": 16129312, + "step": 27810 + }, + { + "epoch": 4.1428358653559725, + "grad_norm": 13.627771377563477, + "learning_rate": 4.8272115053712185e-05, + "loss": 0.1424, + "num_input_tokens_seen": 16132224, + "step": 27815 + }, + { + "epoch": 4.143580577896932, + "grad_norm": 16.262836456298828, + "learning_rate": 4.8270927800037465e-05, + "loss": 0.249, + "num_input_tokens_seen": 16135168, + "step": 27820 + }, + { + "epoch": 4.144325290437891, + "grad_norm": 28.1720027923584, + "learning_rate": 4.826974015322407e-05, + "loss": 0.3542, + "num_input_tokens_seen": 16138688, + "step": 27825 + }, + { + "epoch": 4.1450700029788505, + "grad_norm": 42.45447540283203, + "learning_rate": 4.826855211329206e-05, + "loss": 0.6143, + "num_input_tokens_seen": 16141920, + "step": 27830 + }, + { + "epoch": 4.145814715519809, + "grad_norm": 7.676212787628174, + "learning_rate": 4.826736368026152e-05, + "loss": 0.1083, + "num_input_tokens_seen": 16144832, + "step": 27835 + }, + { + "epoch": 4.146559428060769, + "grad_norm": 22.087636947631836, + "learning_rate": 4.826617485415252e-05, + "loss": 0.3739, + "num_input_tokens_seen": 16147808, + "step": 27840 + }, + { + "epoch": 4.147304140601728, + "grad_norm": 20.77439308166504, + "learning_rate": 4.826498563498514e-05, + "loss": 0.4212, + "num_input_tokens_seen": 16150688, + "step": 27845 + }, + { + "epoch": 4.148048853142687, + "grad_norm": 25.85776710510254, + "learning_rate": 4.826379602277947e-05, + "loss": 0.5319, + "num_input_tokens_seen": 16153664, + "step": 27850 + }, + { + "epoch": 4.148793565683646, + "grad_norm": 12.572827339172363, + "learning_rate": 4.8262606017555616e-05, + "loss": 0.2141, + "num_input_tokens_seen": 16156672, + "step": 27855 + }, + { + "epoch": 4.149538278224606, + "grad_norm": 6.430176734924316, + "learning_rate": 4.826141561933367e-05, + "loss": 0.2085, + "num_input_tokens_seen": 16159584, + "step": 27860 + }, + { + "epoch": 4.1502829907655645, + "grad_norm": 9.181468963623047, + "learning_rate": 4.826022482813376e-05, + "loss": 0.1534, + "num_input_tokens_seen": 16162528, + "step": 27865 + }, + { + "epoch": 4.151027703306523, + "grad_norm": 2.106104850769043, + "learning_rate": 4.825903364397598e-05, + "loss": 0.0538, + "num_input_tokens_seen": 16165312, + "step": 27870 + }, + { + "epoch": 4.151772415847483, + "grad_norm": 10.1141357421875, + "learning_rate": 4.8257842066880474e-05, + "loss": 0.2291, + "num_input_tokens_seen": 16168288, + "step": 27875 + }, + { + "epoch": 4.152517128388442, + "grad_norm": 2.8290257453918457, + "learning_rate": 4.8256650096867364e-05, + "loss": 0.0973, + "num_input_tokens_seen": 16171200, + "step": 27880 + }, + { + "epoch": 4.153261840929401, + "grad_norm": 2.9087445735931396, + "learning_rate": 4.8255457733956774e-05, + "loss": 0.3833, + "num_input_tokens_seen": 16174432, + "step": 27885 + }, + { + "epoch": 4.15400655347036, + "grad_norm": 36.11223220825195, + "learning_rate": 4.825426497816888e-05, + "loss": 0.0727, + "num_input_tokens_seen": 16177088, + "step": 27890 + }, + { + "epoch": 4.15475126601132, + "grad_norm": 9.964189529418945, + "learning_rate": 4.82530718295238e-05, + "loss": 0.0905, + "num_input_tokens_seen": 16179808, + "step": 27895 + }, + { + "epoch": 4.1554959785522785, + "grad_norm": 0.09062611311674118, + "learning_rate": 4.825187828804171e-05, + "loss": 0.4423, + "num_input_tokens_seen": 16182688, + "step": 27900 + }, + { + "epoch": 4.156240691093238, + "grad_norm": 19.689010620117188, + "learning_rate": 4.825068435374277e-05, + "loss": 0.2424, + "num_input_tokens_seen": 16185696, + "step": 27905 + }, + { + "epoch": 4.156985403634197, + "grad_norm": 0.8129891753196716, + "learning_rate": 4.824949002664715e-05, + "loss": 0.2446, + "num_input_tokens_seen": 16188768, + "step": 27910 + }, + { + "epoch": 4.1577301161751565, + "grad_norm": 0.4026012718677521, + "learning_rate": 4.824829530677503e-05, + "loss": 0.1713, + "num_input_tokens_seen": 16191712, + "step": 27915 + }, + { + "epoch": 4.158474828716115, + "grad_norm": 19.675161361694336, + "learning_rate": 4.824710019414658e-05, + "loss": 0.433, + "num_input_tokens_seen": 16194912, + "step": 27920 + }, + { + "epoch": 4.159219541257075, + "grad_norm": 17.179386138916016, + "learning_rate": 4.8245904688781994e-05, + "loss": 0.2005, + "num_input_tokens_seen": 16197664, + "step": 27925 + }, + { + "epoch": 4.159964253798034, + "grad_norm": 28.16466522216797, + "learning_rate": 4.8244708790701486e-05, + "loss": 0.3964, + "num_input_tokens_seen": 16200800, + "step": 27930 + }, + { + "epoch": 4.160708966338993, + "grad_norm": 15.464975357055664, + "learning_rate": 4.824351249992525e-05, + "loss": 0.6122, + "num_input_tokens_seen": 16203936, + "step": 27935 + }, + { + "epoch": 4.161453678879952, + "grad_norm": 30.794910430908203, + "learning_rate": 4.824231581647348e-05, + "loss": 0.4344, + "num_input_tokens_seen": 16206912, + "step": 27940 + }, + { + "epoch": 4.162198391420912, + "grad_norm": 5.438680648803711, + "learning_rate": 4.824111874036642e-05, + "loss": 0.1685, + "num_input_tokens_seen": 16209760, + "step": 27945 + }, + { + "epoch": 4.1629431039618705, + "grad_norm": 0.02820441499352455, + "learning_rate": 4.823992127162428e-05, + "loss": 0.2848, + "num_input_tokens_seen": 16212544, + "step": 27950 + }, + { + "epoch": 4.16368781650283, + "grad_norm": 59.67649459838867, + "learning_rate": 4.8238723410267285e-05, + "loss": 0.211, + "num_input_tokens_seen": 16215584, + "step": 27955 + }, + { + "epoch": 4.164432529043789, + "grad_norm": 20.746240615844727, + "learning_rate": 4.823752515631568e-05, + "loss": 0.18, + "num_input_tokens_seen": 16218144, + "step": 27960 + }, + { + "epoch": 4.165177241584749, + "grad_norm": 24.606218338012695, + "learning_rate": 4.8236326509789695e-05, + "loss": 0.3259, + "num_input_tokens_seen": 16221312, + "step": 27965 + }, + { + "epoch": 4.165921954125707, + "grad_norm": 24.481416702270508, + "learning_rate": 4.8235127470709594e-05, + "loss": 0.2894, + "num_input_tokens_seen": 16224128, + "step": 27970 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.9532557725906372, + "learning_rate": 4.8233928039095635e-05, + "loss": 0.0772, + "num_input_tokens_seen": 16226976, + "step": 27975 + }, + { + "epoch": 4.167411379207626, + "grad_norm": 0.05874883756041527, + "learning_rate": 4.823272821496808e-05, + "loss": 0.169, + "num_input_tokens_seen": 16229760, + "step": 27980 + }, + { + "epoch": 4.168156091748585, + "grad_norm": 18.823823928833008, + "learning_rate": 4.823152799834718e-05, + "loss": 0.21, + "num_input_tokens_seen": 16232928, + "step": 27985 + }, + { + "epoch": 4.168900804289544, + "grad_norm": 0.9691752791404724, + "learning_rate": 4.823032738925324e-05, + "loss": 0.2609, + "num_input_tokens_seen": 16236096, + "step": 27990 + }, + { + "epoch": 4.169645516830504, + "grad_norm": 4.164416790008545, + "learning_rate": 4.8229126387706516e-05, + "loss": 0.4142, + "num_input_tokens_seen": 16238976, + "step": 27995 + }, + { + "epoch": 4.1703902293714625, + "grad_norm": 1.7294354438781738, + "learning_rate": 4.822792499372732e-05, + "loss": 0.2193, + "num_input_tokens_seen": 16242080, + "step": 28000 + }, + { + "epoch": 4.171134941912422, + "grad_norm": 7.833669662475586, + "learning_rate": 4.822672320733594e-05, + "loss": 0.3618, + "num_input_tokens_seen": 16244928, + "step": 28005 + }, + { + "epoch": 4.171879654453381, + "grad_norm": 7.102877616882324, + "learning_rate": 4.822552102855267e-05, + "loss": 0.3501, + "num_input_tokens_seen": 16247616, + "step": 28010 + }, + { + "epoch": 4.172624366994341, + "grad_norm": 12.540436744689941, + "learning_rate": 4.822431845739783e-05, + "loss": 0.4812, + "num_input_tokens_seen": 16250624, + "step": 28015 + }, + { + "epoch": 4.173369079535299, + "grad_norm": 24.38249397277832, + "learning_rate": 4.822311549389174e-05, + "loss": 0.3244, + "num_input_tokens_seen": 16253152, + "step": 28020 + }, + { + "epoch": 4.174113792076259, + "grad_norm": 1.0831356048583984, + "learning_rate": 4.8221912138054715e-05, + "loss": 0.2719, + "num_input_tokens_seen": 16256160, + "step": 28025 + }, + { + "epoch": 4.174858504617218, + "grad_norm": 3.978224992752075, + "learning_rate": 4.822070838990708e-05, + "loss": 0.2055, + "num_input_tokens_seen": 16259040, + "step": 28030 + }, + { + "epoch": 4.1756032171581765, + "grad_norm": 8.935843467712402, + "learning_rate": 4.8219504249469186e-05, + "loss": 0.1017, + "num_input_tokens_seen": 16261984, + "step": 28035 + }, + { + "epoch": 4.176347929699136, + "grad_norm": 0.14452765882015228, + "learning_rate": 4.821829971676136e-05, + "loss": 0.6299, + "num_input_tokens_seen": 16264928, + "step": 28040 + }, + { + "epoch": 4.177092642240095, + "grad_norm": 19.146625518798828, + "learning_rate": 4.8217094791803966e-05, + "loss": 0.1588, + "num_input_tokens_seen": 16267520, + "step": 28045 + }, + { + "epoch": 4.177837354781055, + "grad_norm": 11.966507911682129, + "learning_rate": 4.821588947461734e-05, + "loss": 0.4928, + "num_input_tokens_seen": 16270240, + "step": 28050 + }, + { + "epoch": 4.178582067322013, + "grad_norm": 14.453315734863281, + "learning_rate": 4.821468376522186e-05, + "loss": 0.3303, + "num_input_tokens_seen": 16273024, + "step": 28055 + }, + { + "epoch": 4.179326779862973, + "grad_norm": 3.0343289375305176, + "learning_rate": 4.82134776636379e-05, + "loss": 0.1168, + "num_input_tokens_seen": 16276160, + "step": 28060 + }, + { + "epoch": 4.180071492403932, + "grad_norm": 7.023776054382324, + "learning_rate": 4.821227116988583e-05, + "loss": 0.2737, + "num_input_tokens_seen": 16279008, + "step": 28065 + }, + { + "epoch": 4.180816204944891, + "grad_norm": 20.360822677612305, + "learning_rate": 4.8211064283986015e-05, + "loss": 0.4225, + "num_input_tokens_seen": 16281760, + "step": 28070 + }, + { + "epoch": 4.18156091748585, + "grad_norm": 1.189713716506958, + "learning_rate": 4.8209857005958866e-05, + "loss": 0.4851, + "num_input_tokens_seen": 16284736, + "step": 28075 + }, + { + "epoch": 4.18230563002681, + "grad_norm": 20.471158981323242, + "learning_rate": 4.820864933582478e-05, + "loss": 0.4969, + "num_input_tokens_seen": 16287712, + "step": 28080 + }, + { + "epoch": 4.1830503425677685, + "grad_norm": 0.03755849599838257, + "learning_rate": 4.8207441273604145e-05, + "loss": 0.3784, + "num_input_tokens_seen": 16290432, + "step": 28085 + }, + { + "epoch": 4.183795055108728, + "grad_norm": 1.5578479766845703, + "learning_rate": 4.820623281931738e-05, + "loss": 0.339, + "num_input_tokens_seen": 16293472, + "step": 28090 + }, + { + "epoch": 4.184539767649687, + "grad_norm": 1.9817050695419312, + "learning_rate": 4.8205023972984896e-05, + "loss": 0.519, + "num_input_tokens_seen": 16296480, + "step": 28095 + }, + { + "epoch": 4.185284480190647, + "grad_norm": 33.906402587890625, + "learning_rate": 4.820381473462712e-05, + "loss": 0.3529, + "num_input_tokens_seen": 16299232, + "step": 28100 + }, + { + "epoch": 4.186029192731605, + "grad_norm": 25.262014389038086, + "learning_rate": 4.820260510426447e-05, + "loss": 0.5663, + "num_input_tokens_seen": 16302432, + "step": 28105 + }, + { + "epoch": 4.186773905272565, + "grad_norm": 10.006753921508789, + "learning_rate": 4.820139508191739e-05, + "loss": 0.2927, + "num_input_tokens_seen": 16305408, + "step": 28110 + }, + { + "epoch": 4.187518617813524, + "grad_norm": 9.165633201599121, + "learning_rate": 4.820018466760633e-05, + "loss": 0.3107, + "num_input_tokens_seen": 16308384, + "step": 28115 + }, + { + "epoch": 4.188263330354483, + "grad_norm": 12.49060344696045, + "learning_rate": 4.819897386135172e-05, + "loss": 0.2236, + "num_input_tokens_seen": 16311168, + "step": 28120 + }, + { + "epoch": 4.189008042895442, + "grad_norm": 18.357284545898438, + "learning_rate": 4.819776266317403e-05, + "loss": 0.268, + "num_input_tokens_seen": 16314144, + "step": 28125 + }, + { + "epoch": 4.189752755436402, + "grad_norm": 13.411520004272461, + "learning_rate": 4.819655107309371e-05, + "loss": 0.3898, + "num_input_tokens_seen": 16316864, + "step": 28130 + }, + { + "epoch": 4.190497467977361, + "grad_norm": 18.12580108642578, + "learning_rate": 4.819533909113124e-05, + "loss": 0.0691, + "num_input_tokens_seen": 16319424, + "step": 28135 + }, + { + "epoch": 4.19124218051832, + "grad_norm": 8.228819847106934, + "learning_rate": 4.819412671730709e-05, + "loss": 0.4574, + "num_input_tokens_seen": 16322080, + "step": 28140 + }, + { + "epoch": 4.191986893059279, + "grad_norm": 10.223456382751465, + "learning_rate": 4.8192913951641746e-05, + "loss": 0.2676, + "num_input_tokens_seen": 16324960, + "step": 28145 + }, + { + "epoch": 4.192731605600239, + "grad_norm": 12.878521919250488, + "learning_rate": 4.819170079415569e-05, + "loss": 0.3331, + "num_input_tokens_seen": 16327712, + "step": 28150 + }, + { + "epoch": 4.193476318141197, + "grad_norm": 16.70598793029785, + "learning_rate": 4.819048724486942e-05, + "loss": 0.2692, + "num_input_tokens_seen": 16330432, + "step": 28155 + }, + { + "epoch": 4.194221030682157, + "grad_norm": 1.2943227291107178, + "learning_rate": 4.818927330380344e-05, + "loss": 0.4385, + "num_input_tokens_seen": 16333376, + "step": 28160 + }, + { + "epoch": 4.194965743223116, + "grad_norm": 2.7114689350128174, + "learning_rate": 4.8188058970978254e-05, + "loss": 0.593, + "num_input_tokens_seen": 16336256, + "step": 28165 + }, + { + "epoch": 4.195710455764075, + "grad_norm": 0.38885289430618286, + "learning_rate": 4.818684424641438e-05, + "loss": 0.571, + "num_input_tokens_seen": 16338976, + "step": 28170 + }, + { + "epoch": 4.196455168305034, + "grad_norm": 8.37083625793457, + "learning_rate": 4.8185629130132336e-05, + "loss": 0.3066, + "num_input_tokens_seen": 16342048, + "step": 28175 + }, + { + "epoch": 4.197199880845994, + "grad_norm": 15.937474250793457, + "learning_rate": 4.818441362215266e-05, + "loss": 0.1755, + "num_input_tokens_seen": 16344800, + "step": 28180 + }, + { + "epoch": 4.197944593386953, + "grad_norm": 10.414351463317871, + "learning_rate": 4.8183197722495877e-05, + "loss": 0.1733, + "num_input_tokens_seen": 16347616, + "step": 28185 + }, + { + "epoch": 4.198689305927912, + "grad_norm": 3.5356364250183105, + "learning_rate": 4.8181981431182523e-05, + "loss": 0.3774, + "num_input_tokens_seen": 16350624, + "step": 28190 + }, + { + "epoch": 4.199434018468871, + "grad_norm": 19.384349822998047, + "learning_rate": 4.818076474823316e-05, + "loss": 0.3702, + "num_input_tokens_seen": 16353600, + "step": 28195 + }, + { + "epoch": 4.200178731009831, + "grad_norm": 3.121189594268799, + "learning_rate": 4.817954767366833e-05, + "loss": 0.1607, + "num_input_tokens_seen": 16356672, + "step": 28200 + }, + { + "epoch": 4.200923443550789, + "grad_norm": 7.412895202636719, + "learning_rate": 4.817833020750861e-05, + "loss": 0.3462, + "num_input_tokens_seen": 16359424, + "step": 28205 + }, + { + "epoch": 4.201668156091749, + "grad_norm": 8.426218032836914, + "learning_rate": 4.8177112349774554e-05, + "loss": 0.0897, + "num_input_tokens_seen": 16362368, + "step": 28210 + }, + { + "epoch": 4.202412868632708, + "grad_norm": 6.303321838378906, + "learning_rate": 4.817589410048674e-05, + "loss": 0.1794, + "num_input_tokens_seen": 16365120, + "step": 28215 + }, + { + "epoch": 4.203157581173667, + "grad_norm": 24.21786880493164, + "learning_rate": 4.817467545966575e-05, + "loss": 0.2652, + "num_input_tokens_seen": 16368160, + "step": 28220 + }, + { + "epoch": 4.203902293714626, + "grad_norm": 8.818909645080566, + "learning_rate": 4.8173456427332176e-05, + "loss": 0.4586, + "num_input_tokens_seen": 16371328, + "step": 28225 + }, + { + "epoch": 4.204647006255585, + "grad_norm": 9.384291648864746, + "learning_rate": 4.817223700350661e-05, + "loss": 0.2714, + "num_input_tokens_seen": 16374400, + "step": 28230 + }, + { + "epoch": 4.205391718796545, + "grad_norm": 17.586902618408203, + "learning_rate": 4.817101718820965e-05, + "loss": 0.214, + "num_input_tokens_seen": 16377056, + "step": 28235 + }, + { + "epoch": 4.206136431337503, + "grad_norm": 15.286097526550293, + "learning_rate": 4.8169796981461904e-05, + "loss": 0.1697, + "num_input_tokens_seen": 16379808, + "step": 28240 + }, + { + "epoch": 4.206881143878463, + "grad_norm": 9.41003704071045, + "learning_rate": 4.816857638328398e-05, + "loss": 0.4755, + "num_input_tokens_seen": 16382784, + "step": 28245 + }, + { + "epoch": 4.207625856419422, + "grad_norm": 5.831031799316406, + "learning_rate": 4.816735539369651e-05, + "loss": 0.3421, + "num_input_tokens_seen": 16385888, + "step": 28250 + }, + { + "epoch": 4.208370568960381, + "grad_norm": 8.847854614257812, + "learning_rate": 4.816613401272011e-05, + "loss": 0.2713, + "num_input_tokens_seen": 16388608, + "step": 28255 + }, + { + "epoch": 4.20911528150134, + "grad_norm": 32.252906799316406, + "learning_rate": 4.816491224037543e-05, + "loss": 0.1156, + "num_input_tokens_seen": 16391712, + "step": 28260 + }, + { + "epoch": 4.2098599940423, + "grad_norm": 6.243994235992432, + "learning_rate": 4.81636900766831e-05, + "loss": 0.2496, + "num_input_tokens_seen": 16394560, + "step": 28265 + }, + { + "epoch": 4.210604706583259, + "grad_norm": 2.182673931121826, + "learning_rate": 4.816246752166377e-05, + "loss": 0.1814, + "num_input_tokens_seen": 16397216, + "step": 28270 + }, + { + "epoch": 4.211349419124218, + "grad_norm": 9.794950485229492, + "learning_rate": 4.8161244575338086e-05, + "loss": 0.1462, + "num_input_tokens_seen": 16400288, + "step": 28275 + }, + { + "epoch": 4.212094131665177, + "grad_norm": 11.068338394165039, + "learning_rate": 4.816002123772672e-05, + "loss": 0.4929, + "num_input_tokens_seen": 16403360, + "step": 28280 + }, + { + "epoch": 4.212838844206137, + "grad_norm": 6.424300670623779, + "learning_rate": 4.815879750885033e-05, + "loss": 0.0446, + "num_input_tokens_seen": 16406080, + "step": 28285 + }, + { + "epoch": 4.213583556747095, + "grad_norm": 0.35603469610214233, + "learning_rate": 4.81575733887296e-05, + "loss": 0.1819, + "num_input_tokens_seen": 16409024, + "step": 28290 + }, + { + "epoch": 4.214328269288055, + "grad_norm": 21.813899993896484, + "learning_rate": 4.81563488773852e-05, + "loss": 0.3697, + "num_input_tokens_seen": 16411968, + "step": 28295 + }, + { + "epoch": 4.215072981829014, + "grad_norm": 44.003204345703125, + "learning_rate": 4.8155123974837824e-05, + "loss": 0.2579, + "num_input_tokens_seen": 16414784, + "step": 28300 + }, + { + "epoch": 4.2158176943699734, + "grad_norm": 13.14913558959961, + "learning_rate": 4.815389868110816e-05, + "loss": 0.2135, + "num_input_tokens_seen": 16417536, + "step": 28305 + }, + { + "epoch": 4.216562406910932, + "grad_norm": 27.66847038269043, + "learning_rate": 4.815267299621691e-05, + "loss": 0.3199, + "num_input_tokens_seen": 16420224, + "step": 28310 + }, + { + "epoch": 4.217307119451892, + "grad_norm": 30.526399612426758, + "learning_rate": 4.815144692018477e-05, + "loss": 0.1961, + "num_input_tokens_seen": 16422944, + "step": 28315 + }, + { + "epoch": 4.218051831992851, + "grad_norm": 0.23518460988998413, + "learning_rate": 4.815022045303248e-05, + "loss": 0.3925, + "num_input_tokens_seen": 16425472, + "step": 28320 + }, + { + "epoch": 4.21879654453381, + "grad_norm": 0.29969653487205505, + "learning_rate": 4.814899359478074e-05, + "loss": 0.5054, + "num_input_tokens_seen": 16428192, + "step": 28325 + }, + { + "epoch": 4.219541257074769, + "grad_norm": 1.0050947666168213, + "learning_rate": 4.814776634545028e-05, + "loss": 0.0537, + "num_input_tokens_seen": 16431200, + "step": 28330 + }, + { + "epoch": 4.220285969615729, + "grad_norm": 11.353423118591309, + "learning_rate": 4.814653870506183e-05, + "loss": 0.3782, + "num_input_tokens_seen": 16434176, + "step": 28335 + }, + { + "epoch": 4.221030682156687, + "grad_norm": 36.68923568725586, + "learning_rate": 4.8145310673636143e-05, + "loss": 0.4524, + "num_input_tokens_seen": 16437120, + "step": 28340 + }, + { + "epoch": 4.221775394697647, + "grad_norm": 20.115623474121094, + "learning_rate": 4.814408225119395e-05, + "loss": 0.3085, + "num_input_tokens_seen": 16440000, + "step": 28345 + }, + { + "epoch": 4.222520107238606, + "grad_norm": 2.996169090270996, + "learning_rate": 4.8142853437756006e-05, + "loss": 0.2919, + "num_input_tokens_seen": 16442880, + "step": 28350 + }, + { + "epoch": 4.2232648197795655, + "grad_norm": 11.461074829101562, + "learning_rate": 4.814162423334309e-05, + "loss": 0.1885, + "num_input_tokens_seen": 16445888, + "step": 28355 + }, + { + "epoch": 4.224009532320524, + "grad_norm": 14.660873413085938, + "learning_rate": 4.814039463797594e-05, + "loss": 0.4366, + "num_input_tokens_seen": 16448448, + "step": 28360 + }, + { + "epoch": 4.224754244861484, + "grad_norm": 0.14317813515663147, + "learning_rate": 4.813916465167534e-05, + "loss": 0.2637, + "num_input_tokens_seen": 16451232, + "step": 28365 + }, + { + "epoch": 4.225498957402443, + "grad_norm": 13.185346603393555, + "learning_rate": 4.813793427446207e-05, + "loss": 0.5652, + "num_input_tokens_seen": 16454144, + "step": 28370 + }, + { + "epoch": 4.226243669943402, + "grad_norm": 22.87798500061035, + "learning_rate": 4.813670350635693e-05, + "loss": 0.283, + "num_input_tokens_seen": 16457152, + "step": 28375 + }, + { + "epoch": 4.226988382484361, + "grad_norm": 38.4735221862793, + "learning_rate": 4.8135472347380684e-05, + "loss": 0.4095, + "num_input_tokens_seen": 16460160, + "step": 28380 + }, + { + "epoch": 4.22773309502532, + "grad_norm": 6.787174701690674, + "learning_rate": 4.8134240797554155e-05, + "loss": 0.4414, + "num_input_tokens_seen": 16463392, + "step": 28385 + }, + { + "epoch": 4.2284778075662794, + "grad_norm": 4.032049179077148, + "learning_rate": 4.813300885689814e-05, + "loss": 0.2744, + "num_input_tokens_seen": 16466208, + "step": 28390 + }, + { + "epoch": 4.229222520107238, + "grad_norm": 18.639266967773438, + "learning_rate": 4.813177652543345e-05, + "loss": 0.2183, + "num_input_tokens_seen": 16468864, + "step": 28395 + }, + { + "epoch": 4.229967232648198, + "grad_norm": 9.335545539855957, + "learning_rate": 4.813054380318091e-05, + "loss": 0.3699, + "num_input_tokens_seen": 16472000, + "step": 28400 + }, + { + "epoch": 4.230711945189157, + "grad_norm": 4.458124160766602, + "learning_rate": 4.8129310690161335e-05, + "loss": 0.3065, + "num_input_tokens_seen": 16474816, + "step": 28405 + }, + { + "epoch": 4.231456657730116, + "grad_norm": 11.203413963317871, + "learning_rate": 4.812807718639556e-05, + "loss": 0.1527, + "num_input_tokens_seen": 16477856, + "step": 28410 + }, + { + "epoch": 4.232201370271075, + "grad_norm": 16.28843116760254, + "learning_rate": 4.812684329190443e-05, + "loss": 0.3307, + "num_input_tokens_seen": 16480800, + "step": 28415 + }, + { + "epoch": 4.232946082812035, + "grad_norm": 0.3026573061943054, + "learning_rate": 4.8125609006708796e-05, + "loss": 0.2652, + "num_input_tokens_seen": 16484096, + "step": 28420 + }, + { + "epoch": 4.233690795352993, + "grad_norm": 16.62486457824707, + "learning_rate": 4.812437433082949e-05, + "loss": 0.1497, + "num_input_tokens_seen": 16486752, + "step": 28425 + }, + { + "epoch": 4.234435507893953, + "grad_norm": 0.6417481899261475, + "learning_rate": 4.812313926428739e-05, + "loss": 0.1103, + "num_input_tokens_seen": 16489696, + "step": 28430 + }, + { + "epoch": 4.235180220434912, + "grad_norm": 38.42431640625, + "learning_rate": 4.812190380710335e-05, + "loss": 0.4351, + "num_input_tokens_seen": 16492544, + "step": 28435 + }, + { + "epoch": 4.2359249329758715, + "grad_norm": 0.2384079396724701, + "learning_rate": 4.812066795929825e-05, + "loss": 0.2737, + "num_input_tokens_seen": 16495520, + "step": 28440 + }, + { + "epoch": 4.23666964551683, + "grad_norm": 36.27783203125, + "learning_rate": 4.811943172089296e-05, + "loss": 0.3432, + "num_input_tokens_seen": 16498464, + "step": 28445 + }, + { + "epoch": 4.23741435805779, + "grad_norm": 8.08753490447998, + "learning_rate": 4.811819509190837e-05, + "loss": 0.5183, + "num_input_tokens_seen": 16501216, + "step": 28450 + }, + { + "epoch": 4.238159070598749, + "grad_norm": 6.696557521820068, + "learning_rate": 4.811695807236537e-05, + "loss": 0.0936, + "num_input_tokens_seen": 16504384, + "step": 28455 + }, + { + "epoch": 4.238903783139708, + "grad_norm": 82.63771057128906, + "learning_rate": 4.8115720662284855e-05, + "loss": 0.1803, + "num_input_tokens_seen": 16507328, + "step": 28460 + }, + { + "epoch": 4.239648495680667, + "grad_norm": 32.39236068725586, + "learning_rate": 4.8114482861687734e-05, + "loss": 0.313, + "num_input_tokens_seen": 16510560, + "step": 28465 + }, + { + "epoch": 4.240393208221627, + "grad_norm": 4.255077838897705, + "learning_rate": 4.8113244670594926e-05, + "loss": 0.2943, + "num_input_tokens_seen": 16513312, + "step": 28470 + }, + { + "epoch": 4.2411379207625854, + "grad_norm": 45.50907897949219, + "learning_rate": 4.811200608902733e-05, + "loss": 0.38, + "num_input_tokens_seen": 16516096, + "step": 28475 + }, + { + "epoch": 4.241882633303545, + "grad_norm": 46.31314468383789, + "learning_rate": 4.811076711700588e-05, + "loss": 0.3035, + "num_input_tokens_seen": 16519168, + "step": 28480 + }, + { + "epoch": 4.242627345844504, + "grad_norm": 14.663875579833984, + "learning_rate": 4.810952775455152e-05, + "loss": 0.4205, + "num_input_tokens_seen": 16522240, + "step": 28485 + }, + { + "epoch": 4.2433720583854635, + "grad_norm": 17.93598747253418, + "learning_rate": 4.810828800168517e-05, + "loss": 0.4055, + "num_input_tokens_seen": 16525344, + "step": 28490 + }, + { + "epoch": 4.244116770926422, + "grad_norm": 5.566340446472168, + "learning_rate": 4.810704785842778e-05, + "loss": 0.3216, + "num_input_tokens_seen": 16528288, + "step": 28495 + }, + { + "epoch": 4.244861483467382, + "grad_norm": 17.421096801757812, + "learning_rate": 4.81058073248003e-05, + "loss": 0.3684, + "num_input_tokens_seen": 16531296, + "step": 28500 + }, + { + "epoch": 4.245606196008341, + "grad_norm": 14.983701705932617, + "learning_rate": 4.810456640082369e-05, + "loss": 0.3232, + "num_input_tokens_seen": 16534336, + "step": 28505 + }, + { + "epoch": 4.2463509085493, + "grad_norm": 14.728039741516113, + "learning_rate": 4.810332508651891e-05, + "loss": 0.3154, + "num_input_tokens_seen": 16537216, + "step": 28510 + }, + { + "epoch": 4.247095621090259, + "grad_norm": 12.048958778381348, + "learning_rate": 4.810208338190694e-05, + "loss": 0.3049, + "num_input_tokens_seen": 16540192, + "step": 28515 + }, + { + "epoch": 4.247840333631219, + "grad_norm": 22.299646377563477, + "learning_rate": 4.810084128700875e-05, + "loss": 0.2647, + "num_input_tokens_seen": 16543040, + "step": 28520 + }, + { + "epoch": 4.2485850461721775, + "grad_norm": 8.599422454833984, + "learning_rate": 4.809959880184532e-05, + "loss": 0.2378, + "num_input_tokens_seen": 16546176, + "step": 28525 + }, + { + "epoch": 4.249329758713137, + "grad_norm": 30.806203842163086, + "learning_rate": 4.8098355926437655e-05, + "loss": 0.2873, + "num_input_tokens_seen": 16549184, + "step": 28530 + }, + { + "epoch": 4.250074471254096, + "grad_norm": 11.783903121948242, + "learning_rate": 4.809711266080673e-05, + "loss": 0.1956, + "num_input_tokens_seen": 16551904, + "step": 28535 + }, + { + "epoch": 4.2508191837950555, + "grad_norm": 4.581516742706299, + "learning_rate": 4.809586900497357e-05, + "loss": 0.1472, + "num_input_tokens_seen": 16554688, + "step": 28540 + }, + { + "epoch": 4.251563896336014, + "grad_norm": 0.3142271041870117, + "learning_rate": 4.809462495895918e-05, + "loss": 0.3693, + "num_input_tokens_seen": 16557696, + "step": 28545 + }, + { + "epoch": 4.252308608876973, + "grad_norm": 37.4454231262207, + "learning_rate": 4.809338052278456e-05, + "loss": 0.4391, + "num_input_tokens_seen": 16560352, + "step": 28550 + }, + { + "epoch": 4.253053321417933, + "grad_norm": 19.97968101501465, + "learning_rate": 4.809213569647076e-05, + "loss": 0.2227, + "num_input_tokens_seen": 16562880, + "step": 28555 + }, + { + "epoch": 4.253798033958892, + "grad_norm": 63.97805404663086, + "learning_rate": 4.8090890480038796e-05, + "loss": 0.3564, + "num_input_tokens_seen": 16565792, + "step": 28560 + }, + { + "epoch": 4.254542746499851, + "grad_norm": 35.830780029296875, + "learning_rate": 4.80896448735097e-05, + "loss": 0.2806, + "num_input_tokens_seen": 16568704, + "step": 28565 + }, + { + "epoch": 4.25528745904081, + "grad_norm": 20.43401527404785, + "learning_rate": 4.8088398876904526e-05, + "loss": 0.4588, + "num_input_tokens_seen": 16571648, + "step": 28570 + }, + { + "epoch": 4.2560321715817695, + "grad_norm": 32.908870697021484, + "learning_rate": 4.808715249024431e-05, + "loss": 0.2453, + "num_input_tokens_seen": 16575008, + "step": 28575 + }, + { + "epoch": 4.256776884122728, + "grad_norm": 7.576714038848877, + "learning_rate": 4.808590571355013e-05, + "loss": 0.3061, + "num_input_tokens_seen": 16577728, + "step": 28580 + }, + { + "epoch": 4.257521596663688, + "grad_norm": 12.106696128845215, + "learning_rate": 4.808465854684303e-05, + "loss": 0.1976, + "num_input_tokens_seen": 16580608, + "step": 28585 + }, + { + "epoch": 4.258266309204647, + "grad_norm": 9.164046287536621, + "learning_rate": 4.8083410990144085e-05, + "loss": 0.3718, + "num_input_tokens_seen": 16583680, + "step": 28590 + }, + { + "epoch": 4.259011021745606, + "grad_norm": 25.022489547729492, + "learning_rate": 4.808216304347438e-05, + "loss": 0.2594, + "num_input_tokens_seen": 16586912, + "step": 28595 + }, + { + "epoch": 4.259755734286565, + "grad_norm": 0.07518516480922699, + "learning_rate": 4.8080914706854985e-05, + "loss": 0.2904, + "num_input_tokens_seen": 16589792, + "step": 28600 + }, + { + "epoch": 4.260500446827525, + "grad_norm": 0.6321947574615479, + "learning_rate": 4.8079665980306986e-05, + "loss": 0.4556, + "num_input_tokens_seen": 16592928, + "step": 28605 + }, + { + "epoch": 4.2612451593684835, + "grad_norm": 70.88273620605469, + "learning_rate": 4.80784168638515e-05, + "loss": 0.2073, + "num_input_tokens_seen": 16595712, + "step": 28610 + }, + { + "epoch": 4.261989871909443, + "grad_norm": 20.36012840270996, + "learning_rate": 4.807716735750961e-05, + "loss": 0.2468, + "num_input_tokens_seen": 16598912, + "step": 28615 + }, + { + "epoch": 4.262734584450402, + "grad_norm": 24.906906127929688, + "learning_rate": 4.8075917461302435e-05, + "loss": 0.3312, + "num_input_tokens_seen": 16602080, + "step": 28620 + }, + { + "epoch": 4.2634792969913615, + "grad_norm": 11.06688404083252, + "learning_rate": 4.807466717525109e-05, + "loss": 0.2381, + "num_input_tokens_seen": 16604960, + "step": 28625 + }, + { + "epoch": 4.26422400953232, + "grad_norm": 27.80609703063965, + "learning_rate": 4.807341649937669e-05, + "loss": 0.2123, + "num_input_tokens_seen": 16607776, + "step": 28630 + }, + { + "epoch": 4.26496872207328, + "grad_norm": 23.56313705444336, + "learning_rate": 4.8072165433700366e-05, + "loss": 0.4274, + "num_input_tokens_seen": 16610624, + "step": 28635 + }, + { + "epoch": 4.265713434614239, + "grad_norm": 13.529838562011719, + "learning_rate": 4.807091397824327e-05, + "loss": 0.3937, + "num_input_tokens_seen": 16613504, + "step": 28640 + }, + { + "epoch": 4.266458147155198, + "grad_norm": 15.0145902633667, + "learning_rate": 4.806966213302652e-05, + "loss": 0.2633, + "num_input_tokens_seen": 16616544, + "step": 28645 + }, + { + "epoch": 4.267202859696157, + "grad_norm": 0.2621101438999176, + "learning_rate": 4.806840989807128e-05, + "loss": 0.1398, + "num_input_tokens_seen": 16619552, + "step": 28650 + }, + { + "epoch": 4.267947572237117, + "grad_norm": 11.462831497192383, + "learning_rate": 4.806715727339869e-05, + "loss": 0.3384, + "num_input_tokens_seen": 16622368, + "step": 28655 + }, + { + "epoch": 4.2686922847780755, + "grad_norm": 29.827905654907227, + "learning_rate": 4.8065904259029934e-05, + "loss": 0.4606, + "num_input_tokens_seen": 16625280, + "step": 28660 + }, + { + "epoch": 4.269436997319035, + "grad_norm": 10.973686218261719, + "learning_rate": 4.806465085498616e-05, + "loss": 0.4897, + "num_input_tokens_seen": 16628256, + "step": 28665 + }, + { + "epoch": 4.270181709859994, + "grad_norm": 0.9960203766822815, + "learning_rate": 4.806339706128856e-05, + "loss": 0.5352, + "num_input_tokens_seen": 16631616, + "step": 28670 + }, + { + "epoch": 4.2709264224009535, + "grad_norm": 1.7715964317321777, + "learning_rate": 4.8062142877958307e-05, + "loss": 0.171, + "num_input_tokens_seen": 16634496, + "step": 28675 + }, + { + "epoch": 4.271671134941912, + "grad_norm": 0.021309679374098778, + "learning_rate": 4.8060888305016584e-05, + "loss": 0.2268, + "num_input_tokens_seen": 16637440, + "step": 28680 + }, + { + "epoch": 4.272415847482872, + "grad_norm": 15.00682258605957, + "learning_rate": 4.8059633342484586e-05, + "loss": 0.4555, + "num_input_tokens_seen": 16640192, + "step": 28685 + }, + { + "epoch": 4.273160560023831, + "grad_norm": 41.583335876464844, + "learning_rate": 4.805837799038353e-05, + "loss": 0.2687, + "num_input_tokens_seen": 16643200, + "step": 28690 + }, + { + "epoch": 4.27390527256479, + "grad_norm": 0.8707924485206604, + "learning_rate": 4.805712224873461e-05, + "loss": 0.318, + "num_input_tokens_seen": 16646368, + "step": 28695 + }, + { + "epoch": 4.274649985105749, + "grad_norm": 49.58342742919922, + "learning_rate": 4.805586611755905e-05, + "loss": 0.5288, + "num_input_tokens_seen": 16649216, + "step": 28700 + }, + { + "epoch": 4.275394697646709, + "grad_norm": 29.69356346130371, + "learning_rate": 4.805460959687805e-05, + "loss": 0.6877, + "num_input_tokens_seen": 16652032, + "step": 28705 + }, + { + "epoch": 4.2761394101876675, + "grad_norm": 0.3755721151828766, + "learning_rate": 4.805335268671286e-05, + "loss": 0.2738, + "num_input_tokens_seen": 16654880, + "step": 28710 + }, + { + "epoch": 4.276884122728626, + "grad_norm": 8.383697509765625, + "learning_rate": 4.805209538708471e-05, + "loss": 0.2004, + "num_input_tokens_seen": 16657760, + "step": 28715 + }, + { + "epoch": 4.277628835269586, + "grad_norm": 4.383885383605957, + "learning_rate": 4.805083769801484e-05, + "loss": 0.211, + "num_input_tokens_seen": 16660640, + "step": 28720 + }, + { + "epoch": 4.278373547810546, + "grad_norm": 5.488803863525391, + "learning_rate": 4.804957961952449e-05, + "loss": 0.256, + "num_input_tokens_seen": 16663552, + "step": 28725 + }, + { + "epoch": 4.279118260351504, + "grad_norm": 6.8197503089904785, + "learning_rate": 4.804832115163491e-05, + "loss": 0.0522, + "num_input_tokens_seen": 16666752, + "step": 28730 + }, + { + "epoch": 4.279862972892463, + "grad_norm": 11.39733600616455, + "learning_rate": 4.804706229436739e-05, + "loss": 0.1915, + "num_input_tokens_seen": 16669696, + "step": 28735 + }, + { + "epoch": 4.280607685433423, + "grad_norm": 39.53892135620117, + "learning_rate": 4.804580304774316e-05, + "loss": 0.3396, + "num_input_tokens_seen": 16672640, + "step": 28740 + }, + { + "epoch": 4.2813523979743815, + "grad_norm": 17.13007164001465, + "learning_rate": 4.804454341178352e-05, + "loss": 0.4817, + "num_input_tokens_seen": 16675520, + "step": 28745 + }, + { + "epoch": 4.282097110515341, + "grad_norm": 16.742475509643555, + "learning_rate": 4.804328338650973e-05, + "loss": 0.2133, + "num_input_tokens_seen": 16678304, + "step": 28750 + }, + { + "epoch": 4.2828418230563, + "grad_norm": 4.284462928771973, + "learning_rate": 4.804202297194309e-05, + "loss": 0.3202, + "num_input_tokens_seen": 16682304, + "step": 28755 + }, + { + "epoch": 4.2835865355972595, + "grad_norm": 14.140411376953125, + "learning_rate": 4.8040762168104895e-05, + "loss": 0.3885, + "num_input_tokens_seen": 16685152, + "step": 28760 + }, + { + "epoch": 4.284331248138218, + "grad_norm": 9.590864181518555, + "learning_rate": 4.803950097501644e-05, + "loss": 0.1888, + "num_input_tokens_seen": 16687904, + "step": 28765 + }, + { + "epoch": 4.285075960679178, + "grad_norm": 18.89423942565918, + "learning_rate": 4.8038239392699033e-05, + "loss": 0.2972, + "num_input_tokens_seen": 16690752, + "step": 28770 + }, + { + "epoch": 4.285820673220137, + "grad_norm": 11.13714599609375, + "learning_rate": 4.803697742117399e-05, + "loss": 0.2307, + "num_input_tokens_seen": 16693760, + "step": 28775 + }, + { + "epoch": 4.286565385761096, + "grad_norm": 0.22127510607242584, + "learning_rate": 4.8035715060462614e-05, + "loss": 0.202, + "num_input_tokens_seen": 16696480, + "step": 28780 + }, + { + "epoch": 4.287310098302055, + "grad_norm": 1.0717326402664185, + "learning_rate": 4.803445231058625e-05, + "loss": 0.4978, + "num_input_tokens_seen": 16699328, + "step": 28785 + }, + { + "epoch": 4.288054810843015, + "grad_norm": 5.094315052032471, + "learning_rate": 4.803318917156624e-05, + "loss": 0.4114, + "num_input_tokens_seen": 16702336, + "step": 28790 + }, + { + "epoch": 4.2887995233839735, + "grad_norm": 5.424774646759033, + "learning_rate": 4.803192564342389e-05, + "loss": 0.3588, + "num_input_tokens_seen": 16705408, + "step": 28795 + }, + { + "epoch": 4.289544235924933, + "grad_norm": 0.6658567786216736, + "learning_rate": 4.803066172618058e-05, + "loss": 0.3208, + "num_input_tokens_seen": 16708448, + "step": 28800 + }, + { + "epoch": 4.290288948465892, + "grad_norm": 22.088211059570312, + "learning_rate": 4.802939741985763e-05, + "loss": 0.5097, + "num_input_tokens_seen": 16711456, + "step": 28805 + }, + { + "epoch": 4.291033661006852, + "grad_norm": 34.60633087158203, + "learning_rate": 4.802813272447643e-05, + "loss": 0.2999, + "num_input_tokens_seen": 16714336, + "step": 28810 + }, + { + "epoch": 4.29177837354781, + "grad_norm": 3.0481412410736084, + "learning_rate": 4.8026867640058335e-05, + "loss": 0.2854, + "num_input_tokens_seen": 16717440, + "step": 28815 + }, + { + "epoch": 4.29252308608877, + "grad_norm": 11.702635765075684, + "learning_rate": 4.8025602166624705e-05, + "loss": 0.2578, + "num_input_tokens_seen": 16720320, + "step": 28820 + }, + { + "epoch": 4.293267798629729, + "grad_norm": 25.354061126708984, + "learning_rate": 4.8024336304196927e-05, + "loss": 0.3322, + "num_input_tokens_seen": 16722752, + "step": 28825 + }, + { + "epoch": 4.294012511170688, + "grad_norm": 4.917409420013428, + "learning_rate": 4.802307005279639e-05, + "loss": 0.0633, + "num_input_tokens_seen": 16725728, + "step": 28830 + }, + { + "epoch": 4.294757223711647, + "grad_norm": 0.7596952319145203, + "learning_rate": 4.8021803412444496e-05, + "loss": 0.1974, + "num_input_tokens_seen": 16728992, + "step": 28835 + }, + { + "epoch": 4.295501936252607, + "grad_norm": 35.97960662841797, + "learning_rate": 4.8020536383162615e-05, + "loss": 0.347, + "num_input_tokens_seen": 16732192, + "step": 28840 + }, + { + "epoch": 4.2962466487935655, + "grad_norm": 5.730953693389893, + "learning_rate": 4.8019268964972184e-05, + "loss": 0.3645, + "num_input_tokens_seen": 16735104, + "step": 28845 + }, + { + "epoch": 4.296991361334525, + "grad_norm": 6.536734580993652, + "learning_rate": 4.801800115789459e-05, + "loss": 0.059, + "num_input_tokens_seen": 16737856, + "step": 28850 + }, + { + "epoch": 4.297736073875484, + "grad_norm": 1.9942313432693481, + "learning_rate": 4.801673296195126e-05, + "loss": 0.2497, + "num_input_tokens_seen": 16740736, + "step": 28855 + }, + { + "epoch": 4.298480786416444, + "grad_norm": 7.297593593597412, + "learning_rate": 4.801546437716362e-05, + "loss": 0.256, + "num_input_tokens_seen": 16743488, + "step": 28860 + }, + { + "epoch": 4.299225498957402, + "grad_norm": 20.895889282226562, + "learning_rate": 4.801419540355311e-05, + "loss": 0.4476, + "num_input_tokens_seen": 16746496, + "step": 28865 + }, + { + "epoch": 4.299970211498362, + "grad_norm": 14.175061225891113, + "learning_rate": 4.801292604114115e-05, + "loss": 0.4012, + "num_input_tokens_seen": 16749280, + "step": 28870 + }, + { + "epoch": 4.300714924039321, + "grad_norm": 0.29712408781051636, + "learning_rate": 4.80116562899492e-05, + "loss": 0.1994, + "num_input_tokens_seen": 16752320, + "step": 28875 + }, + { + "epoch": 4.30145963658028, + "grad_norm": 63.268009185791016, + "learning_rate": 4.80103861499987e-05, + "loss": 0.6535, + "num_input_tokens_seen": 16755360, + "step": 28880 + }, + { + "epoch": 4.302204349121239, + "grad_norm": 7.874936103820801, + "learning_rate": 4.800911562131112e-05, + "loss": 0.1153, + "num_input_tokens_seen": 16758144, + "step": 28885 + }, + { + "epoch": 4.302949061662199, + "grad_norm": 25.812931060791016, + "learning_rate": 4.800784470390791e-05, + "loss": 0.4675, + "num_input_tokens_seen": 16760800, + "step": 28890 + }, + { + "epoch": 4.303693774203158, + "grad_norm": 5.470602512359619, + "learning_rate": 4.800657339781055e-05, + "loss": 0.2621, + "num_input_tokens_seen": 16763680, + "step": 28895 + }, + { + "epoch": 4.304438486744116, + "grad_norm": 34.28282928466797, + "learning_rate": 4.800530170304051e-05, + "loss": 0.7568, + "num_input_tokens_seen": 16766368, + "step": 28900 + }, + { + "epoch": 4.305183199285076, + "grad_norm": 31.946455001831055, + "learning_rate": 4.800402961961928e-05, + "loss": 0.2568, + "num_input_tokens_seen": 16769152, + "step": 28905 + }, + { + "epoch": 4.305927911826035, + "grad_norm": 14.459739685058594, + "learning_rate": 4.800275714756836e-05, + "loss": 0.587, + "num_input_tokens_seen": 16771808, + "step": 28910 + }, + { + "epoch": 4.306672624366994, + "grad_norm": 20.577064514160156, + "learning_rate": 4.800148428690923e-05, + "loss": 0.4422, + "num_input_tokens_seen": 16774784, + "step": 28915 + }, + { + "epoch": 4.307417336907953, + "grad_norm": 13.704504013061523, + "learning_rate": 4.80002110376634e-05, + "loss": 0.1681, + "num_input_tokens_seen": 16777664, + "step": 28920 + }, + { + "epoch": 4.308162049448913, + "grad_norm": 14.91804313659668, + "learning_rate": 4.7998937399852386e-05, + "loss": 0.1325, + "num_input_tokens_seen": 16780416, + "step": 28925 + }, + { + "epoch": 4.3089067619898715, + "grad_norm": 8.251310348510742, + "learning_rate": 4.799766337349769e-05, + "loss": 0.3432, + "num_input_tokens_seen": 16783360, + "step": 28930 + }, + { + "epoch": 4.309651474530831, + "grad_norm": 35.684024810791016, + "learning_rate": 4.799638895862085e-05, + "loss": 0.3888, + "num_input_tokens_seen": 16786400, + "step": 28935 + }, + { + "epoch": 4.31039618707179, + "grad_norm": 7.535246849060059, + "learning_rate": 4.79951141552434e-05, + "loss": 0.3641, + "num_input_tokens_seen": 16789248, + "step": 28940 + }, + { + "epoch": 4.31114089961275, + "grad_norm": 40.761207580566406, + "learning_rate": 4.799383896338686e-05, + "loss": 0.273, + "num_input_tokens_seen": 16792192, + "step": 28945 + }, + { + "epoch": 4.311885612153708, + "grad_norm": 14.066943168640137, + "learning_rate": 4.7992563383072775e-05, + "loss": 0.3393, + "num_input_tokens_seen": 16794912, + "step": 28950 + }, + { + "epoch": 4.312630324694668, + "grad_norm": 13.973109245300293, + "learning_rate": 4.799128741432271e-05, + "loss": 0.512, + "num_input_tokens_seen": 16797824, + "step": 28955 + }, + { + "epoch": 4.313375037235627, + "grad_norm": 17.931392669677734, + "learning_rate": 4.7990011057158207e-05, + "loss": 0.3932, + "num_input_tokens_seen": 16800800, + "step": 28960 + }, + { + "epoch": 4.314119749776586, + "grad_norm": 23.046642303466797, + "learning_rate": 4.798873431160084e-05, + "loss": 0.6647, + "num_input_tokens_seen": 16803520, + "step": 28965 + }, + { + "epoch": 4.314864462317545, + "grad_norm": 63.16336441040039, + "learning_rate": 4.798745717767216e-05, + "loss": 0.133, + "num_input_tokens_seen": 16806656, + "step": 28970 + }, + { + "epoch": 4.315609174858505, + "grad_norm": 13.524558067321777, + "learning_rate": 4.7986179655393756e-05, + "loss": 0.2756, + "num_input_tokens_seen": 16809216, + "step": 28975 + }, + { + "epoch": 4.316353887399464, + "grad_norm": 10.53085994720459, + "learning_rate": 4.798490174478721e-05, + "loss": 0.1947, + "num_input_tokens_seen": 16812064, + "step": 28980 + }, + { + "epoch": 4.317098599940423, + "grad_norm": 11.259649276733398, + "learning_rate": 4.7983623445874114e-05, + "loss": 0.2621, + "num_input_tokens_seen": 16814944, + "step": 28985 + }, + { + "epoch": 4.317843312481382, + "grad_norm": 33.35010528564453, + "learning_rate": 4.798234475867606e-05, + "loss": 0.4071, + "num_input_tokens_seen": 16817952, + "step": 28990 + }, + { + "epoch": 4.318588025022342, + "grad_norm": 20.382699966430664, + "learning_rate": 4.7981065683214645e-05, + "loss": 0.3102, + "num_input_tokens_seen": 16820928, + "step": 28995 + }, + { + "epoch": 4.3193327375633, + "grad_norm": 13.280024528503418, + "learning_rate": 4.797978621951148e-05, + "loss": 0.2301, + "num_input_tokens_seen": 16823840, + "step": 29000 + }, + { + "epoch": 4.32007745010426, + "grad_norm": 24.533565521240234, + "learning_rate": 4.797850636758819e-05, + "loss": 0.409, + "num_input_tokens_seen": 16826560, + "step": 29005 + }, + { + "epoch": 4.320822162645219, + "grad_norm": 0.7454230785369873, + "learning_rate": 4.7977226127466386e-05, + "loss": 0.2083, + "num_input_tokens_seen": 16829184, + "step": 29010 + }, + { + "epoch": 4.321566875186178, + "grad_norm": 27.49786376953125, + "learning_rate": 4.7975945499167696e-05, + "loss": 0.3421, + "num_input_tokens_seen": 16832032, + "step": 29015 + }, + { + "epoch": 4.322311587727137, + "grad_norm": 29.700790405273438, + "learning_rate": 4.797466448271376e-05, + "loss": 0.3918, + "num_input_tokens_seen": 16834976, + "step": 29020 + }, + { + "epoch": 4.323056300268097, + "grad_norm": 21.917152404785156, + "learning_rate": 4.7973383078126223e-05, + "loss": 0.3335, + "num_input_tokens_seen": 16837792, + "step": 29025 + }, + { + "epoch": 4.323801012809056, + "grad_norm": 17.774709701538086, + "learning_rate": 4.797210128542673e-05, + "loss": 0.4493, + "num_input_tokens_seen": 16840864, + "step": 29030 + }, + { + "epoch": 4.324545725350015, + "grad_norm": 0.12125902622938156, + "learning_rate": 4.7970819104636924e-05, + "loss": 0.317, + "num_input_tokens_seen": 16843840, + "step": 29035 + }, + { + "epoch": 4.325290437890974, + "grad_norm": 28.07311248779297, + "learning_rate": 4.796953653577848e-05, + "loss": 0.1525, + "num_input_tokens_seen": 16846560, + "step": 29040 + }, + { + "epoch": 4.326035150431934, + "grad_norm": 12.744878768920898, + "learning_rate": 4.7968253578873054e-05, + "loss": 0.1446, + "num_input_tokens_seen": 16849056, + "step": 29045 + }, + { + "epoch": 4.326779862972892, + "grad_norm": 0.12325242161750793, + "learning_rate": 4.796697023394234e-05, + "loss": 0.1933, + "num_input_tokens_seen": 16851904, + "step": 29050 + }, + { + "epoch": 4.327524575513852, + "grad_norm": 19.171268463134766, + "learning_rate": 4.7965686501008e-05, + "loss": 0.6745, + "num_input_tokens_seen": 16854592, + "step": 29055 + }, + { + "epoch": 4.328269288054811, + "grad_norm": 12.915350914001465, + "learning_rate": 4.7964402380091734e-05, + "loss": 0.3757, + "num_input_tokens_seen": 16857312, + "step": 29060 + }, + { + "epoch": 4.32901400059577, + "grad_norm": 28.331628799438477, + "learning_rate": 4.7963117871215224e-05, + "loss": 0.2305, + "num_input_tokens_seen": 16860416, + "step": 29065 + }, + { + "epoch": 4.329758713136729, + "grad_norm": 18.022605895996094, + "learning_rate": 4.796183297440018e-05, + "loss": 0.1839, + "num_input_tokens_seen": 16863360, + "step": 29070 + }, + { + "epoch": 4.330503425677689, + "grad_norm": 9.340893745422363, + "learning_rate": 4.79605476896683e-05, + "loss": 0.3798, + "num_input_tokens_seen": 16866208, + "step": 29075 + }, + { + "epoch": 4.331248138218648, + "grad_norm": 25.572742462158203, + "learning_rate": 4.795926201704131e-05, + "loss": 0.2281, + "num_input_tokens_seen": 16868992, + "step": 29080 + }, + { + "epoch": 4.331992850759606, + "grad_norm": 28.738693237304688, + "learning_rate": 4.795797595654091e-05, + "loss": 0.6307, + "num_input_tokens_seen": 16872032, + "step": 29085 + }, + { + "epoch": 4.332737563300566, + "grad_norm": 1.8812100887298584, + "learning_rate": 4.795668950818885e-05, + "loss": 0.1228, + "num_input_tokens_seen": 16875424, + "step": 29090 + }, + { + "epoch": 4.333482275841525, + "grad_norm": 11.43095874786377, + "learning_rate": 4.7955402672006854e-05, + "loss": 0.4011, + "num_input_tokens_seen": 16878080, + "step": 29095 + }, + { + "epoch": 4.334226988382484, + "grad_norm": 12.45539379119873, + "learning_rate": 4.7954115448016654e-05, + "loss": 0.1976, + "num_input_tokens_seen": 16880800, + "step": 29100 + }, + { + "epoch": 4.334971700923443, + "grad_norm": 17.385034561157227, + "learning_rate": 4.795282783624001e-05, + "loss": 0.2809, + "num_input_tokens_seen": 16883552, + "step": 29105 + }, + { + "epoch": 4.335716413464403, + "grad_norm": 5.360114574432373, + "learning_rate": 4.795153983669867e-05, + "loss": 0.0875, + "num_input_tokens_seen": 16886464, + "step": 29110 + }, + { + "epoch": 4.336461126005362, + "grad_norm": 19.451175689697266, + "learning_rate": 4.795025144941438e-05, + "loss": 0.2989, + "num_input_tokens_seen": 16889664, + "step": 29115 + }, + { + "epoch": 4.337205838546321, + "grad_norm": 6.971844673156738, + "learning_rate": 4.794896267440893e-05, + "loss": 0.2188, + "num_input_tokens_seen": 16892320, + "step": 29120 + }, + { + "epoch": 4.33795055108728, + "grad_norm": 19.789119720458984, + "learning_rate": 4.794767351170406e-05, + "loss": 0.5497, + "num_input_tokens_seen": 16895104, + "step": 29125 + }, + { + "epoch": 4.33869526362824, + "grad_norm": 11.84891414642334, + "learning_rate": 4.794638396132159e-05, + "loss": 0.23, + "num_input_tokens_seen": 16897824, + "step": 29130 + }, + { + "epoch": 4.339439976169198, + "grad_norm": 17.507307052612305, + "learning_rate": 4.7945094023283275e-05, + "loss": 0.606, + "num_input_tokens_seen": 16901024, + "step": 29135 + }, + { + "epoch": 4.340184688710158, + "grad_norm": 12.413040161132812, + "learning_rate": 4.794380369761092e-05, + "loss": 0.277, + "num_input_tokens_seen": 16903840, + "step": 29140 + }, + { + "epoch": 4.340929401251117, + "grad_norm": 0.5102971792221069, + "learning_rate": 4.794251298432632e-05, + "loss": 0.1079, + "num_input_tokens_seen": 16906752, + "step": 29145 + }, + { + "epoch": 4.3416741137920765, + "grad_norm": 5.583956718444824, + "learning_rate": 4.794122188345128e-05, + "loss": 0.1642, + "num_input_tokens_seen": 16910048, + "step": 29150 + }, + { + "epoch": 4.342418826333035, + "grad_norm": 38.32942581176758, + "learning_rate": 4.7939930395007615e-05, + "loss": 0.6432, + "num_input_tokens_seen": 16912800, + "step": 29155 + }, + { + "epoch": 4.343163538873995, + "grad_norm": 6.999309539794922, + "learning_rate": 4.7938638519017134e-05, + "loss": 0.802, + "num_input_tokens_seen": 16915680, + "step": 29160 + }, + { + "epoch": 4.343908251414954, + "grad_norm": 8.394060134887695, + "learning_rate": 4.793734625550167e-05, + "loss": 0.3741, + "num_input_tokens_seen": 16918848, + "step": 29165 + }, + { + "epoch": 4.344652963955913, + "grad_norm": 50.89555358886719, + "learning_rate": 4.7936053604483065e-05, + "loss": 0.3806, + "num_input_tokens_seen": 16921600, + "step": 29170 + }, + { + "epoch": 4.345397676496872, + "grad_norm": 2.4384493827819824, + "learning_rate": 4.793476056598314e-05, + "loss": 0.2872, + "num_input_tokens_seen": 16924256, + "step": 29175 + }, + { + "epoch": 4.346142389037832, + "grad_norm": 29.392732620239258, + "learning_rate": 4.7933467140023736e-05, + "loss": 0.1605, + "num_input_tokens_seen": 16927136, + "step": 29180 + }, + { + "epoch": 4.34688710157879, + "grad_norm": 6.8290228843688965, + "learning_rate": 4.793217332662672e-05, + "loss": 0.1217, + "num_input_tokens_seen": 16929888, + "step": 29185 + }, + { + "epoch": 4.34763181411975, + "grad_norm": 21.405555725097656, + "learning_rate": 4.7930879125813945e-05, + "loss": 0.2618, + "num_input_tokens_seen": 16932928, + "step": 29190 + }, + { + "epoch": 4.348376526660709, + "grad_norm": 22.289602279663086, + "learning_rate": 4.792958453760728e-05, + "loss": 0.5017, + "num_input_tokens_seen": 16935968, + "step": 29195 + }, + { + "epoch": 4.3491212392016685, + "grad_norm": 5.244910717010498, + "learning_rate": 4.792828956202857e-05, + "loss": 0.2606, + "num_input_tokens_seen": 16938720, + "step": 29200 + }, + { + "epoch": 4.349865951742627, + "grad_norm": 22.23744773864746, + "learning_rate": 4.792699419909972e-05, + "loss": 0.2297, + "num_input_tokens_seen": 16941536, + "step": 29205 + }, + { + "epoch": 4.350610664283587, + "grad_norm": 53.94146728515625, + "learning_rate": 4.792569844884261e-05, + "loss": 0.2667, + "num_input_tokens_seen": 16944448, + "step": 29210 + }, + { + "epoch": 4.351355376824546, + "grad_norm": 5.722512722015381, + "learning_rate": 4.792440231127912e-05, + "loss": 0.6081, + "num_input_tokens_seen": 16947424, + "step": 29215 + }, + { + "epoch": 4.352100089365505, + "grad_norm": 5.309858322143555, + "learning_rate": 4.792310578643116e-05, + "loss": 0.3044, + "num_input_tokens_seen": 16950336, + "step": 29220 + }, + { + "epoch": 4.352844801906464, + "grad_norm": 13.971969604492188, + "learning_rate": 4.7921808874320616e-05, + "loss": 0.39, + "num_input_tokens_seen": 16953056, + "step": 29225 + }, + { + "epoch": 4.353589514447424, + "grad_norm": 5.907062530517578, + "learning_rate": 4.792051157496941e-05, + "loss": 0.4098, + "num_input_tokens_seen": 16955872, + "step": 29230 + }, + { + "epoch": 4.3543342269883825, + "grad_norm": 10.666295051574707, + "learning_rate": 4.791921388839946e-05, + "loss": 0.3824, + "num_input_tokens_seen": 16958848, + "step": 29235 + }, + { + "epoch": 4.355078939529342, + "grad_norm": 14.933871269226074, + "learning_rate": 4.791791581463268e-05, + "loss": 0.308, + "num_input_tokens_seen": 16961600, + "step": 29240 + }, + { + "epoch": 4.355823652070301, + "grad_norm": 16.455368041992188, + "learning_rate": 4.791661735369101e-05, + "loss": 0.2699, + "num_input_tokens_seen": 16964480, + "step": 29245 + }, + { + "epoch": 4.35656836461126, + "grad_norm": 30.024002075195312, + "learning_rate": 4.791531850559637e-05, + "loss": 0.4521, + "num_input_tokens_seen": 16967296, + "step": 29250 + }, + { + "epoch": 4.357313077152219, + "grad_norm": 6.100282192230225, + "learning_rate": 4.791401927037073e-05, + "loss": 0.1888, + "num_input_tokens_seen": 16970240, + "step": 29255 + }, + { + "epoch": 4.358057789693178, + "grad_norm": 8.273956298828125, + "learning_rate": 4.791271964803602e-05, + "loss": 0.3514, + "num_input_tokens_seen": 16973024, + "step": 29260 + }, + { + "epoch": 4.358802502234138, + "grad_norm": 3.211993932723999, + "learning_rate": 4.791141963861419e-05, + "loss": 0.0768, + "num_input_tokens_seen": 16975584, + "step": 29265 + }, + { + "epoch": 4.359547214775096, + "grad_norm": 7.724535942077637, + "learning_rate": 4.791011924212721e-05, + "loss": 0.3008, + "num_input_tokens_seen": 16978624, + "step": 29270 + }, + { + "epoch": 4.360291927316056, + "grad_norm": 35.344478607177734, + "learning_rate": 4.790881845859707e-05, + "loss": 0.3658, + "num_input_tokens_seen": 16981376, + "step": 29275 + }, + { + "epoch": 4.361036639857015, + "grad_norm": 1.327999472618103, + "learning_rate": 4.790751728804571e-05, + "loss": 0.4466, + "num_input_tokens_seen": 16984160, + "step": 29280 + }, + { + "epoch": 4.3617813523979745, + "grad_norm": 9.279290199279785, + "learning_rate": 4.790621573049513e-05, + "loss": 0.3434, + "num_input_tokens_seen": 16987072, + "step": 29285 + }, + { + "epoch": 4.362526064938933, + "grad_norm": 33.63454818725586, + "learning_rate": 4.790491378596731e-05, + "loss": 0.3832, + "num_input_tokens_seen": 16990080, + "step": 29290 + }, + { + "epoch": 4.363270777479893, + "grad_norm": 53.5229377746582, + "learning_rate": 4.7903611454484266e-05, + "loss": 0.1863, + "num_input_tokens_seen": 16992992, + "step": 29295 + }, + { + "epoch": 4.364015490020852, + "grad_norm": 15.985663414001465, + "learning_rate": 4.790230873606797e-05, + "loss": 0.4668, + "num_input_tokens_seen": 16996000, + "step": 29300 + }, + { + "epoch": 4.364760202561811, + "grad_norm": 5.858895301818848, + "learning_rate": 4.790100563074045e-05, + "loss": 0.267, + "num_input_tokens_seen": 16998816, + "step": 29305 + }, + { + "epoch": 4.36550491510277, + "grad_norm": 4.416316509246826, + "learning_rate": 4.789970213852372e-05, + "loss": 0.1055, + "num_input_tokens_seen": 17001664, + "step": 29310 + }, + { + "epoch": 4.36624962764373, + "grad_norm": 18.275188446044922, + "learning_rate": 4.789839825943979e-05, + "loss": 0.3173, + "num_input_tokens_seen": 17004704, + "step": 29315 + }, + { + "epoch": 4.3669943401846885, + "grad_norm": 2.3198187351226807, + "learning_rate": 4.78970939935107e-05, + "loss": 0.3034, + "num_input_tokens_seen": 17007680, + "step": 29320 + }, + { + "epoch": 4.367739052725648, + "grad_norm": 2.872305393218994, + "learning_rate": 4.789578934075847e-05, + "loss": 0.1017, + "num_input_tokens_seen": 17010624, + "step": 29325 + }, + { + "epoch": 4.368483765266607, + "grad_norm": 23.61745834350586, + "learning_rate": 4.7894484301205156e-05, + "loss": 0.4228, + "num_input_tokens_seen": 17013696, + "step": 29330 + }, + { + "epoch": 4.3692284778075665, + "grad_norm": 16.958454132080078, + "learning_rate": 4.78931788748728e-05, + "loss": 0.1896, + "num_input_tokens_seen": 17016512, + "step": 29335 + }, + { + "epoch": 4.369973190348525, + "grad_norm": 28.379423141479492, + "learning_rate": 4.789187306178345e-05, + "loss": 0.3645, + "num_input_tokens_seen": 17019392, + "step": 29340 + }, + { + "epoch": 4.370717902889485, + "grad_norm": 28.21207046508789, + "learning_rate": 4.789056686195917e-05, + "loss": 0.1723, + "num_input_tokens_seen": 17022560, + "step": 29345 + }, + { + "epoch": 4.371462615430444, + "grad_norm": 25.956403732299805, + "learning_rate": 4.788926027542203e-05, + "loss": 0.6401, + "num_input_tokens_seen": 17025408, + "step": 29350 + }, + { + "epoch": 4.372207327971403, + "grad_norm": 12.659544944763184, + "learning_rate": 4.7887953302194106e-05, + "loss": 0.2399, + "num_input_tokens_seen": 17028288, + "step": 29355 + }, + { + "epoch": 4.372952040512362, + "grad_norm": 12.922609329223633, + "learning_rate": 4.788664594229747e-05, + "loss": 0.2892, + "num_input_tokens_seen": 17031072, + "step": 29360 + }, + { + "epoch": 4.373696753053322, + "grad_norm": 0.03707532957196236, + "learning_rate": 4.788533819575421e-05, + "loss": 0.1781, + "num_input_tokens_seen": 17034176, + "step": 29365 + }, + { + "epoch": 4.3744414655942805, + "grad_norm": 16.389883041381836, + "learning_rate": 4.7884030062586424e-05, + "loss": 0.2659, + "num_input_tokens_seen": 17037120, + "step": 29370 + }, + { + "epoch": 4.37518617813524, + "grad_norm": 8.260663032531738, + "learning_rate": 4.78827215428162e-05, + "loss": 0.1466, + "num_input_tokens_seen": 17040160, + "step": 29375 + }, + { + "epoch": 4.375930890676199, + "grad_norm": 9.822958946228027, + "learning_rate": 4.7881412636465664e-05, + "loss": 0.5039, + "num_input_tokens_seen": 17043072, + "step": 29380 + }, + { + "epoch": 4.3766756032171585, + "grad_norm": 3.0624136924743652, + "learning_rate": 4.7880103343556906e-05, + "loss": 0.2285, + "num_input_tokens_seen": 17045888, + "step": 29385 + }, + { + "epoch": 4.377420315758117, + "grad_norm": 10.821120262145996, + "learning_rate": 4.787879366411206e-05, + "loss": 0.1726, + "num_input_tokens_seen": 17048672, + "step": 29390 + }, + { + "epoch": 4.378165028299077, + "grad_norm": 15.160326957702637, + "learning_rate": 4.787748359815326e-05, + "loss": 0.11, + "num_input_tokens_seen": 17051744, + "step": 29395 + }, + { + "epoch": 4.378909740840036, + "grad_norm": 22.208105087280273, + "learning_rate": 4.787617314570261e-05, + "loss": 0.3282, + "num_input_tokens_seen": 17054592, + "step": 29400 + }, + { + "epoch": 4.379654453380995, + "grad_norm": 17.704923629760742, + "learning_rate": 4.7874862306782276e-05, + "loss": 0.2832, + "num_input_tokens_seen": 17057312, + "step": 29405 + }, + { + "epoch": 4.380399165921954, + "grad_norm": 14.345149040222168, + "learning_rate": 4.787355108141439e-05, + "loss": 0.4932, + "num_input_tokens_seen": 17060224, + "step": 29410 + }, + { + "epoch": 4.381143878462913, + "grad_norm": 2.978088855743408, + "learning_rate": 4.78722394696211e-05, + "loss": 0.4633, + "num_input_tokens_seen": 17063072, + "step": 29415 + }, + { + "epoch": 4.3818885910038725, + "grad_norm": 15.892309188842773, + "learning_rate": 4.787092747142458e-05, + "loss": 0.4723, + "num_input_tokens_seen": 17066240, + "step": 29420 + }, + { + "epoch": 4.382633303544832, + "grad_norm": 28.780588150024414, + "learning_rate": 4.7869615086846973e-05, + "loss": 0.3874, + "num_input_tokens_seen": 17068992, + "step": 29425 + }, + { + "epoch": 4.383378016085791, + "grad_norm": 6.0764031410217285, + "learning_rate": 4.786830231591047e-05, + "loss": 0.3605, + "num_input_tokens_seen": 17071936, + "step": 29430 + }, + { + "epoch": 4.38412272862675, + "grad_norm": 22.17464828491211, + "learning_rate": 4.786698915863724e-05, + "loss": 0.4439, + "num_input_tokens_seen": 17074592, + "step": 29435 + }, + { + "epoch": 4.384867441167709, + "grad_norm": 22.402481079101562, + "learning_rate": 4.7865675615049464e-05, + "loss": 0.3117, + "num_input_tokens_seen": 17077632, + "step": 29440 + }, + { + "epoch": 4.385612153708668, + "grad_norm": 25.580780029296875, + "learning_rate": 4.786436168516935e-05, + "loss": 0.2983, + "num_input_tokens_seen": 17080448, + "step": 29445 + }, + { + "epoch": 4.386356866249628, + "grad_norm": 9.453603744506836, + "learning_rate": 4.786304736901908e-05, + "loss": 0.2937, + "num_input_tokens_seen": 17083296, + "step": 29450 + }, + { + "epoch": 4.3871015787905865, + "grad_norm": 31.773578643798828, + "learning_rate": 4.7861732666620856e-05, + "loss": 0.2293, + "num_input_tokens_seen": 17086528, + "step": 29455 + }, + { + "epoch": 4.387846291331546, + "grad_norm": 30.32025718688965, + "learning_rate": 4.78604175779969e-05, + "loss": 0.3328, + "num_input_tokens_seen": 17089696, + "step": 29460 + }, + { + "epoch": 4.388591003872505, + "grad_norm": 20.478538513183594, + "learning_rate": 4.7859102103169415e-05, + "loss": 0.3643, + "num_input_tokens_seen": 17092736, + "step": 29465 + }, + { + "epoch": 4.3893357164134645, + "grad_norm": 29.511226654052734, + "learning_rate": 4.785778624216064e-05, + "loss": 0.3001, + "num_input_tokens_seen": 17095552, + "step": 29470 + }, + { + "epoch": 4.390080428954423, + "grad_norm": 67.00880432128906, + "learning_rate": 4.7856469994992805e-05, + "loss": 0.5873, + "num_input_tokens_seen": 17098368, + "step": 29475 + }, + { + "epoch": 4.390825141495383, + "grad_norm": 4.850490093231201, + "learning_rate": 4.7855153361688124e-05, + "loss": 0.247, + "num_input_tokens_seen": 17101216, + "step": 29480 + }, + { + "epoch": 4.391569854036342, + "grad_norm": 1.781208872795105, + "learning_rate": 4.785383634226887e-05, + "loss": 0.1712, + "num_input_tokens_seen": 17104224, + "step": 29485 + }, + { + "epoch": 4.392314566577301, + "grad_norm": 23.645610809326172, + "learning_rate": 4.785251893675727e-05, + "loss": 0.1931, + "num_input_tokens_seen": 17107136, + "step": 29490 + }, + { + "epoch": 4.39305927911826, + "grad_norm": 26.121671676635742, + "learning_rate": 4.785120114517559e-05, + "loss": 0.4084, + "num_input_tokens_seen": 17110048, + "step": 29495 + }, + { + "epoch": 4.39380399165922, + "grad_norm": 9.314555168151855, + "learning_rate": 4.7849882967546086e-05, + "loss": 0.277, + "num_input_tokens_seen": 17113216, + "step": 29500 + }, + { + "epoch": 4.3945487042001785, + "grad_norm": 28.253480911254883, + "learning_rate": 4.784856440389105e-05, + "loss": 0.3561, + "num_input_tokens_seen": 17115968, + "step": 29505 + }, + { + "epoch": 4.395293416741138, + "grad_norm": 32.80290222167969, + "learning_rate": 4.784724545423272e-05, + "loss": 0.2388, + "num_input_tokens_seen": 17118656, + "step": 29510 + }, + { + "epoch": 4.396038129282097, + "grad_norm": 8.360032081604004, + "learning_rate": 4.7845926118593415e-05, + "loss": 0.277, + "num_input_tokens_seen": 17122144, + "step": 29515 + }, + { + "epoch": 4.396782841823057, + "grad_norm": 3.6755850315093994, + "learning_rate": 4.784460639699541e-05, + "loss": 0.1043, + "num_input_tokens_seen": 17125120, + "step": 29520 + }, + { + "epoch": 4.397527554364015, + "grad_norm": 43.91926193237305, + "learning_rate": 4.784328628946098e-05, + "loss": 0.2806, + "num_input_tokens_seen": 17127776, + "step": 29525 + }, + { + "epoch": 4.398272266904975, + "grad_norm": 22.36624526977539, + "learning_rate": 4.784196579601246e-05, + "loss": 0.2627, + "num_input_tokens_seen": 17130496, + "step": 29530 + }, + { + "epoch": 4.399016979445934, + "grad_norm": 0.2460036426782608, + "learning_rate": 4.784064491667214e-05, + "loss": 0.6446, + "num_input_tokens_seen": 17133344, + "step": 29535 + }, + { + "epoch": 4.399761691986893, + "grad_norm": 15.3202543258667, + "learning_rate": 4.7839323651462334e-05, + "loss": 0.2828, + "num_input_tokens_seen": 17136224, + "step": 29540 + }, + { + "epoch": 4.400506404527852, + "grad_norm": 15.027656555175781, + "learning_rate": 4.783800200040537e-05, + "loss": 0.3171, + "num_input_tokens_seen": 17139232, + "step": 29545 + }, + { + "epoch": 4.401251117068812, + "grad_norm": 2.106672763824463, + "learning_rate": 4.783667996352357e-05, + "loss": 0.3788, + "num_input_tokens_seen": 17142208, + "step": 29550 + }, + { + "epoch": 4.4019958296097705, + "grad_norm": 4.7831621170043945, + "learning_rate": 4.783535754083927e-05, + "loss": 0.2023, + "num_input_tokens_seen": 17145024, + "step": 29555 + }, + { + "epoch": 4.40274054215073, + "grad_norm": 0.6528099775314331, + "learning_rate": 4.783403473237483e-05, + "loss": 0.3246, + "num_input_tokens_seen": 17147936, + "step": 29560 + }, + { + "epoch": 4.403485254691689, + "grad_norm": 53.935508728027344, + "learning_rate": 4.783271153815257e-05, + "loss": 0.4642, + "num_input_tokens_seen": 17150720, + "step": 29565 + }, + { + "epoch": 4.404229967232649, + "grad_norm": 2.724112033843994, + "learning_rate": 4.783138795819485e-05, + "loss": 0.384, + "num_input_tokens_seen": 17153408, + "step": 29570 + }, + { + "epoch": 4.404974679773607, + "grad_norm": 7.38333797454834, + "learning_rate": 4.783006399252404e-05, + "loss": 0.1014, + "num_input_tokens_seen": 17156576, + "step": 29575 + }, + { + "epoch": 4.405719392314566, + "grad_norm": 0.45681026577949524, + "learning_rate": 4.782873964116251e-05, + "loss": 0.2928, + "num_input_tokens_seen": 17159648, + "step": 29580 + }, + { + "epoch": 4.406464104855526, + "grad_norm": 28.444387435913086, + "learning_rate": 4.782741490413262e-05, + "loss": 0.4574, + "num_input_tokens_seen": 17162432, + "step": 29585 + }, + { + "epoch": 4.407208817396485, + "grad_norm": 19.658185958862305, + "learning_rate": 4.782608978145675e-05, + "loss": 0.565, + "num_input_tokens_seen": 17165152, + "step": 29590 + }, + { + "epoch": 4.407953529937444, + "grad_norm": 8.207259178161621, + "learning_rate": 4.7824764273157295e-05, + "loss": 0.0874, + "num_input_tokens_seen": 17168000, + "step": 29595 + }, + { + "epoch": 4.408698242478403, + "grad_norm": 13.283620834350586, + "learning_rate": 4.782343837925665e-05, + "loss": 0.2829, + "num_input_tokens_seen": 17170976, + "step": 29600 + }, + { + "epoch": 4.409442955019363, + "grad_norm": 36.49824905395508, + "learning_rate": 4.7822112099777205e-05, + "loss": 0.7571, + "num_input_tokens_seen": 17173600, + "step": 29605 + }, + { + "epoch": 4.410187667560321, + "grad_norm": 4.192839622497559, + "learning_rate": 4.7820785434741375e-05, + "loss": 0.3949, + "num_input_tokens_seen": 17176512, + "step": 29610 + }, + { + "epoch": 4.410932380101281, + "grad_norm": 57.4027214050293, + "learning_rate": 4.7819458384171566e-05, + "loss": 0.4868, + "num_input_tokens_seen": 17179392, + "step": 29615 + }, + { + "epoch": 4.41167709264224, + "grad_norm": 7.756306171417236, + "learning_rate": 4.78181309480902e-05, + "loss": 0.2952, + "num_input_tokens_seen": 17182528, + "step": 29620 + }, + { + "epoch": 4.412421805183199, + "grad_norm": 1.0186817646026611, + "learning_rate": 4.781680312651971e-05, + "loss": 0.1948, + "num_input_tokens_seen": 17185696, + "step": 29625 + }, + { + "epoch": 4.413166517724158, + "grad_norm": 0.5810716152191162, + "learning_rate": 4.781547491948252e-05, + "loss": 0.2806, + "num_input_tokens_seen": 17188832, + "step": 29630 + }, + { + "epoch": 4.413911230265118, + "grad_norm": 14.510319709777832, + "learning_rate": 4.7814146327001067e-05, + "loss": 0.1579, + "num_input_tokens_seen": 17191808, + "step": 29635 + }, + { + "epoch": 4.4146559428060765, + "grad_norm": 6.771398544311523, + "learning_rate": 4.7812817349097796e-05, + "loss": 0.2648, + "num_input_tokens_seen": 17195040, + "step": 29640 + }, + { + "epoch": 4.415400655347036, + "grad_norm": 24.222980499267578, + "learning_rate": 4.7811487985795164e-05, + "loss": 0.2472, + "num_input_tokens_seen": 17197568, + "step": 29645 + }, + { + "epoch": 4.416145367887995, + "grad_norm": 8.613580703735352, + "learning_rate": 4.781015823711563e-05, + "loss": 0.2961, + "num_input_tokens_seen": 17200544, + "step": 29650 + }, + { + "epoch": 4.416890080428955, + "grad_norm": 33.71099090576172, + "learning_rate": 4.780882810308165e-05, + "loss": 0.2093, + "num_input_tokens_seen": 17203200, + "step": 29655 + }, + { + "epoch": 4.417634792969913, + "grad_norm": 15.866223335266113, + "learning_rate": 4.7807497583715704e-05, + "loss": 0.1726, + "num_input_tokens_seen": 17205856, + "step": 29660 + }, + { + "epoch": 4.418379505510873, + "grad_norm": 3.3936245441436768, + "learning_rate": 4.780616667904026e-05, + "loss": 0.0818, + "num_input_tokens_seen": 17208672, + "step": 29665 + }, + { + "epoch": 4.419124218051832, + "grad_norm": 22.81624984741211, + "learning_rate": 4.7804835389077824e-05, + "loss": 0.3892, + "num_input_tokens_seen": 17211296, + "step": 29670 + }, + { + "epoch": 4.419868930592791, + "grad_norm": 22.67951774597168, + "learning_rate": 4.780350371385086e-05, + "loss": 0.1929, + "num_input_tokens_seen": 17214336, + "step": 29675 + }, + { + "epoch": 4.42061364313375, + "grad_norm": 13.656778335571289, + "learning_rate": 4.7802171653381885e-05, + "loss": 0.5569, + "num_input_tokens_seen": 17217504, + "step": 29680 + }, + { + "epoch": 4.42135835567471, + "grad_norm": 22.00374984741211, + "learning_rate": 4.780083920769339e-05, + "loss": 0.3978, + "num_input_tokens_seen": 17220160, + "step": 29685 + }, + { + "epoch": 4.422103068215669, + "grad_norm": 3.220170497894287, + "learning_rate": 4.779950637680789e-05, + "loss": 0.0607, + "num_input_tokens_seen": 17223168, + "step": 29690 + }, + { + "epoch": 4.422847780756628, + "grad_norm": 3.3972389698028564, + "learning_rate": 4.7798173160747906e-05, + "loss": 0.1272, + "num_input_tokens_seen": 17225888, + "step": 29695 + }, + { + "epoch": 4.423592493297587, + "grad_norm": 18.51543426513672, + "learning_rate": 4.7796839559535955e-05, + "loss": 0.1863, + "num_input_tokens_seen": 17228576, + "step": 29700 + }, + { + "epoch": 4.424337205838547, + "grad_norm": 22.589218139648438, + "learning_rate": 4.779550557319457e-05, + "loss": 0.5613, + "num_input_tokens_seen": 17231488, + "step": 29705 + }, + { + "epoch": 4.425081918379505, + "grad_norm": 38.318050384521484, + "learning_rate": 4.7794171201746285e-05, + "loss": 0.4985, + "num_input_tokens_seen": 17234208, + "step": 29710 + }, + { + "epoch": 4.425826630920465, + "grad_norm": 16.542926788330078, + "learning_rate": 4.779283644521365e-05, + "loss": 0.3211, + "num_input_tokens_seen": 17236896, + "step": 29715 + }, + { + "epoch": 4.426571343461424, + "grad_norm": 21.077760696411133, + "learning_rate": 4.7791501303619205e-05, + "loss": 0.4486, + "num_input_tokens_seen": 17239712, + "step": 29720 + }, + { + "epoch": 4.427316056002383, + "grad_norm": 18.760780334472656, + "learning_rate": 4.7790165776985504e-05, + "loss": 0.7887, + "num_input_tokens_seen": 17242688, + "step": 29725 + }, + { + "epoch": 4.428060768543342, + "grad_norm": 0.37193188071250916, + "learning_rate": 4.7788829865335125e-05, + "loss": 0.3668, + "num_input_tokens_seen": 17245440, + "step": 29730 + }, + { + "epoch": 4.428805481084302, + "grad_norm": 0.019805334508419037, + "learning_rate": 4.778749356869062e-05, + "loss": 0.3368, + "num_input_tokens_seen": 17248256, + "step": 29735 + }, + { + "epoch": 4.429550193625261, + "grad_norm": 14.92375659942627, + "learning_rate": 4.778615688707457e-05, + "loss": 0.2408, + "num_input_tokens_seen": 17251168, + "step": 29740 + }, + { + "epoch": 4.43029490616622, + "grad_norm": 21.177824020385742, + "learning_rate": 4.778481982050956e-05, + "loss": 0.17, + "num_input_tokens_seen": 17253920, + "step": 29745 + }, + { + "epoch": 4.431039618707179, + "grad_norm": 17.01898956298828, + "learning_rate": 4.778348236901818e-05, + "loss": 0.2406, + "num_input_tokens_seen": 17256896, + "step": 29750 + }, + { + "epoch": 4.431784331248139, + "grad_norm": 8.997203826904297, + "learning_rate": 4.7782144532623016e-05, + "loss": 0.1642, + "num_input_tokens_seen": 17259808, + "step": 29755 + }, + { + "epoch": 4.432529043789097, + "grad_norm": 38.82923889160156, + "learning_rate": 4.7780806311346684e-05, + "loss": 0.3762, + "num_input_tokens_seen": 17262592, + "step": 29760 + }, + { + "epoch": 4.433273756330056, + "grad_norm": 11.63684368133545, + "learning_rate": 4.777946770521178e-05, + "loss": 0.2623, + "num_input_tokens_seen": 17265632, + "step": 29765 + }, + { + "epoch": 4.434018468871016, + "grad_norm": 42.62089157104492, + "learning_rate": 4.7778128714240915e-05, + "loss": 0.4094, + "num_input_tokens_seen": 17268672, + "step": 29770 + }, + { + "epoch": 4.434763181411975, + "grad_norm": 2.7892935276031494, + "learning_rate": 4.7776789338456717e-05, + "loss": 0.3984, + "num_input_tokens_seen": 17271744, + "step": 29775 + }, + { + "epoch": 4.435507893952934, + "grad_norm": 2.8559746742248535, + "learning_rate": 4.777544957788182e-05, + "loss": 0.1406, + "num_input_tokens_seen": 17274752, + "step": 29780 + }, + { + "epoch": 4.436252606493893, + "grad_norm": 0.24646811187267303, + "learning_rate": 4.7774109432538843e-05, + "loss": 0.1718, + "num_input_tokens_seen": 17277600, + "step": 29785 + }, + { + "epoch": 4.436997319034853, + "grad_norm": 18.50497055053711, + "learning_rate": 4.777276890245044e-05, + "loss": 0.5909, + "num_input_tokens_seen": 17280448, + "step": 29790 + }, + { + "epoch": 4.437742031575811, + "grad_norm": 2.3105008602142334, + "learning_rate": 4.7771427987639246e-05, + "loss": 0.1549, + "num_input_tokens_seen": 17283264, + "step": 29795 + }, + { + "epoch": 4.438486744116771, + "grad_norm": 14.987652778625488, + "learning_rate": 4.777008668812793e-05, + "loss": 0.0305, + "num_input_tokens_seen": 17286560, + "step": 29800 + }, + { + "epoch": 4.43923145665773, + "grad_norm": 10.689244270324707, + "learning_rate": 4.776874500393912e-05, + "loss": 0.41, + "num_input_tokens_seen": 17289376, + "step": 29805 + }, + { + "epoch": 4.439976169198689, + "grad_norm": 2.0981433391571045, + "learning_rate": 4.7767402935095525e-05, + "loss": 0.1545, + "num_input_tokens_seen": 17292224, + "step": 29810 + }, + { + "epoch": 4.440720881739648, + "grad_norm": 17.47929573059082, + "learning_rate": 4.776606048161979e-05, + "loss": 0.3014, + "num_input_tokens_seen": 17295520, + "step": 29815 + }, + { + "epoch": 4.441465594280608, + "grad_norm": 40.51980972290039, + "learning_rate": 4.77647176435346e-05, + "loss": 0.1535, + "num_input_tokens_seen": 17298464, + "step": 29820 + }, + { + "epoch": 4.442210306821567, + "grad_norm": 1.9189300537109375, + "learning_rate": 4.7763374420862645e-05, + "loss": 0.2602, + "num_input_tokens_seen": 17301120, + "step": 29825 + }, + { + "epoch": 4.442955019362526, + "grad_norm": 44.05251693725586, + "learning_rate": 4.7762030813626615e-05, + "loss": 0.1754, + "num_input_tokens_seen": 17303840, + "step": 29830 + }, + { + "epoch": 4.443699731903485, + "grad_norm": 25.334165573120117, + "learning_rate": 4.776068682184921e-05, + "loss": 0.3671, + "num_input_tokens_seen": 17306624, + "step": 29835 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 32.66969680786133, + "learning_rate": 4.7759342445553124e-05, + "loss": 0.5835, + "num_input_tokens_seen": 17309472, + "step": 29840 + }, + { + "epoch": 4.445189156985403, + "grad_norm": 0.06932416558265686, + "learning_rate": 4.775799768476109e-05, + "loss": 0.1733, + "num_input_tokens_seen": 17312384, + "step": 29845 + }, + { + "epoch": 4.445933869526363, + "grad_norm": 40.71976089477539, + "learning_rate": 4.775665253949581e-05, + "loss": 0.3948, + "num_input_tokens_seen": 17315392, + "step": 29850 + }, + { + "epoch": 4.446678582067322, + "grad_norm": 0.10536747425794601, + "learning_rate": 4.775530700978002e-05, + "loss": 0.3572, + "num_input_tokens_seen": 17318272, + "step": 29855 + }, + { + "epoch": 4.4474232946082815, + "grad_norm": 14.518484115600586, + "learning_rate": 4.775396109563644e-05, + "loss": 0.101, + "num_input_tokens_seen": 17321152, + "step": 29860 + }, + { + "epoch": 4.44816800714924, + "grad_norm": 21.944963455200195, + "learning_rate": 4.775261479708781e-05, + "loss": 0.6492, + "num_input_tokens_seen": 17324128, + "step": 29865 + }, + { + "epoch": 4.4489127196902, + "grad_norm": 0.9357914328575134, + "learning_rate": 4.775126811415689e-05, + "loss": 0.1539, + "num_input_tokens_seen": 17327008, + "step": 29870 + }, + { + "epoch": 4.449657432231159, + "grad_norm": 83.65035247802734, + "learning_rate": 4.7749921046866407e-05, + "loss": 0.5304, + "num_input_tokens_seen": 17329728, + "step": 29875 + }, + { + "epoch": 4.450402144772118, + "grad_norm": 17.93484115600586, + "learning_rate": 4.7748573595239134e-05, + "loss": 0.4318, + "num_input_tokens_seen": 17332480, + "step": 29880 + }, + { + "epoch": 4.451146857313077, + "grad_norm": 14.960918426513672, + "learning_rate": 4.7747225759297835e-05, + "loss": 0.4453, + "num_input_tokens_seen": 17335232, + "step": 29885 + }, + { + "epoch": 4.451891569854037, + "grad_norm": 10.651571273803711, + "learning_rate": 4.774587753906526e-05, + "loss": 0.1607, + "num_input_tokens_seen": 17338016, + "step": 29890 + }, + { + "epoch": 4.452636282394995, + "grad_norm": 32.78421401977539, + "learning_rate": 4.774452893456423e-05, + "loss": 0.4686, + "num_input_tokens_seen": 17340736, + "step": 29895 + }, + { + "epoch": 4.453380994935955, + "grad_norm": 2.44270658493042, + "learning_rate": 4.774317994581748e-05, + "loss": 0.1314, + "num_input_tokens_seen": 17343584, + "step": 29900 + }, + { + "epoch": 4.454125707476914, + "grad_norm": 37.09122848510742, + "learning_rate": 4.7741830572847826e-05, + "loss": 0.1367, + "num_input_tokens_seen": 17346432, + "step": 29905 + }, + { + "epoch": 4.4548704200178735, + "grad_norm": 20.662399291992188, + "learning_rate": 4.774048081567805e-05, + "loss": 0.3121, + "num_input_tokens_seen": 17349280, + "step": 29910 + }, + { + "epoch": 4.455615132558832, + "grad_norm": 5.7691426277160645, + "learning_rate": 4.7739130674330966e-05, + "loss": 0.133, + "num_input_tokens_seen": 17351936, + "step": 29915 + }, + { + "epoch": 4.456359845099792, + "grad_norm": 17.940509796142578, + "learning_rate": 4.773778014882939e-05, + "loss": 0.24, + "num_input_tokens_seen": 17354912, + "step": 29920 + }, + { + "epoch": 4.457104557640751, + "grad_norm": 5.256476879119873, + "learning_rate": 4.773642923919612e-05, + "loss": 0.3556, + "num_input_tokens_seen": 17357920, + "step": 29925 + }, + { + "epoch": 4.457849270181709, + "grad_norm": 4.378721237182617, + "learning_rate": 4.773507794545399e-05, + "loss": 0.1668, + "num_input_tokens_seen": 17360768, + "step": 29930 + }, + { + "epoch": 4.458593982722669, + "grad_norm": 16.963144302368164, + "learning_rate": 4.7733726267625824e-05, + "loss": 0.3905, + "num_input_tokens_seen": 17363456, + "step": 29935 + }, + { + "epoch": 4.459338695263629, + "grad_norm": 0.13414490222930908, + "learning_rate": 4.7732374205734456e-05, + "loss": 0.2516, + "num_input_tokens_seen": 17366464, + "step": 29940 + }, + { + "epoch": 4.4600834078045875, + "grad_norm": 0.84726482629776, + "learning_rate": 4.773102175980273e-05, + "loss": 0.1286, + "num_input_tokens_seen": 17369344, + "step": 29945 + }, + { + "epoch": 4.460828120345546, + "grad_norm": 19.058168411254883, + "learning_rate": 4.772966892985349e-05, + "loss": 0.6352, + "num_input_tokens_seen": 17372128, + "step": 29950 + }, + { + "epoch": 4.461572832886506, + "grad_norm": 9.304529190063477, + "learning_rate": 4.77283157159096e-05, + "loss": 0.2783, + "num_input_tokens_seen": 17375008, + "step": 29955 + }, + { + "epoch": 4.462317545427465, + "grad_norm": 18.962692260742188, + "learning_rate": 4.772696211799392e-05, + "loss": 0.1618, + "num_input_tokens_seen": 17377952, + "step": 29960 + }, + { + "epoch": 4.463062257968424, + "grad_norm": 17.93213653564453, + "learning_rate": 4.7725608136129305e-05, + "loss": 0.406, + "num_input_tokens_seen": 17380640, + "step": 29965 + }, + { + "epoch": 4.463806970509383, + "grad_norm": 14.747440338134766, + "learning_rate": 4.7724253770338645e-05, + "loss": 0.4122, + "num_input_tokens_seen": 17383840, + "step": 29970 + }, + { + "epoch": 4.464551683050343, + "grad_norm": 1.723600149154663, + "learning_rate": 4.772289902064481e-05, + "loss": 0.1977, + "num_input_tokens_seen": 17386592, + "step": 29975 + }, + { + "epoch": 4.465296395591301, + "grad_norm": 37.37350082397461, + "learning_rate": 4.772154388707069e-05, + "loss": 0.6003, + "num_input_tokens_seen": 17389536, + "step": 29980 + }, + { + "epoch": 4.466041108132261, + "grad_norm": 6.795199394226074, + "learning_rate": 4.7720188369639186e-05, + "loss": 0.2335, + "num_input_tokens_seen": 17392224, + "step": 29985 + }, + { + "epoch": 4.46678582067322, + "grad_norm": 0.6464906930923462, + "learning_rate": 4.771883246837318e-05, + "loss": 0.1958, + "num_input_tokens_seen": 17395168, + "step": 29990 + }, + { + "epoch": 4.4675305332141795, + "grad_norm": 48.132850646972656, + "learning_rate": 4.77174761832956e-05, + "loss": 0.287, + "num_input_tokens_seen": 17397984, + "step": 29995 + }, + { + "epoch": 4.468275245755138, + "grad_norm": 0.18319293856620789, + "learning_rate": 4.771611951442935e-05, + "loss": 0.0869, + "num_input_tokens_seen": 17400928, + "step": 30000 + }, + { + "epoch": 4.469019958296098, + "grad_norm": 6.254396438598633, + "learning_rate": 4.771476246179734e-05, + "loss": 0.2763, + "num_input_tokens_seen": 17403424, + "step": 30005 + }, + { + "epoch": 4.469764670837057, + "grad_norm": 15.072479248046875, + "learning_rate": 4.7713405025422505e-05, + "loss": 0.1259, + "num_input_tokens_seen": 17406432, + "step": 30010 + }, + { + "epoch": 4.470509383378016, + "grad_norm": 7.026984691619873, + "learning_rate": 4.771204720532778e-05, + "loss": 0.2928, + "num_input_tokens_seen": 17409152, + "step": 30015 + }, + { + "epoch": 4.471254095918975, + "grad_norm": 3.1971511840820312, + "learning_rate": 4.7710689001536105e-05, + "loss": 0.3296, + "num_input_tokens_seen": 17411872, + "step": 30020 + }, + { + "epoch": 4.471998808459935, + "grad_norm": 16.228816986083984, + "learning_rate": 4.7709330414070406e-05, + "loss": 0.3578, + "num_input_tokens_seen": 17414752, + "step": 30025 + }, + { + "epoch": 4.4727435210008935, + "grad_norm": 17.39839744567871, + "learning_rate": 4.770797144295366e-05, + "loss": 0.3853, + "num_input_tokens_seen": 17417472, + "step": 30030 + }, + { + "epoch": 4.473488233541853, + "grad_norm": 5.335371494293213, + "learning_rate": 4.7706612088208826e-05, + "loss": 0.2405, + "num_input_tokens_seen": 17420320, + "step": 30035 + }, + { + "epoch": 4.474232946082812, + "grad_norm": 39.6072883605957, + "learning_rate": 4.770525234985884e-05, + "loss": 0.5178, + "num_input_tokens_seen": 17423072, + "step": 30040 + }, + { + "epoch": 4.4749776586237715, + "grad_norm": 46.93097686767578, + "learning_rate": 4.770389222792671e-05, + "loss": 0.3197, + "num_input_tokens_seen": 17425856, + "step": 30045 + }, + { + "epoch": 4.47572237116473, + "grad_norm": 9.431509017944336, + "learning_rate": 4.770253172243538e-05, + "loss": 0.1638, + "num_input_tokens_seen": 17428704, + "step": 30050 + }, + { + "epoch": 4.47646708370569, + "grad_norm": 35.58946990966797, + "learning_rate": 4.770117083340786e-05, + "loss": 0.4611, + "num_input_tokens_seen": 17431712, + "step": 30055 + }, + { + "epoch": 4.477211796246649, + "grad_norm": 4.597087860107422, + "learning_rate": 4.769980956086714e-05, + "loss": 0.1685, + "num_input_tokens_seen": 17434720, + "step": 30060 + }, + { + "epoch": 4.477956508787608, + "grad_norm": 1.8229202032089233, + "learning_rate": 4.769844790483619e-05, + "loss": 0.1848, + "num_input_tokens_seen": 17437280, + "step": 30065 + }, + { + "epoch": 4.478701221328567, + "grad_norm": 27.84120750427246, + "learning_rate": 4.769708586533804e-05, + "loss": 0.5795, + "num_input_tokens_seen": 17440224, + "step": 30070 + }, + { + "epoch": 4.479445933869527, + "grad_norm": 0.41116926074028015, + "learning_rate": 4.7695723442395694e-05, + "loss": 0.3565, + "num_input_tokens_seen": 17443040, + "step": 30075 + }, + { + "epoch": 4.4801906464104855, + "grad_norm": 0.12520533800125122, + "learning_rate": 4.769436063603217e-05, + "loss": 0.4763, + "num_input_tokens_seen": 17446016, + "step": 30080 + }, + { + "epoch": 4.480935358951445, + "grad_norm": 31.492368698120117, + "learning_rate": 4.769299744627048e-05, + "loss": 0.3432, + "num_input_tokens_seen": 17448768, + "step": 30085 + }, + { + "epoch": 4.481680071492404, + "grad_norm": 3.115633249282837, + "learning_rate": 4.769163387313367e-05, + "loss": 0.2327, + "num_input_tokens_seen": 17451552, + "step": 30090 + }, + { + "epoch": 4.4824247840333635, + "grad_norm": 54.24796676635742, + "learning_rate": 4.7690269916644766e-05, + "loss": 0.263, + "num_input_tokens_seen": 17454368, + "step": 30095 + }, + { + "epoch": 4.483169496574322, + "grad_norm": 18.520357131958008, + "learning_rate": 4.768890557682681e-05, + "loss": 0.3943, + "num_input_tokens_seen": 17457504, + "step": 30100 + }, + { + "epoch": 4.483914209115282, + "grad_norm": 15.835697174072266, + "learning_rate": 4.768754085370286e-05, + "loss": 0.7908, + "num_input_tokens_seen": 17460384, + "step": 30105 + }, + { + "epoch": 4.484658921656241, + "grad_norm": 0.5341591835021973, + "learning_rate": 4.768617574729596e-05, + "loss": 0.1576, + "num_input_tokens_seen": 17462976, + "step": 30110 + }, + { + "epoch": 4.4854036341971995, + "grad_norm": 11.110146522521973, + "learning_rate": 4.768481025762918e-05, + "loss": 0.3817, + "num_input_tokens_seen": 17465856, + "step": 30115 + }, + { + "epoch": 4.486148346738159, + "grad_norm": 9.692222595214844, + "learning_rate": 4.768344438472559e-05, + "loss": 0.1513, + "num_input_tokens_seen": 17468512, + "step": 30120 + }, + { + "epoch": 4.486893059279118, + "grad_norm": 35.736854553222656, + "learning_rate": 4.768207812860826e-05, + "loss": 0.2752, + "num_input_tokens_seen": 17471424, + "step": 30125 + }, + { + "epoch": 4.4876377718200775, + "grad_norm": 26.52302360534668, + "learning_rate": 4.768071148930027e-05, + "loss": 0.3538, + "num_input_tokens_seen": 17474336, + "step": 30130 + }, + { + "epoch": 4.488382484361036, + "grad_norm": 13.987210273742676, + "learning_rate": 4.7679344466824716e-05, + "loss": 0.4889, + "num_input_tokens_seen": 17477376, + "step": 30135 + }, + { + "epoch": 4.489127196901996, + "grad_norm": 37.97938537597656, + "learning_rate": 4.767797706120468e-05, + "loss": 0.5099, + "num_input_tokens_seen": 17480416, + "step": 30140 + }, + { + "epoch": 4.489871909442955, + "grad_norm": 18.8743839263916, + "learning_rate": 4.767660927246328e-05, + "loss": 0.2148, + "num_input_tokens_seen": 17483424, + "step": 30145 + }, + { + "epoch": 4.490616621983914, + "grad_norm": 16.176326751708984, + "learning_rate": 4.7675241100623604e-05, + "loss": 0.6626, + "num_input_tokens_seen": 17486496, + "step": 30150 + }, + { + "epoch": 4.491361334524873, + "grad_norm": 15.34019947052002, + "learning_rate": 4.7673872545708784e-05, + "loss": 0.3069, + "num_input_tokens_seen": 17489376, + "step": 30155 + }, + { + "epoch": 4.492106047065833, + "grad_norm": 7.5181355476379395, + "learning_rate": 4.767250360774193e-05, + "loss": 0.1632, + "num_input_tokens_seen": 17492480, + "step": 30160 + }, + { + "epoch": 4.4928507596067915, + "grad_norm": 1.3137234449386597, + "learning_rate": 4.767113428674616e-05, + "loss": 0.3623, + "num_input_tokens_seen": 17495328, + "step": 30165 + }, + { + "epoch": 4.493595472147751, + "grad_norm": 11.226158142089844, + "learning_rate": 4.766976458274464e-05, + "loss": 0.4753, + "num_input_tokens_seen": 17498208, + "step": 30170 + }, + { + "epoch": 4.49434018468871, + "grad_norm": 23.379552841186523, + "learning_rate": 4.766839449576047e-05, + "loss": 0.1898, + "num_input_tokens_seen": 17501312, + "step": 30175 + }, + { + "epoch": 4.4950848972296695, + "grad_norm": 17.759471893310547, + "learning_rate": 4.766702402581682e-05, + "loss": 0.5291, + "num_input_tokens_seen": 17504384, + "step": 30180 + }, + { + "epoch": 4.495829609770628, + "grad_norm": 4.587245941162109, + "learning_rate": 4.766565317293683e-05, + "loss": 0.3472, + "num_input_tokens_seen": 17507136, + "step": 30185 + }, + { + "epoch": 4.496574322311588, + "grad_norm": 1.9264631271362305, + "learning_rate": 4.766428193714367e-05, + "loss": 0.134, + "num_input_tokens_seen": 17510208, + "step": 30190 + }, + { + "epoch": 4.497319034852547, + "grad_norm": 24.197378158569336, + "learning_rate": 4.766291031846051e-05, + "loss": 0.2702, + "num_input_tokens_seen": 17512960, + "step": 30195 + }, + { + "epoch": 4.498063747393506, + "grad_norm": 13.511609077453613, + "learning_rate": 4.76615383169105e-05, + "loss": 0.2803, + "num_input_tokens_seen": 17515840, + "step": 30200 + }, + { + "epoch": 4.498808459934465, + "grad_norm": 72.81790924072266, + "learning_rate": 4.766016593251684e-05, + "loss": 0.4934, + "num_input_tokens_seen": 17518592, + "step": 30205 + }, + { + "epoch": 4.499553172475425, + "grad_norm": 0.5601631999015808, + "learning_rate": 4.765879316530272e-05, + "loss": 0.0813, + "num_input_tokens_seen": 17521824, + "step": 30210 + }, + { + "epoch": 4.5002978850163835, + "grad_norm": 2.9041285514831543, + "learning_rate": 4.76574200152913e-05, + "loss": 0.4271, + "num_input_tokens_seen": 17524832, + "step": 30215 + }, + { + "epoch": 4.501042597557343, + "grad_norm": 14.554133415222168, + "learning_rate": 4.76560464825058e-05, + "loss": 0.4374, + "num_input_tokens_seen": 17527968, + "step": 30220 + }, + { + "epoch": 4.501787310098302, + "grad_norm": 13.642412185668945, + "learning_rate": 4.7654672566969424e-05, + "loss": 0.4592, + "num_input_tokens_seen": 17531296, + "step": 30225 + }, + { + "epoch": 4.5025320226392616, + "grad_norm": 8.432531356811523, + "learning_rate": 4.765329826870538e-05, + "loss": 0.2421, + "num_input_tokens_seen": 17534336, + "step": 30230 + }, + { + "epoch": 4.50327673518022, + "grad_norm": 22.822975158691406, + "learning_rate": 4.765192358773689e-05, + "loss": 0.4209, + "num_input_tokens_seen": 17538464, + "step": 30235 + }, + { + "epoch": 4.50402144772118, + "grad_norm": 10.603482246398926, + "learning_rate": 4.765054852408717e-05, + "loss": 0.1649, + "num_input_tokens_seen": 17541568, + "step": 30240 + }, + { + "epoch": 4.504766160262139, + "grad_norm": 6.865964889526367, + "learning_rate": 4.7649173077779455e-05, + "loss": 0.4769, + "num_input_tokens_seen": 17544352, + "step": 30245 + }, + { + "epoch": 4.505510872803098, + "grad_norm": 15.537236213684082, + "learning_rate": 4.7647797248836975e-05, + "loss": 0.2272, + "num_input_tokens_seen": 17547456, + "step": 30250 + }, + { + "epoch": 4.506255585344057, + "grad_norm": 13.490080833435059, + "learning_rate": 4.7646421037282984e-05, + "loss": 0.2192, + "num_input_tokens_seen": 17550496, + "step": 30255 + }, + { + "epoch": 4.507000297885017, + "grad_norm": 11.986013412475586, + "learning_rate": 4.764504444314072e-05, + "loss": 0.5311, + "num_input_tokens_seen": 17553504, + "step": 30260 + }, + { + "epoch": 4.5077450104259755, + "grad_norm": 10.908186912536621, + "learning_rate": 4.7643667466433453e-05, + "loss": 0.5077, + "num_input_tokens_seen": 17556672, + "step": 30265 + }, + { + "epoch": 4.508489722966935, + "grad_norm": 8.698946952819824, + "learning_rate": 4.7642290107184426e-05, + "loss": 0.2313, + "num_input_tokens_seen": 17559456, + "step": 30270 + }, + { + "epoch": 4.509234435507894, + "grad_norm": 1.3441470861434937, + "learning_rate": 4.764091236541693e-05, + "loss": 0.147, + "num_input_tokens_seen": 17562432, + "step": 30275 + }, + { + "epoch": 4.509979148048853, + "grad_norm": 7.3125104904174805, + "learning_rate": 4.763953424115424e-05, + "loss": 0.2986, + "num_input_tokens_seen": 17565248, + "step": 30280 + }, + { + "epoch": 4.510723860589812, + "grad_norm": 24.35096549987793, + "learning_rate": 4.7638155734419616e-05, + "loss": 0.4491, + "num_input_tokens_seen": 17568032, + "step": 30285 + }, + { + "epoch": 4.511468573130772, + "grad_norm": 31.644573211669922, + "learning_rate": 4.763677684523636e-05, + "loss": 0.5099, + "num_input_tokens_seen": 17570912, + "step": 30290 + }, + { + "epoch": 4.512213285671731, + "grad_norm": 5.298210620880127, + "learning_rate": 4.7635397573627774e-05, + "loss": 0.2667, + "num_input_tokens_seen": 17573728, + "step": 30295 + }, + { + "epoch": 4.5129579982126895, + "grad_norm": 0.14663544297218323, + "learning_rate": 4.7634017919617143e-05, + "loss": 0.2674, + "num_input_tokens_seen": 17576800, + "step": 30300 + }, + { + "epoch": 4.513702710753649, + "grad_norm": 3.105207681655884, + "learning_rate": 4.763263788322778e-05, + "loss": 0.229, + "num_input_tokens_seen": 17580000, + "step": 30305 + }, + { + "epoch": 4.514447423294608, + "grad_norm": 17.560543060302734, + "learning_rate": 4.7631257464483014e-05, + "loss": 0.3568, + "num_input_tokens_seen": 17583104, + "step": 30310 + }, + { + "epoch": 4.5151921358355676, + "grad_norm": 18.29657745361328, + "learning_rate": 4.762987666340615e-05, + "loss": 0.2714, + "num_input_tokens_seen": 17586080, + "step": 30315 + }, + { + "epoch": 4.515936848376526, + "grad_norm": 19.6517333984375, + "learning_rate": 4.7628495480020516e-05, + "loss": 0.3081, + "num_input_tokens_seen": 17588992, + "step": 30320 + }, + { + "epoch": 4.516681560917486, + "grad_norm": 5.921510219573975, + "learning_rate": 4.762711391434945e-05, + "loss": 0.5158, + "num_input_tokens_seen": 17591936, + "step": 30325 + }, + { + "epoch": 4.517426273458445, + "grad_norm": 19.587350845336914, + "learning_rate": 4.76257319664163e-05, + "loss": 0.3336, + "num_input_tokens_seen": 17594496, + "step": 30330 + }, + { + "epoch": 4.518170985999404, + "grad_norm": 8.040468215942383, + "learning_rate": 4.76243496362444e-05, + "loss": 0.3397, + "num_input_tokens_seen": 17597216, + "step": 30335 + }, + { + "epoch": 4.518915698540363, + "grad_norm": 7.009576320648193, + "learning_rate": 4.76229669238571e-05, + "loss": 0.2745, + "num_input_tokens_seen": 17599840, + "step": 30340 + }, + { + "epoch": 4.519660411081323, + "grad_norm": 7.657815456390381, + "learning_rate": 4.762158382927777e-05, + "loss": 0.2541, + "num_input_tokens_seen": 17602464, + "step": 30345 + }, + { + "epoch": 4.5204051236222815, + "grad_norm": 6.722915172576904, + "learning_rate": 4.762020035252978e-05, + "loss": 0.432, + "num_input_tokens_seen": 17605408, + "step": 30350 + }, + { + "epoch": 4.521149836163241, + "grad_norm": 8.736778259277344, + "learning_rate": 4.761881649363649e-05, + "loss": 0.2619, + "num_input_tokens_seen": 17608288, + "step": 30355 + }, + { + "epoch": 4.5218945487042, + "grad_norm": 19.282663345336914, + "learning_rate": 4.7617432252621285e-05, + "loss": 0.4148, + "num_input_tokens_seen": 17611232, + "step": 30360 + }, + { + "epoch": 4.52263926124516, + "grad_norm": 18.08877182006836, + "learning_rate": 4.7616047629507556e-05, + "loss": 0.1852, + "num_input_tokens_seen": 17614080, + "step": 30365 + }, + { + "epoch": 4.523383973786118, + "grad_norm": 24.532203674316406, + "learning_rate": 4.761466262431867e-05, + "loss": 0.3212, + "num_input_tokens_seen": 17616896, + "step": 30370 + }, + { + "epoch": 4.524128686327078, + "grad_norm": 14.728608131408691, + "learning_rate": 4.7613277237078055e-05, + "loss": 0.287, + "num_input_tokens_seen": 17619840, + "step": 30375 + }, + { + "epoch": 4.524873398868037, + "grad_norm": 3.3835318088531494, + "learning_rate": 4.761189146780911e-05, + "loss": 0.214, + "num_input_tokens_seen": 17622496, + "step": 30380 + }, + { + "epoch": 4.525618111408996, + "grad_norm": 3.04913067817688, + "learning_rate": 4.761050531653524e-05, + "loss": 0.0892, + "num_input_tokens_seen": 17625632, + "step": 30385 + }, + { + "epoch": 4.526362823949955, + "grad_norm": 19.94980812072754, + "learning_rate": 4.760911878327985e-05, + "loss": 0.3866, + "num_input_tokens_seen": 17628800, + "step": 30390 + }, + { + "epoch": 4.527107536490915, + "grad_norm": 11.682767868041992, + "learning_rate": 4.760773186806639e-05, + "loss": 0.3657, + "num_input_tokens_seen": 17631872, + "step": 30395 + }, + { + "epoch": 4.5278522490318736, + "grad_norm": 0.41346675157546997, + "learning_rate": 4.7606344570918264e-05, + "loss": 0.1251, + "num_input_tokens_seen": 17634656, + "step": 30400 + }, + { + "epoch": 4.528596961572833, + "grad_norm": 16.579566955566406, + "learning_rate": 4.760495689185893e-05, + "loss": 0.2529, + "num_input_tokens_seen": 17637632, + "step": 30405 + }, + { + "epoch": 4.529341674113792, + "grad_norm": 12.933300971984863, + "learning_rate": 4.760356883091183e-05, + "loss": 0.3978, + "num_input_tokens_seen": 17640384, + "step": 30410 + }, + { + "epoch": 4.530086386654752, + "grad_norm": 8.065406799316406, + "learning_rate": 4.7602180388100395e-05, + "loss": 0.426, + "num_input_tokens_seen": 17643200, + "step": 30415 + }, + { + "epoch": 4.53083109919571, + "grad_norm": 16.162334442138672, + "learning_rate": 4.760079156344811e-05, + "loss": 0.3096, + "num_input_tokens_seen": 17646144, + "step": 30420 + }, + { + "epoch": 4.53157581173667, + "grad_norm": 0.10779467970132828, + "learning_rate": 4.7599402356978406e-05, + "loss": 0.162, + "num_input_tokens_seen": 17648992, + "step": 30425 + }, + { + "epoch": 4.532320524277629, + "grad_norm": 16.776813507080078, + "learning_rate": 4.759801276871478e-05, + "loss": 0.2987, + "num_input_tokens_seen": 17651968, + "step": 30430 + }, + { + "epoch": 4.533065236818588, + "grad_norm": 16.409618377685547, + "learning_rate": 4.759662279868069e-05, + "loss": 0.2788, + "num_input_tokens_seen": 17655136, + "step": 30435 + }, + { + "epoch": 4.533809949359547, + "grad_norm": 32.27018737792969, + "learning_rate": 4.759523244689963e-05, + "loss": 0.5699, + "num_input_tokens_seen": 17658112, + "step": 30440 + }, + { + "epoch": 4.534554661900506, + "grad_norm": 12.439801216125488, + "learning_rate": 4.759384171339507e-05, + "loss": 0.273, + "num_input_tokens_seen": 17661120, + "step": 30445 + }, + { + "epoch": 4.535299374441466, + "grad_norm": 13.647223472595215, + "learning_rate": 4.759245059819053e-05, + "loss": 0.3124, + "num_input_tokens_seen": 17663712, + "step": 30450 + }, + { + "epoch": 4.536044086982425, + "grad_norm": 6.554420471191406, + "learning_rate": 4.759105910130949e-05, + "loss": 0.2478, + "num_input_tokens_seen": 17666656, + "step": 30455 + }, + { + "epoch": 4.536788799523384, + "grad_norm": 17.184520721435547, + "learning_rate": 4.758966722277547e-05, + "loss": 0.3538, + "num_input_tokens_seen": 17669568, + "step": 30460 + }, + { + "epoch": 4.537533512064343, + "grad_norm": 25.436433792114258, + "learning_rate": 4.758827496261199e-05, + "loss": 0.656, + "num_input_tokens_seen": 17672672, + "step": 30465 + }, + { + "epoch": 4.538278224605302, + "grad_norm": 3.5827243328094482, + "learning_rate": 4.758688232084255e-05, + "loss": 0.5169, + "num_input_tokens_seen": 17676320, + "step": 30470 + }, + { + "epoch": 4.539022937146262, + "grad_norm": 3.059328556060791, + "learning_rate": 4.7585489297490694e-05, + "loss": 0.4189, + "num_input_tokens_seen": 17679520, + "step": 30475 + }, + { + "epoch": 4.539767649687221, + "grad_norm": 22.338911056518555, + "learning_rate": 4.758409589257995e-05, + "loss": 0.3233, + "num_input_tokens_seen": 17682240, + "step": 30480 + }, + { + "epoch": 4.5405123622281796, + "grad_norm": 3.622206211090088, + "learning_rate": 4.758270210613387e-05, + "loss": 0.3641, + "num_input_tokens_seen": 17685536, + "step": 30485 + }, + { + "epoch": 4.541257074769139, + "grad_norm": 62.02421951293945, + "learning_rate": 4.758130793817598e-05, + "loss": 0.3141, + "num_input_tokens_seen": 17688416, + "step": 30490 + }, + { + "epoch": 4.542001787310098, + "grad_norm": 3.7042009830474854, + "learning_rate": 4.7579913388729844e-05, + "loss": 0.2917, + "num_input_tokens_seen": 17691328, + "step": 30495 + }, + { + "epoch": 4.542746499851058, + "grad_norm": 7.442888259887695, + "learning_rate": 4.757851845781902e-05, + "loss": 0.5163, + "num_input_tokens_seen": 17693920, + "step": 30500 + }, + { + "epoch": 4.543491212392016, + "grad_norm": 1.5457229614257812, + "learning_rate": 4.757712314546707e-05, + "loss": 0.1552, + "num_input_tokens_seen": 17696960, + "step": 30505 + }, + { + "epoch": 4.544235924932976, + "grad_norm": 21.568466186523438, + "learning_rate": 4.7575727451697585e-05, + "loss": 0.4216, + "num_input_tokens_seen": 17699840, + "step": 30510 + }, + { + "epoch": 4.544980637473935, + "grad_norm": 7.249448299407959, + "learning_rate": 4.757433137653411e-05, + "loss": 0.31, + "num_input_tokens_seen": 17702720, + "step": 30515 + }, + { + "epoch": 4.545725350014894, + "grad_norm": 5.6372857093811035, + "learning_rate": 4.757293492000027e-05, + "loss": 0.2669, + "num_input_tokens_seen": 17705888, + "step": 30520 + }, + { + "epoch": 4.546470062555853, + "grad_norm": 14.662376403808594, + "learning_rate": 4.757153808211962e-05, + "loss": 0.3937, + "num_input_tokens_seen": 17708672, + "step": 30525 + }, + { + "epoch": 4.547214775096813, + "grad_norm": 11.800264358520508, + "learning_rate": 4.757014086291579e-05, + "loss": 0.3977, + "num_input_tokens_seen": 17711776, + "step": 30530 + }, + { + "epoch": 4.547959487637772, + "grad_norm": 3.5133190155029297, + "learning_rate": 4.7568743262412354e-05, + "loss": 0.4723, + "num_input_tokens_seen": 17714688, + "step": 30535 + }, + { + "epoch": 4.548704200178731, + "grad_norm": 11.513691902160645, + "learning_rate": 4.756734528063295e-05, + "loss": 0.3483, + "num_input_tokens_seen": 17717504, + "step": 30540 + }, + { + "epoch": 4.54944891271969, + "grad_norm": 12.918113708496094, + "learning_rate": 4.756594691760118e-05, + "loss": 0.2952, + "num_input_tokens_seen": 17720224, + "step": 30545 + }, + { + "epoch": 4.55019362526065, + "grad_norm": 4.9646100997924805, + "learning_rate": 4.7564548173340664e-05, + "loss": 0.2158, + "num_input_tokens_seen": 17723168, + "step": 30550 + }, + { + "epoch": 4.550938337801608, + "grad_norm": 3.27107834815979, + "learning_rate": 4.7563149047875054e-05, + "loss": 0.3521, + "num_input_tokens_seen": 17725792, + "step": 30555 + }, + { + "epoch": 4.551683050342568, + "grad_norm": 17.99198341369629, + "learning_rate": 4.756174954122796e-05, + "loss": 0.3654, + "num_input_tokens_seen": 17728832, + "step": 30560 + }, + { + "epoch": 4.552427762883527, + "grad_norm": 2.2309083938598633, + "learning_rate": 4.7560349653423055e-05, + "loss": 0.4242, + "num_input_tokens_seen": 17731584, + "step": 30565 + }, + { + "epoch": 4.553172475424486, + "grad_norm": 7.848949909210205, + "learning_rate": 4.755894938448395e-05, + "loss": 0.3348, + "num_input_tokens_seen": 17734592, + "step": 30570 + }, + { + "epoch": 4.553917187965445, + "grad_norm": 15.377388000488281, + "learning_rate": 4.755754873443434e-05, + "loss": 0.3519, + "num_input_tokens_seen": 17737600, + "step": 30575 + }, + { + "epoch": 4.554661900506405, + "grad_norm": 22.6447811126709, + "learning_rate": 4.7556147703297865e-05, + "loss": 0.337, + "num_input_tokens_seen": 17740384, + "step": 30580 + }, + { + "epoch": 4.555406613047364, + "grad_norm": 20.567346572875977, + "learning_rate": 4.75547462910982e-05, + "loss": 0.2946, + "num_input_tokens_seen": 17743296, + "step": 30585 + }, + { + "epoch": 4.556151325588323, + "grad_norm": 10.559515953063965, + "learning_rate": 4.755334449785902e-05, + "loss": 0.2685, + "num_input_tokens_seen": 17746048, + "step": 30590 + }, + { + "epoch": 4.556896038129282, + "grad_norm": 18.355268478393555, + "learning_rate": 4.755194232360401e-05, + "loss": 0.1758, + "num_input_tokens_seen": 17749184, + "step": 30595 + }, + { + "epoch": 4.557640750670242, + "grad_norm": 11.998499870300293, + "learning_rate": 4.755053976835685e-05, + "loss": 0.1798, + "num_input_tokens_seen": 17752256, + "step": 30600 + }, + { + "epoch": 4.5583854632112, + "grad_norm": 3.77194881439209, + "learning_rate": 4.754913683214124e-05, + "loss": 0.2473, + "num_input_tokens_seen": 17755136, + "step": 30605 + }, + { + "epoch": 4.559130175752159, + "grad_norm": 33.26763153076172, + "learning_rate": 4.754773351498088e-05, + "loss": 0.4606, + "num_input_tokens_seen": 17758048, + "step": 30610 + }, + { + "epoch": 4.559874888293119, + "grad_norm": 17.935035705566406, + "learning_rate": 4.754632981689949e-05, + "loss": 0.4648, + "num_input_tokens_seen": 17760832, + "step": 30615 + }, + { + "epoch": 4.5606196008340785, + "grad_norm": 16.4871883392334, + "learning_rate": 4.7544925737920766e-05, + "loss": 0.263, + "num_input_tokens_seen": 17763648, + "step": 30620 + }, + { + "epoch": 4.561364313375037, + "grad_norm": 0.7040030360221863, + "learning_rate": 4.754352127806843e-05, + "loss": 0.1505, + "num_input_tokens_seen": 17767008, + "step": 30625 + }, + { + "epoch": 4.562109025915996, + "grad_norm": 4.495717525482178, + "learning_rate": 4.754211643736622e-05, + "loss": 0.1576, + "num_input_tokens_seen": 17769728, + "step": 30630 + }, + { + "epoch": 4.562853738456956, + "grad_norm": 41.43510055541992, + "learning_rate": 4.7540711215837866e-05, + "loss": 0.3781, + "num_input_tokens_seen": 17772736, + "step": 30635 + }, + { + "epoch": 4.563598450997915, + "grad_norm": 17.796228408813477, + "learning_rate": 4.7539305613507096e-05, + "loss": 0.4136, + "num_input_tokens_seen": 17775680, + "step": 30640 + }, + { + "epoch": 4.564343163538874, + "grad_norm": 0.31975528597831726, + "learning_rate": 4.753789963039767e-05, + "loss": 0.2269, + "num_input_tokens_seen": 17778784, + "step": 30645 + }, + { + "epoch": 4.565087876079833, + "grad_norm": 8.94723129272461, + "learning_rate": 4.753649326653334e-05, + "loss": 0.4916, + "num_input_tokens_seen": 17781632, + "step": 30650 + }, + { + "epoch": 4.565832588620792, + "grad_norm": 3.6586380004882812, + "learning_rate": 4.753508652193785e-05, + "loss": 0.1019, + "num_input_tokens_seen": 17784576, + "step": 30655 + }, + { + "epoch": 4.566577301161751, + "grad_norm": 31.706838607788086, + "learning_rate": 4.7533679396634986e-05, + "loss": 0.5691, + "num_input_tokens_seen": 17787392, + "step": 30660 + }, + { + "epoch": 4.567322013702711, + "grad_norm": 5.210697174072266, + "learning_rate": 4.7532271890648516e-05, + "loss": 0.5601, + "num_input_tokens_seen": 17790208, + "step": 30665 + }, + { + "epoch": 4.56806672624367, + "grad_norm": 33.42879104614258, + "learning_rate": 4.753086400400221e-05, + "loss": 0.5004, + "num_input_tokens_seen": 17793088, + "step": 30670 + }, + { + "epoch": 4.568811438784629, + "grad_norm": 2.4178719520568848, + "learning_rate": 4.752945573671985e-05, + "loss": 0.2636, + "num_input_tokens_seen": 17795872, + "step": 30675 + }, + { + "epoch": 4.569556151325588, + "grad_norm": 3.1297237873077393, + "learning_rate": 4.752804708882523e-05, + "loss": 0.3003, + "num_input_tokens_seen": 17798912, + "step": 30680 + }, + { + "epoch": 4.570300863866548, + "grad_norm": 21.825124740600586, + "learning_rate": 4.7526638060342164e-05, + "loss": 0.307, + "num_input_tokens_seen": 17801696, + "step": 30685 + }, + { + "epoch": 4.571045576407506, + "grad_norm": 12.46279525756836, + "learning_rate": 4.752522865129444e-05, + "loss": 0.2042, + "num_input_tokens_seen": 17804480, + "step": 30690 + }, + { + "epoch": 4.571790288948466, + "grad_norm": 9.782309532165527, + "learning_rate": 4.7523818861705865e-05, + "loss": 0.2426, + "num_input_tokens_seen": 17807616, + "step": 30695 + }, + { + "epoch": 4.572535001489425, + "grad_norm": 27.001346588134766, + "learning_rate": 4.752240869160026e-05, + "loss": 0.264, + "num_input_tokens_seen": 17810368, + "step": 30700 + }, + { + "epoch": 4.5732797140303845, + "grad_norm": 7.035051345825195, + "learning_rate": 4.752099814100146e-05, + "loss": 0.235, + "num_input_tokens_seen": 17813568, + "step": 30705 + }, + { + "epoch": 4.574024426571343, + "grad_norm": 11.843191146850586, + "learning_rate": 4.751958720993328e-05, + "loss": 0.382, + "num_input_tokens_seen": 17816288, + "step": 30710 + }, + { + "epoch": 4.574769139112303, + "grad_norm": 12.553790092468262, + "learning_rate": 4.751817589841957e-05, + "loss": 0.1985, + "num_input_tokens_seen": 17819296, + "step": 30715 + }, + { + "epoch": 4.575513851653262, + "grad_norm": 12.91866397857666, + "learning_rate": 4.7516764206484156e-05, + "loss": 0.4267, + "num_input_tokens_seen": 17822016, + "step": 30720 + }, + { + "epoch": 4.576258564194221, + "grad_norm": 20.524587631225586, + "learning_rate": 4.75153521341509e-05, + "loss": 0.4853, + "num_input_tokens_seen": 17824864, + "step": 30725 + }, + { + "epoch": 4.57700327673518, + "grad_norm": 5.201016902923584, + "learning_rate": 4.751393968144365e-05, + "loss": 0.1463, + "num_input_tokens_seen": 17827744, + "step": 30730 + }, + { + "epoch": 4.57774798927614, + "grad_norm": 16.910907745361328, + "learning_rate": 4.7512526848386276e-05, + "loss": 0.5193, + "num_input_tokens_seen": 17830528, + "step": 30735 + }, + { + "epoch": 4.578492701817098, + "grad_norm": 5.111041069030762, + "learning_rate": 4.751111363500263e-05, + "loss": 0.2724, + "num_input_tokens_seen": 17833504, + "step": 30740 + }, + { + "epoch": 4.579237414358058, + "grad_norm": 0.04689129814505577, + "learning_rate": 4.750970004131662e-05, + "loss": 0.5566, + "num_input_tokens_seen": 17836640, + "step": 30745 + }, + { + "epoch": 4.579982126899017, + "grad_norm": 63.12054443359375, + "learning_rate": 4.7508286067352085e-05, + "loss": 0.7755, + "num_input_tokens_seen": 17839520, + "step": 30750 + }, + { + "epoch": 4.5807268394399765, + "grad_norm": 9.762655258178711, + "learning_rate": 4.750687171313294e-05, + "loss": 0.3652, + "num_input_tokens_seen": 17842336, + "step": 30755 + }, + { + "epoch": 4.581471551980935, + "grad_norm": 2.0959584712982178, + "learning_rate": 4.750545697868307e-05, + "loss": 0.1717, + "num_input_tokens_seen": 17845280, + "step": 30760 + }, + { + "epoch": 4.582216264521895, + "grad_norm": 29.935789108276367, + "learning_rate": 4.750404186402639e-05, + "loss": 0.4506, + "num_input_tokens_seen": 17848032, + "step": 30765 + }, + { + "epoch": 4.582960977062854, + "grad_norm": 31.950979232788086, + "learning_rate": 4.7502626369186784e-05, + "loss": 0.3504, + "num_input_tokens_seen": 17850784, + "step": 30770 + }, + { + "epoch": 4.583705689603813, + "grad_norm": 22.687429428100586, + "learning_rate": 4.750121049418817e-05, + "loss": 0.3745, + "num_input_tokens_seen": 17853760, + "step": 30775 + }, + { + "epoch": 4.584450402144772, + "grad_norm": 16.350866317749023, + "learning_rate": 4.749979423905449e-05, + "loss": 0.3373, + "num_input_tokens_seen": 17856640, + "step": 30780 + }, + { + "epoch": 4.585195114685732, + "grad_norm": 1.883855938911438, + "learning_rate": 4.749837760380965e-05, + "loss": 0.2048, + "num_input_tokens_seen": 17859520, + "step": 30785 + }, + { + "epoch": 4.5859398272266905, + "grad_norm": 22.46205711364746, + "learning_rate": 4.749696058847758e-05, + "loss": 0.396, + "num_input_tokens_seen": 17862400, + "step": 30790 + }, + { + "epoch": 4.586684539767649, + "grad_norm": 11.40376091003418, + "learning_rate": 4.749554319308223e-05, + "loss": 0.4011, + "num_input_tokens_seen": 17865152, + "step": 30795 + }, + { + "epoch": 4.587429252308609, + "grad_norm": 15.072359085083008, + "learning_rate": 4.7494125417647536e-05, + "loss": 0.5864, + "num_input_tokens_seen": 17868160, + "step": 30800 + }, + { + "epoch": 4.5881739648495685, + "grad_norm": 5.830355167388916, + "learning_rate": 4.749270726219746e-05, + "loss": 0.3808, + "num_input_tokens_seen": 17871232, + "step": 30805 + }, + { + "epoch": 4.588918677390527, + "grad_norm": 1.6525256633758545, + "learning_rate": 4.7491288726755954e-05, + "loss": 0.1453, + "num_input_tokens_seen": 17874208, + "step": 30810 + }, + { + "epoch": 4.589663389931486, + "grad_norm": 59.0389404296875, + "learning_rate": 4.7489869811346984e-05, + "loss": 0.1943, + "num_input_tokens_seen": 17877216, + "step": 30815 + }, + { + "epoch": 4.590408102472446, + "grad_norm": 16.336294174194336, + "learning_rate": 4.748845051599452e-05, + "loss": 0.1923, + "num_input_tokens_seen": 17880320, + "step": 30820 + }, + { + "epoch": 4.591152815013404, + "grad_norm": 1.464958667755127, + "learning_rate": 4.748703084072255e-05, + "loss": 0.2114, + "num_input_tokens_seen": 17883264, + "step": 30825 + }, + { + "epoch": 4.591897527554364, + "grad_norm": 12.12487506866455, + "learning_rate": 4.748561078555504e-05, + "loss": 0.4339, + "num_input_tokens_seen": 17886496, + "step": 30830 + }, + { + "epoch": 4.592642240095323, + "grad_norm": 9.91814136505127, + "learning_rate": 4.748419035051599e-05, + "loss": 0.298, + "num_input_tokens_seen": 17889344, + "step": 30835 + }, + { + "epoch": 4.5933869526362825, + "grad_norm": 24.587310791015625, + "learning_rate": 4.748276953562939e-05, + "loss": 0.254, + "num_input_tokens_seen": 17892352, + "step": 30840 + }, + { + "epoch": 4.594131665177241, + "grad_norm": 29.015201568603516, + "learning_rate": 4.7481348340919255e-05, + "loss": 0.6607, + "num_input_tokens_seen": 17895424, + "step": 30845 + }, + { + "epoch": 4.594876377718201, + "grad_norm": 18.33296012878418, + "learning_rate": 4.747992676640959e-05, + "loss": 0.4534, + "num_input_tokens_seen": 17897984, + "step": 30850 + }, + { + "epoch": 4.59562109025916, + "grad_norm": 17.3752498626709, + "learning_rate": 4.7478504812124416e-05, + "loss": 0.373, + "num_input_tokens_seen": 17900960, + "step": 30855 + }, + { + "epoch": 4.596365802800119, + "grad_norm": 23.008838653564453, + "learning_rate": 4.7477082478087734e-05, + "loss": 0.5031, + "num_input_tokens_seen": 17903776, + "step": 30860 + }, + { + "epoch": 4.597110515341078, + "grad_norm": 25.049922943115234, + "learning_rate": 4.74756597643236e-05, + "loss": 0.2566, + "num_input_tokens_seen": 17906368, + "step": 30865 + }, + { + "epoch": 4.597855227882038, + "grad_norm": 29.972593307495117, + "learning_rate": 4.747423667085603e-05, + "loss": 0.3922, + "num_input_tokens_seen": 17909504, + "step": 30870 + }, + { + "epoch": 4.5985999404229965, + "grad_norm": 23.1469783782959, + "learning_rate": 4.7472813197709084e-05, + "loss": 0.3463, + "num_input_tokens_seen": 17912736, + "step": 30875 + }, + { + "epoch": 4.599344652963956, + "grad_norm": 13.095322608947754, + "learning_rate": 4.747138934490679e-05, + "loss": 0.3674, + "num_input_tokens_seen": 17915552, + "step": 30880 + }, + { + "epoch": 4.600089365504915, + "grad_norm": 1.9725117683410645, + "learning_rate": 4.746996511247321e-05, + "loss": 0.1045, + "num_input_tokens_seen": 17918336, + "step": 30885 + }, + { + "epoch": 4.6008340780458745, + "grad_norm": 0.5019196271896362, + "learning_rate": 4.746854050043241e-05, + "loss": 0.3703, + "num_input_tokens_seen": 17921312, + "step": 30890 + }, + { + "epoch": 4.601578790586833, + "grad_norm": 12.310063362121582, + "learning_rate": 4.7467115508808456e-05, + "loss": 0.2406, + "num_input_tokens_seen": 17924160, + "step": 30895 + }, + { + "epoch": 4.602323503127793, + "grad_norm": 25.750635147094727, + "learning_rate": 4.746569013762543e-05, + "loss": 0.514, + "num_input_tokens_seen": 17927008, + "step": 30900 + }, + { + "epoch": 4.603068215668752, + "grad_norm": 28.681316375732422, + "learning_rate": 4.7464264386907385e-05, + "loss": 0.3436, + "num_input_tokens_seen": 17929824, + "step": 30905 + }, + { + "epoch": 4.603812928209711, + "grad_norm": 17.64364242553711, + "learning_rate": 4.746283825667843e-05, + "loss": 0.4345, + "num_input_tokens_seen": 17932704, + "step": 30910 + }, + { + "epoch": 4.60455764075067, + "grad_norm": 1.7865724563598633, + "learning_rate": 4.746141174696266e-05, + "loss": 0.3583, + "num_input_tokens_seen": 17935552, + "step": 30915 + }, + { + "epoch": 4.60530235329163, + "grad_norm": 43.700252532958984, + "learning_rate": 4.745998485778416e-05, + "loss": 0.2553, + "num_input_tokens_seen": 17938720, + "step": 30920 + }, + { + "epoch": 4.6060470658325885, + "grad_norm": 6.9691386222839355, + "learning_rate": 4.7458557589167044e-05, + "loss": 0.4044, + "num_input_tokens_seen": 17941408, + "step": 30925 + }, + { + "epoch": 4.606791778373548, + "grad_norm": 22.70697784423828, + "learning_rate": 4.7457129941135424e-05, + "loss": 0.4746, + "num_input_tokens_seen": 17944256, + "step": 30930 + }, + { + "epoch": 4.607536490914507, + "grad_norm": 1.386268138885498, + "learning_rate": 4.7455701913713424e-05, + "loss": 0.5502, + "num_input_tokens_seen": 17946880, + "step": 30935 + }, + { + "epoch": 4.6082812034554665, + "grad_norm": 9.957139015197754, + "learning_rate": 4.745427350692515e-05, + "loss": 0.3137, + "num_input_tokens_seen": 17949920, + "step": 30940 + }, + { + "epoch": 4.609025915996425, + "grad_norm": 14.430673599243164, + "learning_rate": 4.7452844720794756e-05, + "loss": 0.4205, + "num_input_tokens_seen": 17952832, + "step": 30945 + }, + { + "epoch": 4.609770628537385, + "grad_norm": 12.166468620300293, + "learning_rate": 4.745141555534637e-05, + "loss": 0.222, + "num_input_tokens_seen": 17956288, + "step": 30950 + }, + { + "epoch": 4.610515341078344, + "grad_norm": 7.28957986831665, + "learning_rate": 4.744998601060414e-05, + "loss": 0.1727, + "num_input_tokens_seen": 17959040, + "step": 30955 + }, + { + "epoch": 4.6112600536193025, + "grad_norm": 20.669401168823242, + "learning_rate": 4.74485560865922e-05, + "loss": 0.5144, + "num_input_tokens_seen": 17961984, + "step": 30960 + }, + { + "epoch": 4.612004766160262, + "grad_norm": 8.74471664428711, + "learning_rate": 4.744712578333473e-05, + "loss": 0.4165, + "num_input_tokens_seen": 17964704, + "step": 30965 + }, + { + "epoch": 4.612749478701222, + "grad_norm": 2.0281879901885986, + "learning_rate": 4.744569510085589e-05, + "loss": 0.0884, + "num_input_tokens_seen": 17967296, + "step": 30970 + }, + { + "epoch": 4.6134941912421805, + "grad_norm": 8.964041709899902, + "learning_rate": 4.7444264039179845e-05, + "loss": 0.3383, + "num_input_tokens_seen": 17970272, + "step": 30975 + }, + { + "epoch": 4.614238903783139, + "grad_norm": 14.796939849853516, + "learning_rate": 4.744283259833076e-05, + "loss": 0.3474, + "num_input_tokens_seen": 17973184, + "step": 30980 + }, + { + "epoch": 4.614983616324099, + "grad_norm": 27.195144653320312, + "learning_rate": 4.744140077833283e-05, + "loss": 0.5185, + "num_input_tokens_seen": 17975936, + "step": 30985 + }, + { + "epoch": 4.615728328865059, + "grad_norm": 9.697405815124512, + "learning_rate": 4.743996857921024e-05, + "loss": 0.2715, + "num_input_tokens_seen": 17978720, + "step": 30990 + }, + { + "epoch": 4.616473041406017, + "grad_norm": 20.902719497680664, + "learning_rate": 4.7438536000987195e-05, + "loss": 0.3109, + "num_input_tokens_seen": 17981536, + "step": 30995 + }, + { + "epoch": 4.617217753946976, + "grad_norm": 10.251232147216797, + "learning_rate": 4.743710304368788e-05, + "loss": 0.3307, + "num_input_tokens_seen": 17984320, + "step": 31000 + }, + { + "epoch": 4.617962466487936, + "grad_norm": 28.663347244262695, + "learning_rate": 4.743566970733652e-05, + "loss": 0.3642, + "num_input_tokens_seen": 17987200, + "step": 31005 + }, + { + "epoch": 4.6187071790288945, + "grad_norm": 0.12083500623703003, + "learning_rate": 4.7434235991957326e-05, + "loss": 0.0895, + "num_input_tokens_seen": 17990304, + "step": 31010 + }, + { + "epoch": 4.619451891569854, + "grad_norm": 0.14829935133457184, + "learning_rate": 4.743280189757451e-05, + "loss": 0.2008, + "num_input_tokens_seen": 17993088, + "step": 31015 + }, + { + "epoch": 4.620196604110813, + "grad_norm": 11.35517692565918, + "learning_rate": 4.7431367424212305e-05, + "loss": 0.1318, + "num_input_tokens_seen": 17995648, + "step": 31020 + }, + { + "epoch": 4.6209413166517725, + "grad_norm": 10.429142951965332, + "learning_rate": 4.7429932571894954e-05, + "loss": 0.6066, + "num_input_tokens_seen": 17998560, + "step": 31025 + }, + { + "epoch": 4.621686029192731, + "grad_norm": 16.5269718170166, + "learning_rate": 4.742849734064668e-05, + "loss": 0.2798, + "num_input_tokens_seen": 18001504, + "step": 31030 + }, + { + "epoch": 4.622430741733691, + "grad_norm": 45.957157135009766, + "learning_rate": 4.742706173049174e-05, + "loss": 0.4302, + "num_input_tokens_seen": 18004608, + "step": 31035 + }, + { + "epoch": 4.62317545427465, + "grad_norm": 8.19621467590332, + "learning_rate": 4.7425625741454394e-05, + "loss": 0.178, + "num_input_tokens_seen": 18007584, + "step": 31040 + }, + { + "epoch": 4.623920166815609, + "grad_norm": 2.0091092586517334, + "learning_rate": 4.7424189373558886e-05, + "loss": 0.226, + "num_input_tokens_seen": 18010496, + "step": 31045 + }, + { + "epoch": 4.624664879356568, + "grad_norm": 14.290946960449219, + "learning_rate": 4.742275262682949e-05, + "loss": 0.3486, + "num_input_tokens_seen": 18013216, + "step": 31050 + }, + { + "epoch": 4.625409591897528, + "grad_norm": 29.987417221069336, + "learning_rate": 4.7421315501290484e-05, + "loss": 0.6971, + "num_input_tokens_seen": 18016000, + "step": 31055 + }, + { + "epoch": 4.6261543044384865, + "grad_norm": 22.757173538208008, + "learning_rate": 4.7419877996966134e-05, + "loss": 0.4518, + "num_input_tokens_seen": 18019040, + "step": 31060 + }, + { + "epoch": 4.626899016979446, + "grad_norm": 14.087249755859375, + "learning_rate": 4.7418440113880736e-05, + "loss": 0.3781, + "num_input_tokens_seen": 18022240, + "step": 31065 + }, + { + "epoch": 4.627643729520405, + "grad_norm": 3.6218202114105225, + "learning_rate": 4.7417001852058576e-05, + "loss": 0.1788, + "num_input_tokens_seen": 18025312, + "step": 31070 + }, + { + "epoch": 4.628388442061365, + "grad_norm": 77.30500793457031, + "learning_rate": 4.741556321152395e-05, + "loss": 0.4507, + "num_input_tokens_seen": 18028096, + "step": 31075 + }, + { + "epoch": 4.629133154602323, + "grad_norm": 13.372518539428711, + "learning_rate": 4.741412419230117e-05, + "loss": 0.3, + "num_input_tokens_seen": 18031040, + "step": 31080 + }, + { + "epoch": 4.629877867143283, + "grad_norm": 9.313050270080566, + "learning_rate": 4.741268479441453e-05, + "loss": 0.4807, + "num_input_tokens_seen": 18034304, + "step": 31085 + }, + { + "epoch": 4.630622579684242, + "grad_norm": 4.198699951171875, + "learning_rate": 4.7411245017888374e-05, + "loss": 0.3559, + "num_input_tokens_seen": 18036960, + "step": 31090 + }, + { + "epoch": 4.631367292225201, + "grad_norm": 18.996089935302734, + "learning_rate": 4.7409804862747007e-05, + "loss": 0.2429, + "num_input_tokens_seen": 18039808, + "step": 31095 + }, + { + "epoch": 4.63211200476616, + "grad_norm": 3.7202703952789307, + "learning_rate": 4.740836432901476e-05, + "loss": 0.217, + "num_input_tokens_seen": 18042208, + "step": 31100 + }, + { + "epoch": 4.63285671730712, + "grad_norm": 2.4771785736083984, + "learning_rate": 4.7406923416715976e-05, + "loss": 0.1237, + "num_input_tokens_seen": 18045024, + "step": 31105 + }, + { + "epoch": 4.6336014298480785, + "grad_norm": 16.27128791809082, + "learning_rate": 4.7405482125875e-05, + "loss": 0.3793, + "num_input_tokens_seen": 18047936, + "step": 31110 + }, + { + "epoch": 4.634346142389038, + "grad_norm": 1.7475471496582031, + "learning_rate": 4.7404040456516164e-05, + "loss": 0.2863, + "num_input_tokens_seen": 18050688, + "step": 31115 + }, + { + "epoch": 4.635090854929997, + "grad_norm": 13.3019437789917, + "learning_rate": 4.7402598408663846e-05, + "loss": 0.3038, + "num_input_tokens_seen": 18054144, + "step": 31120 + }, + { + "epoch": 4.635835567470957, + "grad_norm": 24.930484771728516, + "learning_rate": 4.740115598234239e-05, + "loss": 0.2271, + "num_input_tokens_seen": 18056928, + "step": 31125 + }, + { + "epoch": 4.636580280011915, + "grad_norm": 1.5911997556686401, + "learning_rate": 4.739971317757617e-05, + "loss": 0.1332, + "num_input_tokens_seen": 18059520, + "step": 31130 + }, + { + "epoch": 4.637324992552875, + "grad_norm": 60.11217498779297, + "learning_rate": 4.7398269994389567e-05, + "loss": 0.5173, + "num_input_tokens_seen": 18062400, + "step": 31135 + }, + { + "epoch": 4.638069705093834, + "grad_norm": 0.26977095007896423, + "learning_rate": 4.739682643280695e-05, + "loss": 0.1335, + "num_input_tokens_seen": 18065120, + "step": 31140 + }, + { + "epoch": 4.6388144176347925, + "grad_norm": 6.817995071411133, + "learning_rate": 4.739538249285272e-05, + "loss": 0.1984, + "num_input_tokens_seen": 18068000, + "step": 31145 + }, + { + "epoch": 4.639559130175752, + "grad_norm": 1.2674566507339478, + "learning_rate": 4.739393817455127e-05, + "loss": 0.1643, + "num_input_tokens_seen": 18070720, + "step": 31150 + }, + { + "epoch": 4.640303842716712, + "grad_norm": 23.767620086669922, + "learning_rate": 4.739249347792698e-05, + "loss": 0.551, + "num_input_tokens_seen": 18073600, + "step": 31155 + }, + { + "epoch": 4.641048555257671, + "grad_norm": 11.235041618347168, + "learning_rate": 4.739104840300428e-05, + "loss": 0.7373, + "num_input_tokens_seen": 18076736, + "step": 31160 + }, + { + "epoch": 4.641793267798629, + "grad_norm": 17.363107681274414, + "learning_rate": 4.738960294980757e-05, + "loss": 0.2209, + "num_input_tokens_seen": 18079840, + "step": 31165 + }, + { + "epoch": 4.642537980339589, + "grad_norm": 17.759864807128906, + "learning_rate": 4.738815711836128e-05, + "loss": 0.2316, + "num_input_tokens_seen": 18082592, + "step": 31170 + }, + { + "epoch": 4.643282692880548, + "grad_norm": 12.004858016967773, + "learning_rate": 4.738671090868982e-05, + "loss": 0.3911, + "num_input_tokens_seen": 18085664, + "step": 31175 + }, + { + "epoch": 4.644027405421507, + "grad_norm": 28.36096954345703, + "learning_rate": 4.738526432081765e-05, + "loss": 0.2669, + "num_input_tokens_seen": 18088768, + "step": 31180 + }, + { + "epoch": 4.644772117962466, + "grad_norm": 4.427820205688477, + "learning_rate": 4.738381735476917e-05, + "loss": 0.5128, + "num_input_tokens_seen": 18091456, + "step": 31185 + }, + { + "epoch": 4.645516830503426, + "grad_norm": 29.316200256347656, + "learning_rate": 4.738237001056886e-05, + "loss": 0.4305, + "num_input_tokens_seen": 18094176, + "step": 31190 + }, + { + "epoch": 4.6462615430443845, + "grad_norm": 8.021954536437988, + "learning_rate": 4.738092228824115e-05, + "loss": 0.2969, + "num_input_tokens_seen": 18097120, + "step": 31195 + }, + { + "epoch": 4.647006255585344, + "grad_norm": 14.739161491394043, + "learning_rate": 4.7379474187810506e-05, + "loss": 0.237, + "num_input_tokens_seen": 18099872, + "step": 31200 + }, + { + "epoch": 4.647750968126303, + "grad_norm": 17.527944564819336, + "learning_rate": 4.7378025709301386e-05, + "loss": 0.3192, + "num_input_tokens_seen": 18102944, + "step": 31205 + }, + { + "epoch": 4.648495680667263, + "grad_norm": 3.0096468925476074, + "learning_rate": 4.7376576852738274e-05, + "loss": 0.1864, + "num_input_tokens_seen": 18105792, + "step": 31210 + }, + { + "epoch": 4.649240393208221, + "grad_norm": 26.27851104736328, + "learning_rate": 4.7375127618145645e-05, + "loss": 0.3633, + "num_input_tokens_seen": 18108640, + "step": 31215 + }, + { + "epoch": 4.649985105749181, + "grad_norm": 6.4240851402282715, + "learning_rate": 4.737367800554796e-05, + "loss": 0.2866, + "num_input_tokens_seen": 18111712, + "step": 31220 + }, + { + "epoch": 4.65072981829014, + "grad_norm": 49.930389404296875, + "learning_rate": 4.737222801496973e-05, + "loss": 0.3195, + "num_input_tokens_seen": 18114656, + "step": 31225 + }, + { + "epoch": 4.651474530831099, + "grad_norm": 7.077437400817871, + "learning_rate": 4.737077764643545e-05, + "loss": 0.2317, + "num_input_tokens_seen": 18117728, + "step": 31230 + }, + { + "epoch": 4.652219243372058, + "grad_norm": 26.562822341918945, + "learning_rate": 4.736932689996962e-05, + "loss": 0.347, + "num_input_tokens_seen": 18120416, + "step": 31235 + }, + { + "epoch": 4.652963955913018, + "grad_norm": 0.9382070899009705, + "learning_rate": 4.7367875775596746e-05, + "loss": 0.2938, + "num_input_tokens_seen": 18123200, + "step": 31240 + }, + { + "epoch": 4.653708668453977, + "grad_norm": 0.3840640187263489, + "learning_rate": 4.7366424273341334e-05, + "loss": 0.0408, + "num_input_tokens_seen": 18126048, + "step": 31245 + }, + { + "epoch": 4.654453380994936, + "grad_norm": 15.775094985961914, + "learning_rate": 4.7364972393227925e-05, + "loss": 0.3799, + "num_input_tokens_seen": 18129024, + "step": 31250 + }, + { + "epoch": 4.655198093535895, + "grad_norm": 3.178703784942627, + "learning_rate": 4.736352013528104e-05, + "loss": 0.3419, + "num_input_tokens_seen": 18132544, + "step": 31255 + }, + { + "epoch": 4.655942806076855, + "grad_norm": 0.33728766441345215, + "learning_rate": 4.736206749952521e-05, + "loss": 0.2555, + "num_input_tokens_seen": 18135392, + "step": 31260 + }, + { + "epoch": 4.656687518617813, + "grad_norm": 11.787589073181152, + "learning_rate": 4.736061448598498e-05, + "loss": 0.3248, + "num_input_tokens_seen": 18138272, + "step": 31265 + }, + { + "epoch": 4.657432231158773, + "grad_norm": 32.566707611083984, + "learning_rate": 4.7359161094684886e-05, + "loss": 0.3827, + "num_input_tokens_seen": 18141376, + "step": 31270 + }, + { + "epoch": 4.658176943699732, + "grad_norm": 0.9076245427131653, + "learning_rate": 4.735770732564949e-05, + "loss": 0.3162, + "num_input_tokens_seen": 18144640, + "step": 31275 + }, + { + "epoch": 4.658921656240691, + "grad_norm": 21.165239334106445, + "learning_rate": 4.735625317890336e-05, + "loss": 0.1407, + "num_input_tokens_seen": 18147872, + "step": 31280 + }, + { + "epoch": 4.65966636878165, + "grad_norm": 0.18506741523742676, + "learning_rate": 4.735479865447105e-05, + "loss": 0.1391, + "num_input_tokens_seen": 18150656, + "step": 31285 + }, + { + "epoch": 4.66041108132261, + "grad_norm": 11.469250679016113, + "learning_rate": 4.735334375237714e-05, + "loss": 0.2773, + "num_input_tokens_seen": 18153568, + "step": 31290 + }, + { + "epoch": 4.661155793863569, + "grad_norm": 5.913527011871338, + "learning_rate": 4.7351888472646204e-05, + "loss": 0.2136, + "num_input_tokens_seen": 18156384, + "step": 31295 + }, + { + "epoch": 4.661900506404528, + "grad_norm": 11.889450073242188, + "learning_rate": 4.735043281530283e-05, + "loss": 0.2642, + "num_input_tokens_seen": 18159264, + "step": 31300 + }, + { + "epoch": 4.662645218945487, + "grad_norm": 63.08407974243164, + "learning_rate": 4.7348976780371615e-05, + "loss": 0.149, + "num_input_tokens_seen": 18162272, + "step": 31305 + }, + { + "epoch": 4.663389931486446, + "grad_norm": 44.085121154785156, + "learning_rate": 4.734752036787714e-05, + "loss": 0.3857, + "num_input_tokens_seen": 18165088, + "step": 31310 + }, + { + "epoch": 4.664134644027405, + "grad_norm": 37.80672073364258, + "learning_rate": 4.734606357784403e-05, + "loss": 0.6304, + "num_input_tokens_seen": 18167808, + "step": 31315 + }, + { + "epoch": 4.664879356568365, + "grad_norm": 73.55496978759766, + "learning_rate": 4.734460641029689e-05, + "loss": 0.8609, + "num_input_tokens_seen": 18170912, + "step": 31320 + }, + { + "epoch": 4.665624069109324, + "grad_norm": 30.404542922973633, + "learning_rate": 4.7343148865260326e-05, + "loss": 0.6589, + "num_input_tokens_seen": 18173728, + "step": 31325 + }, + { + "epoch": 4.666368781650283, + "grad_norm": 23.27995491027832, + "learning_rate": 4.7341690942758974e-05, + "loss": 0.2167, + "num_input_tokens_seen": 18176576, + "step": 31330 + }, + { + "epoch": 4.667113494191242, + "grad_norm": 33.885047912597656, + "learning_rate": 4.734023264281746e-05, + "loss": 0.1656, + "num_input_tokens_seen": 18179520, + "step": 31335 + }, + { + "epoch": 4.667858206732202, + "grad_norm": 29.44375991821289, + "learning_rate": 4.7338773965460426e-05, + "loss": 0.7498, + "num_input_tokens_seen": 18182496, + "step": 31340 + }, + { + "epoch": 4.668602919273161, + "grad_norm": 5.32576322555542, + "learning_rate": 4.733731491071251e-05, + "loss": 0.2161, + "num_input_tokens_seen": 18185344, + "step": 31345 + }, + { + "epoch": 4.669347631814119, + "grad_norm": 1.9394701719284058, + "learning_rate": 4.7335855478598354e-05, + "loss": 0.2709, + "num_input_tokens_seen": 18188288, + "step": 31350 + }, + { + "epoch": 4.670092344355079, + "grad_norm": 3.7617316246032715, + "learning_rate": 4.7334395669142616e-05, + "loss": 0.1232, + "num_input_tokens_seen": 18191456, + "step": 31355 + }, + { + "epoch": 4.670837056896038, + "grad_norm": 3.46130633354187, + "learning_rate": 4.733293548236998e-05, + "loss": 0.3371, + "num_input_tokens_seen": 18194400, + "step": 31360 + }, + { + "epoch": 4.671581769436997, + "grad_norm": 1.681675672531128, + "learning_rate": 4.7331474918305086e-05, + "loss": 0.3811, + "num_input_tokens_seen": 18197440, + "step": 31365 + }, + { + "epoch": 4.672326481977956, + "grad_norm": 11.939253807067871, + "learning_rate": 4.733001397697262e-05, + "loss": 0.5988, + "num_input_tokens_seen": 18200128, + "step": 31370 + }, + { + "epoch": 4.673071194518916, + "grad_norm": 15.855934143066406, + "learning_rate": 4.732855265839726e-05, + "loss": 0.3532, + "num_input_tokens_seen": 18203040, + "step": 31375 + }, + { + "epoch": 4.673815907059875, + "grad_norm": 8.416633605957031, + "learning_rate": 4.7327090962603704e-05, + "loss": 0.5075, + "num_input_tokens_seen": 18205856, + "step": 31380 + }, + { + "epoch": 4.674560619600834, + "grad_norm": 12.221652030944824, + "learning_rate": 4.7325628889616644e-05, + "loss": 0.2371, + "num_input_tokens_seen": 18208928, + "step": 31385 + }, + { + "epoch": 4.675305332141793, + "grad_norm": 16.83504295349121, + "learning_rate": 4.732416643946076e-05, + "loss": 0.2466, + "num_input_tokens_seen": 18211968, + "step": 31390 + }, + { + "epoch": 4.676050044682753, + "grad_norm": 3.4388513565063477, + "learning_rate": 4.732270361216078e-05, + "loss": 0.336, + "num_input_tokens_seen": 18214880, + "step": 31395 + }, + { + "epoch": 4.676794757223711, + "grad_norm": 20.802949905395508, + "learning_rate": 4.732124040774142e-05, + "loss": 0.5006, + "num_input_tokens_seen": 18217952, + "step": 31400 + }, + { + "epoch": 4.677539469764671, + "grad_norm": 6.857173442840576, + "learning_rate": 4.731977682622737e-05, + "loss": 0.4744, + "num_input_tokens_seen": 18220832, + "step": 31405 + }, + { + "epoch": 4.67828418230563, + "grad_norm": 14.927507400512695, + "learning_rate": 4.731831286764339e-05, + "loss": 0.4123, + "num_input_tokens_seen": 18223808, + "step": 31410 + }, + { + "epoch": 4.6790288948465895, + "grad_norm": 12.015049934387207, + "learning_rate": 4.731684853201419e-05, + "loss": 0.5498, + "num_input_tokens_seen": 18226720, + "step": 31415 + }, + { + "epoch": 4.679773607387548, + "grad_norm": 0.3369440734386444, + "learning_rate": 4.7315383819364526e-05, + "loss": 0.1951, + "num_input_tokens_seen": 18229536, + "step": 31420 + }, + { + "epoch": 4.680518319928508, + "grad_norm": 14.209575653076172, + "learning_rate": 4.731391872971912e-05, + "loss": 0.2664, + "num_input_tokens_seen": 18232672, + "step": 31425 + }, + { + "epoch": 4.681263032469467, + "grad_norm": 10.82887077331543, + "learning_rate": 4.731245326310274e-05, + "loss": 0.4369, + "num_input_tokens_seen": 18235616, + "step": 31430 + }, + { + "epoch": 4.682007745010426, + "grad_norm": 100.54358673095703, + "learning_rate": 4.731098741954014e-05, + "loss": 0.1539, + "num_input_tokens_seen": 18238656, + "step": 31435 + }, + { + "epoch": 4.682752457551385, + "grad_norm": 39.32373046875, + "learning_rate": 4.730952119905609e-05, + "loss": 0.3466, + "num_input_tokens_seen": 18241472, + "step": 31440 + }, + { + "epoch": 4.683497170092345, + "grad_norm": 0.9297760725021362, + "learning_rate": 4.730805460167534e-05, + "loss": 0.4119, + "num_input_tokens_seen": 18244128, + "step": 31445 + }, + { + "epoch": 4.684241882633303, + "grad_norm": 15.023449897766113, + "learning_rate": 4.730658762742269e-05, + "loss": 0.6483, + "num_input_tokens_seen": 18246752, + "step": 31450 + }, + { + "epoch": 4.684986595174263, + "grad_norm": 40.6873664855957, + "learning_rate": 4.730512027632292e-05, + "loss": 0.424, + "num_input_tokens_seen": 18249440, + "step": 31455 + }, + { + "epoch": 4.685731307715222, + "grad_norm": 7.409173011779785, + "learning_rate": 4.7303652548400803e-05, + "loss": 0.5564, + "num_input_tokens_seen": 18252256, + "step": 31460 + }, + { + "epoch": 4.6864760202561815, + "grad_norm": 5.971478462219238, + "learning_rate": 4.730218444368114e-05, + "loss": 0.1079, + "num_input_tokens_seen": 18255104, + "step": 31465 + }, + { + "epoch": 4.68722073279714, + "grad_norm": 4.218419075012207, + "learning_rate": 4.7300715962188744e-05, + "loss": 0.2221, + "num_input_tokens_seen": 18257984, + "step": 31470 + }, + { + "epoch": 4.687965445338099, + "grad_norm": 11.292527198791504, + "learning_rate": 4.7299247103948417e-05, + "loss": 0.1912, + "num_input_tokens_seen": 18260992, + "step": 31475 + }, + { + "epoch": 4.688710157879059, + "grad_norm": 75.43193054199219, + "learning_rate": 4.729777786898498e-05, + "loss": 0.6426, + "num_input_tokens_seen": 18263968, + "step": 31480 + }, + { + "epoch": 4.689454870420018, + "grad_norm": 9.96634578704834, + "learning_rate": 4.729630825732324e-05, + "loss": 0.2382, + "num_input_tokens_seen": 18266976, + "step": 31485 + }, + { + "epoch": 4.690199582960977, + "grad_norm": 1.2438225746154785, + "learning_rate": 4.729483826898804e-05, + "loss": 0.2422, + "num_input_tokens_seen": 18270144, + "step": 31490 + }, + { + "epoch": 4.690944295501936, + "grad_norm": 12.613005638122559, + "learning_rate": 4.729336790400421e-05, + "loss": 0.4407, + "num_input_tokens_seen": 18273344, + "step": 31495 + }, + { + "epoch": 4.6916890080428955, + "grad_norm": 18.482593536376953, + "learning_rate": 4.729189716239657e-05, + "loss": 0.3786, + "num_input_tokens_seen": 18276288, + "step": 31500 + }, + { + "epoch": 4.692433720583855, + "grad_norm": 28.900779724121094, + "learning_rate": 4.7290426044189995e-05, + "loss": 0.3166, + "num_input_tokens_seen": 18279136, + "step": 31505 + }, + { + "epoch": 4.693178433124814, + "grad_norm": 0.673191487789154, + "learning_rate": 4.728895454940933e-05, + "loss": 0.0906, + "num_input_tokens_seen": 18282016, + "step": 31510 + }, + { + "epoch": 4.693923145665773, + "grad_norm": 3.741312026977539, + "learning_rate": 4.728748267807942e-05, + "loss": 0.4232, + "num_input_tokens_seen": 18284768, + "step": 31515 + }, + { + "epoch": 4.694667858206732, + "grad_norm": 18.890108108520508, + "learning_rate": 4.728601043022515e-05, + "loss": 0.6957, + "num_input_tokens_seen": 18287680, + "step": 31520 + }, + { + "epoch": 4.695412570747691, + "grad_norm": 5.549259662628174, + "learning_rate": 4.728453780587139e-05, + "loss": 0.2001, + "num_input_tokens_seen": 18290624, + "step": 31525 + }, + { + "epoch": 4.696157283288651, + "grad_norm": 8.512628555297852, + "learning_rate": 4.7283064805043e-05, + "loss": 0.2954, + "num_input_tokens_seen": 18293408, + "step": 31530 + }, + { + "epoch": 4.696901995829609, + "grad_norm": 10.530117988586426, + "learning_rate": 4.7281591427764886e-05, + "loss": 0.3679, + "num_input_tokens_seen": 18296416, + "step": 31535 + }, + { + "epoch": 4.697646708370569, + "grad_norm": 28.901447296142578, + "learning_rate": 4.728011767406193e-05, + "loss": 0.2348, + "num_input_tokens_seen": 18299200, + "step": 31540 + }, + { + "epoch": 4.698391420911528, + "grad_norm": 15.537580490112305, + "learning_rate": 4.7278643543959025e-05, + "loss": 0.3401, + "num_input_tokens_seen": 18302080, + "step": 31545 + }, + { + "epoch": 4.6991361334524875, + "grad_norm": 17.852571487426758, + "learning_rate": 4.727716903748108e-05, + "loss": 0.2501, + "num_input_tokens_seen": 18305088, + "step": 31550 + }, + { + "epoch": 4.699880845993446, + "grad_norm": 10.209112167358398, + "learning_rate": 4.727569415465302e-05, + "loss": 0.1668, + "num_input_tokens_seen": 18308160, + "step": 31555 + }, + { + "epoch": 4.700625558534406, + "grad_norm": 15.099605560302734, + "learning_rate": 4.727421889549973e-05, + "loss": 0.3868, + "num_input_tokens_seen": 18310880, + "step": 31560 + }, + { + "epoch": 4.701370271075365, + "grad_norm": 46.98691177368164, + "learning_rate": 4.727274326004616e-05, + "loss": 0.2695, + "num_input_tokens_seen": 18313536, + "step": 31565 + }, + { + "epoch": 4.702114983616324, + "grad_norm": 30.584609985351562, + "learning_rate": 4.727126724831723e-05, + "loss": 0.4512, + "num_input_tokens_seen": 18316320, + "step": 31570 + }, + { + "epoch": 4.702859696157283, + "grad_norm": 0.7740965485572815, + "learning_rate": 4.726979086033787e-05, + "loss": 0.3343, + "num_input_tokens_seen": 18319392, + "step": 31575 + }, + { + "epoch": 4.703604408698243, + "grad_norm": 18.894792556762695, + "learning_rate": 4.726831409613303e-05, + "loss": 0.2625, + "num_input_tokens_seen": 18322432, + "step": 31580 + }, + { + "epoch": 4.7043491212392015, + "grad_norm": 15.31454086303711, + "learning_rate": 4.7266836955727655e-05, + "loss": 0.4991, + "num_input_tokens_seen": 18325344, + "step": 31585 + }, + { + "epoch": 4.705093833780161, + "grad_norm": 5.101253032684326, + "learning_rate": 4.72653594391467e-05, + "loss": 0.2505, + "num_input_tokens_seen": 18328192, + "step": 31590 + }, + { + "epoch": 4.70583854632112, + "grad_norm": 4.566707611083984, + "learning_rate": 4.7263881546415135e-05, + "loss": 0.3801, + "num_input_tokens_seen": 18330976, + "step": 31595 + }, + { + "epoch": 4.7065832588620795, + "grad_norm": 6.056155204772949, + "learning_rate": 4.726240327755791e-05, + "loss": 0.4139, + "num_input_tokens_seen": 18333760, + "step": 31600 + }, + { + "epoch": 4.707327971403038, + "grad_norm": 2.2238707542419434, + "learning_rate": 4.726092463260001e-05, + "loss": 0.3321, + "num_input_tokens_seen": 18336672, + "step": 31605 + }, + { + "epoch": 4.708072683943998, + "grad_norm": 22.49359703063965, + "learning_rate": 4.7259445611566414e-05, + "loss": 0.2495, + "num_input_tokens_seen": 18339648, + "step": 31610 + }, + { + "epoch": 4.708817396484957, + "grad_norm": 12.761722564697266, + "learning_rate": 4.7257966214482106e-05, + "loss": 0.2174, + "num_input_tokens_seen": 18342496, + "step": 31615 + }, + { + "epoch": 4.709562109025916, + "grad_norm": 8.412092208862305, + "learning_rate": 4.725648644137208e-05, + "loss": 0.279, + "num_input_tokens_seen": 18345536, + "step": 31620 + }, + { + "epoch": 4.710306821566875, + "grad_norm": 5.600513458251953, + "learning_rate": 4.725500629226134e-05, + "loss": 0.249, + "num_input_tokens_seen": 18348352, + "step": 31625 + }, + { + "epoch": 4.711051534107835, + "grad_norm": 7.597520351409912, + "learning_rate": 4.725352576717489e-05, + "loss": 0.6619, + "num_input_tokens_seen": 18351328, + "step": 31630 + }, + { + "epoch": 4.7117962466487935, + "grad_norm": 4.201629161834717, + "learning_rate": 4.7252044866137736e-05, + "loss": 0.2248, + "num_input_tokens_seen": 18354112, + "step": 31635 + }, + { + "epoch": 4.712540959189753, + "grad_norm": 12.90861988067627, + "learning_rate": 4.72505635891749e-05, + "loss": 0.4077, + "num_input_tokens_seen": 18356864, + "step": 31640 + }, + { + "epoch": 4.713285671730712, + "grad_norm": 18.474395751953125, + "learning_rate": 4.7249081936311415e-05, + "loss": 0.4341, + "num_input_tokens_seen": 18359808, + "step": 31645 + }, + { + "epoch": 4.7140303842716715, + "grad_norm": 23.7405948638916, + "learning_rate": 4.7247599907572285e-05, + "loss": 0.2805, + "num_input_tokens_seen": 18362720, + "step": 31650 + }, + { + "epoch": 4.71477509681263, + "grad_norm": 0.9590154886245728, + "learning_rate": 4.724611750298258e-05, + "loss": 0.2264, + "num_input_tokens_seen": 18365888, + "step": 31655 + }, + { + "epoch": 4.715519809353589, + "grad_norm": 24.05951499938965, + "learning_rate": 4.724463472256733e-05, + "loss": 0.4106, + "num_input_tokens_seen": 18368512, + "step": 31660 + }, + { + "epoch": 4.716264521894549, + "grad_norm": 20.412073135375977, + "learning_rate": 4.724315156635157e-05, + "loss": 0.4689, + "num_input_tokens_seen": 18371392, + "step": 31665 + }, + { + "epoch": 4.717009234435508, + "grad_norm": 8.094541549682617, + "learning_rate": 4.7241668034360384e-05, + "loss": 0.3249, + "num_input_tokens_seen": 18374112, + "step": 31670 + }, + { + "epoch": 4.717753946976467, + "grad_norm": 22.973068237304688, + "learning_rate": 4.724018412661883e-05, + "loss": 0.648, + "num_input_tokens_seen": 18376864, + "step": 31675 + }, + { + "epoch": 4.718498659517426, + "grad_norm": 1.9710818529129028, + "learning_rate": 4.7238699843151954e-05, + "loss": 0.1073, + "num_input_tokens_seen": 18379808, + "step": 31680 + }, + { + "epoch": 4.7192433720583855, + "grad_norm": 10.868524551391602, + "learning_rate": 4.723721518398485e-05, + "loss": 0.2154, + "num_input_tokens_seen": 18382560, + "step": 31685 + }, + { + "epoch": 4.719988084599344, + "grad_norm": 2.6878464221954346, + "learning_rate": 4.72357301491426e-05, + "loss": 0.1267, + "num_input_tokens_seen": 18385696, + "step": 31690 + }, + { + "epoch": 4.720732797140304, + "grad_norm": 10.061336517333984, + "learning_rate": 4.723424473865029e-05, + "loss": 0.3116, + "num_input_tokens_seen": 18388896, + "step": 31695 + }, + { + "epoch": 4.721477509681263, + "grad_norm": 15.196205139160156, + "learning_rate": 4.7232758952533006e-05, + "loss": 0.1766, + "num_input_tokens_seen": 18391712, + "step": 31700 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 17.863006591796875, + "learning_rate": 4.723127279081586e-05, + "loss": 0.1011, + "num_input_tokens_seen": 18394304, + "step": 31705 + }, + { + "epoch": 4.722966934763181, + "grad_norm": 13.087638854980469, + "learning_rate": 4.7229786253523956e-05, + "loss": 0.2451, + "num_input_tokens_seen": 18397280, + "step": 31710 + }, + { + "epoch": 4.723711647304141, + "grad_norm": 8.958114624023438, + "learning_rate": 4.7228299340682405e-05, + "loss": 0.2956, + "num_input_tokens_seen": 18399968, + "step": 31715 + }, + { + "epoch": 4.7244563598450995, + "grad_norm": 2.355125665664673, + "learning_rate": 4.7226812052316325e-05, + "loss": 0.1412, + "num_input_tokens_seen": 18402592, + "step": 31720 + }, + { + "epoch": 4.725201072386059, + "grad_norm": 16.32207489013672, + "learning_rate": 4.722532438845085e-05, + "loss": 0.4406, + "num_input_tokens_seen": 18405120, + "step": 31725 + }, + { + "epoch": 4.725945784927018, + "grad_norm": 11.728898048400879, + "learning_rate": 4.7223836349111106e-05, + "loss": 0.299, + "num_input_tokens_seen": 18408096, + "step": 31730 + }, + { + "epoch": 4.7266904974679775, + "grad_norm": 7.994048595428467, + "learning_rate": 4.722234793432224e-05, + "loss": 0.3822, + "num_input_tokens_seen": 18410880, + "step": 31735 + }, + { + "epoch": 4.727435210008936, + "grad_norm": 26.35112953186035, + "learning_rate": 4.722085914410938e-05, + "loss": 0.3807, + "num_input_tokens_seen": 18413600, + "step": 31740 + }, + { + "epoch": 4.728179922549896, + "grad_norm": 22.660844802856445, + "learning_rate": 4.7219369978497705e-05, + "loss": 0.5632, + "num_input_tokens_seen": 18416512, + "step": 31745 + }, + { + "epoch": 4.728924635090855, + "grad_norm": 18.34157371520996, + "learning_rate": 4.7217880437512344e-05, + "loss": 0.3129, + "num_input_tokens_seen": 18419424, + "step": 31750 + }, + { + "epoch": 4.729669347631814, + "grad_norm": 8.530956268310547, + "learning_rate": 4.7216390521178475e-05, + "loss": 0.7861, + "num_input_tokens_seen": 18422208, + "step": 31755 + }, + { + "epoch": 4.730414060172773, + "grad_norm": 18.081844329833984, + "learning_rate": 4.721490022952128e-05, + "loss": 0.5134, + "num_input_tokens_seen": 18424960, + "step": 31760 + }, + { + "epoch": 4.731158772713733, + "grad_norm": 9.343294143676758, + "learning_rate": 4.721340956256591e-05, + "loss": 0.4259, + "num_input_tokens_seen": 18427744, + "step": 31765 + }, + { + "epoch": 4.7319034852546915, + "grad_norm": 1.1110460758209229, + "learning_rate": 4.721191852033757e-05, + "loss": 0.382, + "num_input_tokens_seen": 18430912, + "step": 31770 + }, + { + "epoch": 4.732648197795651, + "grad_norm": 0.10234710574150085, + "learning_rate": 4.7210427102861437e-05, + "loss": 0.4309, + "num_input_tokens_seen": 18434016, + "step": 31775 + }, + { + "epoch": 4.73339291033661, + "grad_norm": 14.242196083068848, + "learning_rate": 4.720893531016271e-05, + "loss": 0.6103, + "num_input_tokens_seen": 18436896, + "step": 31780 + }, + { + "epoch": 4.73413762287757, + "grad_norm": 21.147775650024414, + "learning_rate": 4.72074431422666e-05, + "loss": 0.3813, + "num_input_tokens_seen": 18439808, + "step": 31785 + }, + { + "epoch": 4.734882335418528, + "grad_norm": 10.764679908752441, + "learning_rate": 4.72059505991983e-05, + "loss": 0.23, + "num_input_tokens_seen": 18443104, + "step": 31790 + }, + { + "epoch": 4.735627047959488, + "grad_norm": 17.0093936920166, + "learning_rate": 4.720445768098304e-05, + "loss": 0.5367, + "num_input_tokens_seen": 18446208, + "step": 31795 + }, + { + "epoch": 4.736371760500447, + "grad_norm": 14.089417457580566, + "learning_rate": 4.720296438764604e-05, + "loss": 0.2716, + "num_input_tokens_seen": 18448896, + "step": 31800 + }, + { + "epoch": 4.737116473041406, + "grad_norm": 13.009571075439453, + "learning_rate": 4.7201470719212514e-05, + "loss": 0.4712, + "num_input_tokens_seen": 18451616, + "step": 31805 + }, + { + "epoch": 4.737861185582365, + "grad_norm": 1.5881847143173218, + "learning_rate": 4.7199976675707716e-05, + "loss": 0.1654, + "num_input_tokens_seen": 18454464, + "step": 31810 + }, + { + "epoch": 4.738605898123325, + "grad_norm": 2.2984633445739746, + "learning_rate": 4.719848225715686e-05, + "loss": 0.2966, + "num_input_tokens_seen": 18457376, + "step": 31815 + }, + { + "epoch": 4.7393506106642835, + "grad_norm": 6.40339994430542, + "learning_rate": 4.719698746358522e-05, + "loss": 0.0981, + "num_input_tokens_seen": 18460288, + "step": 31820 + }, + { + "epoch": 4.740095323205242, + "grad_norm": 14.387887954711914, + "learning_rate": 4.719549229501803e-05, + "loss": 0.401, + "num_input_tokens_seen": 18463040, + "step": 31825 + }, + { + "epoch": 4.740840035746202, + "grad_norm": 1.9930036067962646, + "learning_rate": 4.7193996751480555e-05, + "loss": 0.0556, + "num_input_tokens_seen": 18466112, + "step": 31830 + }, + { + "epoch": 4.741584748287162, + "grad_norm": 33.69049835205078, + "learning_rate": 4.7192500832998065e-05, + "loss": 0.257, + "num_input_tokens_seen": 18468992, + "step": 31835 + }, + { + "epoch": 4.74232946082812, + "grad_norm": 5.949276924133301, + "learning_rate": 4.719100453959583e-05, + "loss": 0.1137, + "num_input_tokens_seen": 18471712, + "step": 31840 + }, + { + "epoch": 4.743074173369079, + "grad_norm": 28.634361267089844, + "learning_rate": 4.718950787129912e-05, + "loss": 0.4103, + "num_input_tokens_seen": 18474368, + "step": 31845 + }, + { + "epoch": 4.743818885910039, + "grad_norm": 20.35158920288086, + "learning_rate": 4.7188010828133233e-05, + "loss": 0.6696, + "num_input_tokens_seen": 18477280, + "step": 31850 + }, + { + "epoch": 4.744563598450998, + "grad_norm": 9.527785301208496, + "learning_rate": 4.7186513410123455e-05, + "loss": 0.2466, + "num_input_tokens_seen": 18480256, + "step": 31855 + }, + { + "epoch": 4.745308310991957, + "grad_norm": 14.568937301635742, + "learning_rate": 4.718501561729508e-05, + "loss": 0.4474, + "num_input_tokens_seen": 18483104, + "step": 31860 + }, + { + "epoch": 4.746053023532916, + "grad_norm": 9.420632362365723, + "learning_rate": 4.718351744967342e-05, + "loss": 0.2116, + "num_input_tokens_seen": 18485952, + "step": 31865 + }, + { + "epoch": 4.746797736073876, + "grad_norm": 36.41156768798828, + "learning_rate": 4.7182018907283776e-05, + "loss": 0.349, + "num_input_tokens_seen": 18488928, + "step": 31870 + }, + { + "epoch": 4.747542448614834, + "grad_norm": 0.038358379155397415, + "learning_rate": 4.718051999015146e-05, + "loss": 0.1285, + "num_input_tokens_seen": 18491840, + "step": 31875 + }, + { + "epoch": 4.748287161155794, + "grad_norm": 6.911402702331543, + "learning_rate": 4.7179020698301814e-05, + "loss": 0.3118, + "num_input_tokens_seen": 18494752, + "step": 31880 + }, + { + "epoch": 4.749031873696753, + "grad_norm": 37.15654754638672, + "learning_rate": 4.717752103176014e-05, + "loss": 0.3259, + "num_input_tokens_seen": 18497888, + "step": 31885 + }, + { + "epoch": 4.749776586237712, + "grad_norm": 0.3241793215274811, + "learning_rate": 4.71760209905518e-05, + "loss": 0.437, + "num_input_tokens_seen": 18500576, + "step": 31890 + }, + { + "epoch": 4.750521298778671, + "grad_norm": 0.11790996044874191, + "learning_rate": 4.717452057470212e-05, + "loss": 0.6786, + "num_input_tokens_seen": 18503136, + "step": 31895 + }, + { + "epoch": 4.751266011319631, + "grad_norm": 13.814261436462402, + "learning_rate": 4.7173019784236455e-05, + "loss": 0.3712, + "num_input_tokens_seen": 18505920, + "step": 31900 + }, + { + "epoch": 4.7520107238605895, + "grad_norm": 14.832709312438965, + "learning_rate": 4.717151861918015e-05, + "loss": 0.266, + "num_input_tokens_seen": 18509024, + "step": 31905 + }, + { + "epoch": 4.752755436401549, + "grad_norm": 5.035778999328613, + "learning_rate": 4.717001707955858e-05, + "loss": 0.1737, + "num_input_tokens_seen": 18511968, + "step": 31910 + }, + { + "epoch": 4.753500148942508, + "grad_norm": 29.364830017089844, + "learning_rate": 4.71685151653971e-05, + "loss": 0.2658, + "num_input_tokens_seen": 18514848, + "step": 31915 + }, + { + "epoch": 4.754244861483468, + "grad_norm": 26.050384521484375, + "learning_rate": 4.716701287672109e-05, + "loss": 0.5093, + "num_input_tokens_seen": 18517728, + "step": 31920 + }, + { + "epoch": 4.754989574024426, + "grad_norm": 28.945642471313477, + "learning_rate": 4.716551021355593e-05, + "loss": 0.5504, + "num_input_tokens_seen": 18520480, + "step": 31925 + }, + { + "epoch": 4.755734286565386, + "grad_norm": 16.111509323120117, + "learning_rate": 4.716400717592699e-05, + "loss": 0.3636, + "num_input_tokens_seen": 18523040, + "step": 31930 + }, + { + "epoch": 4.756478999106345, + "grad_norm": 29.25795555114746, + "learning_rate": 4.716250376385969e-05, + "loss": 0.4684, + "num_input_tokens_seen": 18525792, + "step": 31935 + }, + { + "epoch": 4.757223711647304, + "grad_norm": 40.236907958984375, + "learning_rate": 4.716099997737941e-05, + "loss": 0.4108, + "num_input_tokens_seen": 18528608, + "step": 31940 + }, + { + "epoch": 4.757968424188263, + "grad_norm": 5.102741241455078, + "learning_rate": 4.7159495816511546e-05, + "loss": 0.1779, + "num_input_tokens_seen": 18531296, + "step": 31945 + }, + { + "epoch": 4.758713136729223, + "grad_norm": 19.18422508239746, + "learning_rate": 4.7157991281281536e-05, + "loss": 0.3772, + "num_input_tokens_seen": 18534304, + "step": 31950 + }, + { + "epoch": 4.759457849270182, + "grad_norm": 9.653355598449707, + "learning_rate": 4.715648637171478e-05, + "loss": 0.5416, + "num_input_tokens_seen": 18537280, + "step": 31955 + }, + { + "epoch": 4.760202561811141, + "grad_norm": 5.864343166351318, + "learning_rate": 4.71549810878367e-05, + "loss": 0.3436, + "num_input_tokens_seen": 18540544, + "step": 31960 + }, + { + "epoch": 4.7609472743521, + "grad_norm": 5.009960174560547, + "learning_rate": 4.7153475429672736e-05, + "loss": 0.1377, + "num_input_tokens_seen": 18543552, + "step": 31965 + }, + { + "epoch": 4.76169198689306, + "grad_norm": 6.728619575500488, + "learning_rate": 4.715196939724832e-05, + "loss": 0.1536, + "num_input_tokens_seen": 18546528, + "step": 31970 + }, + { + "epoch": 4.762436699434018, + "grad_norm": 13.0603666305542, + "learning_rate": 4.71504629905889e-05, + "loss": 0.3748, + "num_input_tokens_seen": 18549728, + "step": 31975 + }, + { + "epoch": 4.763181411974978, + "grad_norm": 0.21667730808258057, + "learning_rate": 4.714895620971992e-05, + "loss": 0.4336, + "num_input_tokens_seen": 18552768, + "step": 31980 + }, + { + "epoch": 4.763926124515937, + "grad_norm": 1.9332222938537598, + "learning_rate": 4.714744905466683e-05, + "loss": 0.3209, + "num_input_tokens_seen": 18555552, + "step": 31985 + }, + { + "epoch": 4.764670837056896, + "grad_norm": 9.059866905212402, + "learning_rate": 4.71459415254551e-05, + "loss": 0.1676, + "num_input_tokens_seen": 18558240, + "step": 31990 + }, + { + "epoch": 4.765415549597855, + "grad_norm": 12.041807174682617, + "learning_rate": 4.71444336221102e-05, + "loss": 0.3931, + "num_input_tokens_seen": 18561184, + "step": 31995 + }, + { + "epoch": 4.766160262138815, + "grad_norm": 6.7559614181518555, + "learning_rate": 4.71429253446576e-05, + "loss": 0.1367, + "num_input_tokens_seen": 18564192, + "step": 32000 + }, + { + "epoch": 4.766904974679774, + "grad_norm": 32.1585693359375, + "learning_rate": 4.714141669312278e-05, + "loss": 0.2801, + "num_input_tokens_seen": 18567040, + "step": 32005 + }, + { + "epoch": 4.767649687220732, + "grad_norm": 2.5018835067749023, + "learning_rate": 4.713990766753123e-05, + "loss": 0.3186, + "num_input_tokens_seen": 18570048, + "step": 32010 + }, + { + "epoch": 4.768394399761692, + "grad_norm": 10.944294929504395, + "learning_rate": 4.7138398267908434e-05, + "loss": 0.4007, + "num_input_tokens_seen": 18573312, + "step": 32015 + }, + { + "epoch": 4.769139112302652, + "grad_norm": 8.615724563598633, + "learning_rate": 4.713688849427991e-05, + "loss": 0.2207, + "num_input_tokens_seen": 18576096, + "step": 32020 + }, + { + "epoch": 4.76988382484361, + "grad_norm": 9.073534965515137, + "learning_rate": 4.713537834667115e-05, + "loss": 0.286, + "num_input_tokens_seen": 18579264, + "step": 32025 + }, + { + "epoch": 4.770628537384569, + "grad_norm": 17.395076751708984, + "learning_rate": 4.713386782510766e-05, + "loss": 0.1862, + "num_input_tokens_seen": 18581888, + "step": 32030 + }, + { + "epoch": 4.771373249925529, + "grad_norm": 3.327153205871582, + "learning_rate": 4.713235692961498e-05, + "loss": 0.4494, + "num_input_tokens_seen": 18584928, + "step": 32035 + }, + { + "epoch": 4.772117962466488, + "grad_norm": 5.902517318725586, + "learning_rate": 4.713084566021863e-05, + "loss": 0.1996, + "num_input_tokens_seen": 18587584, + "step": 32040 + }, + { + "epoch": 4.772862675007447, + "grad_norm": 6.156131744384766, + "learning_rate": 4.7129334016944124e-05, + "loss": 0.5174, + "num_input_tokens_seen": 18590272, + "step": 32045 + }, + { + "epoch": 4.773607387548406, + "grad_norm": 10.13264274597168, + "learning_rate": 4.7127821999817014e-05, + "loss": 0.253, + "num_input_tokens_seen": 18593376, + "step": 32050 + }, + { + "epoch": 4.774352100089366, + "grad_norm": 0.9690029621124268, + "learning_rate": 4.712630960886284e-05, + "loss": 0.2015, + "num_input_tokens_seen": 18596224, + "step": 32055 + }, + { + "epoch": 4.775096812630324, + "grad_norm": 8.685775756835938, + "learning_rate": 4.7124796844107155e-05, + "loss": 0.7711, + "num_input_tokens_seen": 18599104, + "step": 32060 + }, + { + "epoch": 4.775841525171284, + "grad_norm": 8.336730003356934, + "learning_rate": 4.7123283705575514e-05, + "loss": 0.2529, + "num_input_tokens_seen": 18601920, + "step": 32065 + }, + { + "epoch": 4.776586237712243, + "grad_norm": 3.1490988731384277, + "learning_rate": 4.712177019329348e-05, + "loss": 0.314, + "num_input_tokens_seen": 18605088, + "step": 32070 + }, + { + "epoch": 4.777330950253202, + "grad_norm": 0.7908482551574707, + "learning_rate": 4.712025630728662e-05, + "loss": 0.4535, + "num_input_tokens_seen": 18607712, + "step": 32075 + }, + { + "epoch": 4.778075662794161, + "grad_norm": 54.029296875, + "learning_rate": 4.711874204758051e-05, + "loss": 0.8207, + "num_input_tokens_seen": 18610688, + "step": 32080 + }, + { + "epoch": 4.778820375335121, + "grad_norm": 0.4090234637260437, + "learning_rate": 4.7117227414200735e-05, + "loss": 0.2219, + "num_input_tokens_seen": 18613824, + "step": 32085 + }, + { + "epoch": 4.77956508787608, + "grad_norm": 27.742557525634766, + "learning_rate": 4.711571240717289e-05, + "loss": 0.3856, + "num_input_tokens_seen": 18616704, + "step": 32090 + }, + { + "epoch": 4.780309800417039, + "grad_norm": 8.847054481506348, + "learning_rate": 4.7114197026522555e-05, + "loss": 0.2874, + "num_input_tokens_seen": 18619648, + "step": 32095 + }, + { + "epoch": 4.781054512957998, + "grad_norm": 2.1349897384643555, + "learning_rate": 4.711268127227534e-05, + "loss": 0.2571, + "num_input_tokens_seen": 18622656, + "step": 32100 + }, + { + "epoch": 4.781799225498958, + "grad_norm": 38.237693786621094, + "learning_rate": 4.711116514445685e-05, + "loss": 0.499, + "num_input_tokens_seen": 18625536, + "step": 32105 + }, + { + "epoch": 4.782543938039916, + "grad_norm": 9.475374221801758, + "learning_rate": 4.710964864309269e-05, + "loss": 0.3757, + "num_input_tokens_seen": 18628480, + "step": 32110 + }, + { + "epoch": 4.783288650580876, + "grad_norm": 20.262622833251953, + "learning_rate": 4.710813176820848e-05, + "loss": 0.4792, + "num_input_tokens_seen": 18631488, + "step": 32115 + }, + { + "epoch": 4.784033363121835, + "grad_norm": 11.152937889099121, + "learning_rate": 4.710661451982987e-05, + "loss": 0.222, + "num_input_tokens_seen": 18634720, + "step": 32120 + }, + { + "epoch": 4.7847780756627944, + "grad_norm": 28.86896324157715, + "learning_rate": 4.7105096897982473e-05, + "loss": 0.3681, + "num_input_tokens_seen": 18637664, + "step": 32125 + }, + { + "epoch": 4.785522788203753, + "grad_norm": 26.461151123046875, + "learning_rate": 4.710357890269193e-05, + "loss": 0.2013, + "num_input_tokens_seen": 18640704, + "step": 32130 + }, + { + "epoch": 4.786267500744713, + "grad_norm": 8.1235933303833, + "learning_rate": 4.710206053398388e-05, + "loss": 0.5056, + "num_input_tokens_seen": 18643264, + "step": 32135 + }, + { + "epoch": 4.787012213285672, + "grad_norm": 15.07988452911377, + "learning_rate": 4.710054179188399e-05, + "loss": 0.5496, + "num_input_tokens_seen": 18645952, + "step": 32140 + }, + { + "epoch": 4.787756925826631, + "grad_norm": 30.526336669921875, + "learning_rate": 4.7099022676417904e-05, + "loss": 0.3346, + "num_input_tokens_seen": 18649024, + "step": 32145 + }, + { + "epoch": 4.78850163836759, + "grad_norm": 8.215578079223633, + "learning_rate": 4.709750318761129e-05, + "loss": 0.2091, + "num_input_tokens_seen": 18652192, + "step": 32150 + }, + { + "epoch": 4.78924635090855, + "grad_norm": 52.11602020263672, + "learning_rate": 4.709598332548982e-05, + "loss": 0.294, + "num_input_tokens_seen": 18655008, + "step": 32155 + }, + { + "epoch": 4.789991063449508, + "grad_norm": 6.230865001678467, + "learning_rate": 4.709446309007917e-05, + "loss": 0.2978, + "num_input_tokens_seen": 18657728, + "step": 32160 + }, + { + "epoch": 4.790735775990468, + "grad_norm": 5.5247650146484375, + "learning_rate": 4.709294248140502e-05, + "loss": 0.3661, + "num_input_tokens_seen": 18660864, + "step": 32165 + }, + { + "epoch": 4.791480488531427, + "grad_norm": 18.171611785888672, + "learning_rate": 4.709142149949306e-05, + "loss": 0.4172, + "num_input_tokens_seen": 18663680, + "step": 32170 + }, + { + "epoch": 4.792225201072386, + "grad_norm": 17.358734130859375, + "learning_rate": 4.708990014436899e-05, + "loss": 0.5444, + "num_input_tokens_seen": 18666400, + "step": 32175 + }, + { + "epoch": 4.792969913613345, + "grad_norm": 19.51613998413086, + "learning_rate": 4.708837841605851e-05, + "loss": 0.5191, + "num_input_tokens_seen": 18668960, + "step": 32180 + }, + { + "epoch": 4.793714626154305, + "grad_norm": 16.57964515686035, + "learning_rate": 4.7086856314587316e-05, + "loss": 0.1357, + "num_input_tokens_seen": 18672064, + "step": 32185 + }, + { + "epoch": 4.794459338695264, + "grad_norm": 19.38800811767578, + "learning_rate": 4.708533383998114e-05, + "loss": 0.4412, + "num_input_tokens_seen": 18674816, + "step": 32190 + }, + { + "epoch": 4.795204051236222, + "grad_norm": 1.1618720293045044, + "learning_rate": 4.7083810992265696e-05, + "loss": 0.4077, + "num_input_tokens_seen": 18677632, + "step": 32195 + }, + { + "epoch": 4.795948763777182, + "grad_norm": 5.710244655609131, + "learning_rate": 4.7082287771466705e-05, + "loss": 0.2916, + "num_input_tokens_seen": 18680512, + "step": 32200 + }, + { + "epoch": 4.796693476318142, + "grad_norm": 12.146331787109375, + "learning_rate": 4.7080764177609914e-05, + "loss": 0.3314, + "num_input_tokens_seen": 18683328, + "step": 32205 + }, + { + "epoch": 4.7974381888591004, + "grad_norm": 0.03570341318845749, + "learning_rate": 4.7079240210721046e-05, + "loss": 0.2081, + "num_input_tokens_seen": 18686144, + "step": 32210 + }, + { + "epoch": 4.798182901400059, + "grad_norm": 8.206758499145508, + "learning_rate": 4.707771587082586e-05, + "loss": 0.2756, + "num_input_tokens_seen": 18688832, + "step": 32215 + }, + { + "epoch": 4.798927613941019, + "grad_norm": 30.366201400756836, + "learning_rate": 4.70761911579501e-05, + "loss": 0.4329, + "num_input_tokens_seen": 18691712, + "step": 32220 + }, + { + "epoch": 4.799672326481978, + "grad_norm": 1.0982304811477661, + "learning_rate": 4.707466607211953e-05, + "loss": 0.2568, + "num_input_tokens_seen": 18694912, + "step": 32225 + }, + { + "epoch": 4.800417039022937, + "grad_norm": 24.69671058654785, + "learning_rate": 4.707314061335991e-05, + "loss": 0.1432, + "num_input_tokens_seen": 18697952, + "step": 32230 + }, + { + "epoch": 4.801161751563896, + "grad_norm": 14.003777503967285, + "learning_rate": 4.707161478169702e-05, + "loss": 0.2037, + "num_input_tokens_seen": 18700768, + "step": 32235 + }, + { + "epoch": 4.801906464104856, + "grad_norm": 34.48062515258789, + "learning_rate": 4.7070088577156625e-05, + "loss": 0.3903, + "num_input_tokens_seen": 18703968, + "step": 32240 + }, + { + "epoch": 4.802651176645814, + "grad_norm": 13.66392993927002, + "learning_rate": 4.706856199976451e-05, + "loss": 0.3864, + "num_input_tokens_seen": 18706784, + "step": 32245 + }, + { + "epoch": 4.803395889186774, + "grad_norm": 34.15410232543945, + "learning_rate": 4.706703504954647e-05, + "loss": 0.1989, + "num_input_tokens_seen": 18709376, + "step": 32250 + }, + { + "epoch": 4.804140601727733, + "grad_norm": 11.598067283630371, + "learning_rate": 4.706550772652831e-05, + "loss": 0.1796, + "num_input_tokens_seen": 18712096, + "step": 32255 + }, + { + "epoch": 4.8048853142686925, + "grad_norm": 0.08490133285522461, + "learning_rate": 4.7063980030735824e-05, + "loss": 0.3288, + "num_input_tokens_seen": 18715008, + "step": 32260 + }, + { + "epoch": 4.805630026809651, + "grad_norm": 17.002994537353516, + "learning_rate": 4.7062451962194806e-05, + "loss": 0.2878, + "num_input_tokens_seen": 18717856, + "step": 32265 + }, + { + "epoch": 4.806374739350611, + "grad_norm": 2.5502490997314453, + "learning_rate": 4.70609235209311e-05, + "loss": 0.3148, + "num_input_tokens_seen": 18720512, + "step": 32270 + }, + { + "epoch": 4.80711945189157, + "grad_norm": 11.833813667297363, + "learning_rate": 4.705939470697051e-05, + "loss": 0.3873, + "num_input_tokens_seen": 18723264, + "step": 32275 + }, + { + "epoch": 4.807864164432529, + "grad_norm": 52.130306243896484, + "learning_rate": 4.7057865520338865e-05, + "loss": 0.5856, + "num_input_tokens_seen": 18726496, + "step": 32280 + }, + { + "epoch": 4.808608876973488, + "grad_norm": 4.827303409576416, + "learning_rate": 4.7056335961061994e-05, + "loss": 0.4002, + "num_input_tokens_seen": 18729280, + "step": 32285 + }, + { + "epoch": 4.809353589514448, + "grad_norm": 22.97165870666504, + "learning_rate": 4.705480602916575e-05, + "loss": 0.3556, + "num_input_tokens_seen": 18732160, + "step": 32290 + }, + { + "epoch": 4.8100983020554064, + "grad_norm": 1.2738033533096313, + "learning_rate": 4.705327572467597e-05, + "loss": 0.3073, + "num_input_tokens_seen": 18734944, + "step": 32295 + }, + { + "epoch": 4.810843014596366, + "grad_norm": 4.984851837158203, + "learning_rate": 4.705174504761851e-05, + "loss": 0.216, + "num_input_tokens_seen": 18737440, + "step": 32300 + }, + { + "epoch": 4.811587727137325, + "grad_norm": 17.189064025878906, + "learning_rate": 4.705021399801924e-05, + "loss": 0.3434, + "num_input_tokens_seen": 18740256, + "step": 32305 + }, + { + "epoch": 4.8123324396782845, + "grad_norm": 11.487872123718262, + "learning_rate": 4.704868257590401e-05, + "loss": 0.4465, + "num_input_tokens_seen": 18743104, + "step": 32310 + }, + { + "epoch": 4.813077152219243, + "grad_norm": 5.518132209777832, + "learning_rate": 4.7047150781298693e-05, + "loss": 0.0831, + "num_input_tokens_seen": 18745888, + "step": 32315 + }, + { + "epoch": 4.813821864760203, + "grad_norm": 35.751991271972656, + "learning_rate": 4.704561861422917e-05, + "loss": 0.933, + "num_input_tokens_seen": 18748960, + "step": 32320 + }, + { + "epoch": 4.814566577301162, + "grad_norm": 13.79642105102539, + "learning_rate": 4.704408607472134e-05, + "loss": 0.2773, + "num_input_tokens_seen": 18752000, + "step": 32325 + }, + { + "epoch": 4.815311289842121, + "grad_norm": 3.6856048107147217, + "learning_rate": 4.704255316280106e-05, + "loss": 0.1745, + "num_input_tokens_seen": 18754944, + "step": 32330 + }, + { + "epoch": 4.81605600238308, + "grad_norm": 5.948089599609375, + "learning_rate": 4.704101987849426e-05, + "loss": 0.3315, + "num_input_tokens_seen": 18757696, + "step": 32335 + }, + { + "epoch": 4.816800714924039, + "grad_norm": 6.29688024520874, + "learning_rate": 4.7039486221826834e-05, + "loss": 0.2594, + "num_input_tokens_seen": 18760576, + "step": 32340 + }, + { + "epoch": 4.8175454274649985, + "grad_norm": 7.515061378479004, + "learning_rate": 4.703795219282469e-05, + "loss": 0.3218, + "num_input_tokens_seen": 18763616, + "step": 32345 + }, + { + "epoch": 4.818290140005958, + "grad_norm": 17.82758331298828, + "learning_rate": 4.7036417791513735e-05, + "loss": 0.4501, + "num_input_tokens_seen": 18766688, + "step": 32350 + }, + { + "epoch": 4.819034852546917, + "grad_norm": 1.6802934408187866, + "learning_rate": 4.7034883017919896e-05, + "loss": 0.142, + "num_input_tokens_seen": 18769504, + "step": 32355 + }, + { + "epoch": 4.819779565087876, + "grad_norm": 9.617213249206543, + "learning_rate": 4.70333478720691e-05, + "loss": 0.3789, + "num_input_tokens_seen": 18772384, + "step": 32360 + }, + { + "epoch": 4.820524277628835, + "grad_norm": 19.709980010986328, + "learning_rate": 4.703181235398729e-05, + "loss": 0.2807, + "num_input_tokens_seen": 18775392, + "step": 32365 + }, + { + "epoch": 4.821268990169795, + "grad_norm": 9.187531471252441, + "learning_rate": 4.7030276463700405e-05, + "loss": 0.3845, + "num_input_tokens_seen": 18778304, + "step": 32370 + }, + { + "epoch": 4.822013702710754, + "grad_norm": 11.852832794189453, + "learning_rate": 4.702874020123439e-05, + "loss": 0.6425, + "num_input_tokens_seen": 18781376, + "step": 32375 + }, + { + "epoch": 4.8227584152517124, + "grad_norm": 5.46917724609375, + "learning_rate": 4.70272035666152e-05, + "loss": 0.462, + "num_input_tokens_seen": 18784288, + "step": 32380 + }, + { + "epoch": 4.823503127792672, + "grad_norm": 5.767661094665527, + "learning_rate": 4.702566655986879e-05, + "loss": 0.2449, + "num_input_tokens_seen": 18787488, + "step": 32385 + }, + { + "epoch": 4.824247840333631, + "grad_norm": 18.311227798461914, + "learning_rate": 4.702412918102113e-05, + "loss": 0.3873, + "num_input_tokens_seen": 18790528, + "step": 32390 + }, + { + "epoch": 4.8249925528745905, + "grad_norm": 27.07303810119629, + "learning_rate": 4.702259143009819e-05, + "loss": 0.2842, + "num_input_tokens_seen": 18793472, + "step": 32395 + }, + { + "epoch": 4.825737265415549, + "grad_norm": 28.115949630737305, + "learning_rate": 4.702105330712595e-05, + "loss": 0.3282, + "num_input_tokens_seen": 18796416, + "step": 32400 + }, + { + "epoch": 4.826481977956509, + "grad_norm": 13.382131576538086, + "learning_rate": 4.70195148121304e-05, + "loss": 0.3172, + "num_input_tokens_seen": 18799104, + "step": 32405 + }, + { + "epoch": 4.827226690497468, + "grad_norm": 0.4591090679168701, + "learning_rate": 4.7017975945137524e-05, + "loss": 0.367, + "num_input_tokens_seen": 18801984, + "step": 32410 + }, + { + "epoch": 4.827971403038427, + "grad_norm": 20.926342010498047, + "learning_rate": 4.701643670617333e-05, + "loss": 0.5001, + "num_input_tokens_seen": 18804736, + "step": 32415 + }, + { + "epoch": 4.828716115579386, + "grad_norm": 0.0809905156493187, + "learning_rate": 4.70148970952638e-05, + "loss": 0.2783, + "num_input_tokens_seen": 18807744, + "step": 32420 + }, + { + "epoch": 4.829460828120346, + "grad_norm": 5.261575698852539, + "learning_rate": 4.701335711243497e-05, + "loss": 0.1998, + "num_input_tokens_seen": 18810624, + "step": 32425 + }, + { + "epoch": 4.8302055406613045, + "grad_norm": 0.2985028624534607, + "learning_rate": 4.7011816757712835e-05, + "loss": 0.2088, + "num_input_tokens_seen": 18813664, + "step": 32430 + }, + { + "epoch": 4.830950253202264, + "grad_norm": 12.794614791870117, + "learning_rate": 4.701027603112343e-05, + "loss": 0.4505, + "num_input_tokens_seen": 18816672, + "step": 32435 + }, + { + "epoch": 4.831694965743223, + "grad_norm": 14.37582778930664, + "learning_rate": 4.7008734932692795e-05, + "loss": 0.2903, + "num_input_tokens_seen": 18819584, + "step": 32440 + }, + { + "epoch": 4.8324396782841825, + "grad_norm": 3.90639328956604, + "learning_rate": 4.700719346244694e-05, + "loss": 0.2233, + "num_input_tokens_seen": 18822272, + "step": 32445 + }, + { + "epoch": 4.833184390825141, + "grad_norm": 0.6275150179862976, + "learning_rate": 4.7005651620411914e-05, + "loss": 0.3536, + "num_input_tokens_seen": 18824928, + "step": 32450 + }, + { + "epoch": 4.833929103366101, + "grad_norm": 14.413907051086426, + "learning_rate": 4.7004109406613786e-05, + "loss": 0.3832, + "num_input_tokens_seen": 18827712, + "step": 32455 + }, + { + "epoch": 4.83467381590706, + "grad_norm": 39.465904235839844, + "learning_rate": 4.700256682107858e-05, + "loss": 0.4554, + "num_input_tokens_seen": 18830784, + "step": 32460 + }, + { + "epoch": 4.835418528448019, + "grad_norm": 17.721643447875977, + "learning_rate": 4.700102386383237e-05, + "loss": 0.2801, + "num_input_tokens_seen": 18833792, + "step": 32465 + }, + { + "epoch": 4.836163240988978, + "grad_norm": 1.9179009199142456, + "learning_rate": 4.699948053490123e-05, + "loss": 0.5148, + "num_input_tokens_seen": 18836928, + "step": 32470 + }, + { + "epoch": 4.836907953529938, + "grad_norm": 5.469531059265137, + "learning_rate": 4.699793683431122e-05, + "loss": 0.0171, + "num_input_tokens_seen": 18839840, + "step": 32475 + }, + { + "epoch": 4.8376526660708965, + "grad_norm": 13.126209259033203, + "learning_rate": 4.699639276208843e-05, + "loss": 0.2914, + "num_input_tokens_seen": 18842944, + "step": 32480 + }, + { + "epoch": 4.838397378611856, + "grad_norm": 6.620680332183838, + "learning_rate": 4.699484831825894e-05, + "loss": 0.1529, + "num_input_tokens_seen": 18845888, + "step": 32485 + }, + { + "epoch": 4.839142091152815, + "grad_norm": 15.562382698059082, + "learning_rate": 4.699330350284884e-05, + "loss": 0.3301, + "num_input_tokens_seen": 18848672, + "step": 32490 + }, + { + "epoch": 4.8398868036937746, + "grad_norm": 4.045253753662109, + "learning_rate": 4.6991758315884225e-05, + "loss": 0.42, + "num_input_tokens_seen": 18851712, + "step": 32495 + }, + { + "epoch": 4.840631516234733, + "grad_norm": 47.64567947387695, + "learning_rate": 4.699021275739121e-05, + "loss": 0.2151, + "num_input_tokens_seen": 18854720, + "step": 32500 + }, + { + "epoch": 4.841376228775693, + "grad_norm": 68.49346160888672, + "learning_rate": 4.69886668273959e-05, + "loss": 0.3644, + "num_input_tokens_seen": 18857408, + "step": 32505 + }, + { + "epoch": 4.842120941316652, + "grad_norm": 17.864913940429688, + "learning_rate": 4.698712052592441e-05, + "loss": 0.2009, + "num_input_tokens_seen": 18860224, + "step": 32510 + }, + { + "epoch": 4.842865653857611, + "grad_norm": 0.5808534622192383, + "learning_rate": 4.6985573853002875e-05, + "loss": 0.3186, + "num_input_tokens_seen": 18863104, + "step": 32515 + }, + { + "epoch": 4.84361036639857, + "grad_norm": 4.18108606338501, + "learning_rate": 4.698402680865741e-05, + "loss": 0.2478, + "num_input_tokens_seen": 18866176, + "step": 32520 + }, + { + "epoch": 4.844355078939529, + "grad_norm": 10.053901672363281, + "learning_rate": 4.6982479392914144e-05, + "loss": 0.1272, + "num_input_tokens_seen": 18869216, + "step": 32525 + }, + { + "epoch": 4.8450997914804885, + "grad_norm": 2.1060798168182373, + "learning_rate": 4.698093160579924e-05, + "loss": 0.1073, + "num_input_tokens_seen": 18872032, + "step": 32530 + }, + { + "epoch": 4.845844504021448, + "grad_norm": 0.32078054547309875, + "learning_rate": 4.697938344733884e-05, + "loss": 0.2566, + "num_input_tokens_seen": 18875008, + "step": 32535 + }, + { + "epoch": 4.846589216562407, + "grad_norm": 45.1161994934082, + "learning_rate": 4.6977834917559095e-05, + "loss": 0.3467, + "num_input_tokens_seen": 18877792, + "step": 32540 + }, + { + "epoch": 4.847333929103366, + "grad_norm": 17.47078514099121, + "learning_rate": 4.697628601648616e-05, + "loss": 0.2389, + "num_input_tokens_seen": 18880672, + "step": 32545 + }, + { + "epoch": 4.848078641644325, + "grad_norm": 15.214116096496582, + "learning_rate": 4.697473674414621e-05, + "loss": 0.471, + "num_input_tokens_seen": 18883328, + "step": 32550 + }, + { + "epoch": 4.848823354185284, + "grad_norm": 16.102924346923828, + "learning_rate": 4.697318710056542e-05, + "loss": 0.5103, + "num_input_tokens_seen": 18886144, + "step": 32555 + }, + { + "epoch": 4.849568066726244, + "grad_norm": 18.94547462463379, + "learning_rate": 4.697163708576997e-05, + "loss": 0.4234, + "num_input_tokens_seen": 18888928, + "step": 32560 + }, + { + "epoch": 4.8503127792672025, + "grad_norm": 3.205836534500122, + "learning_rate": 4.697008669978603e-05, + "loss": 0.4754, + "num_input_tokens_seen": 18891680, + "step": 32565 + }, + { + "epoch": 4.851057491808162, + "grad_norm": 0.043994415551424026, + "learning_rate": 4.696853594263981e-05, + "loss": 0.3112, + "num_input_tokens_seen": 18894528, + "step": 32570 + }, + { + "epoch": 4.851802204349121, + "grad_norm": 3.508476495742798, + "learning_rate": 4.6966984814357515e-05, + "loss": 0.1832, + "num_input_tokens_seen": 18897248, + "step": 32575 + }, + { + "epoch": 4.8525469168900806, + "grad_norm": 6.257088661193848, + "learning_rate": 4.6965433314965325e-05, + "loss": 0.2818, + "num_input_tokens_seen": 18900352, + "step": 32580 + }, + { + "epoch": 4.853291629431039, + "grad_norm": 10.432644844055176, + "learning_rate": 4.6963881444489464e-05, + "loss": 0.2012, + "num_input_tokens_seen": 18903456, + "step": 32585 + }, + { + "epoch": 4.854036341971999, + "grad_norm": 12.607451438903809, + "learning_rate": 4.696232920295616e-05, + "loss": 0.1461, + "num_input_tokens_seen": 18906336, + "step": 32590 + }, + { + "epoch": 4.854781054512958, + "grad_norm": 14.635880470275879, + "learning_rate": 4.696077659039161e-05, + "loss": 0.5052, + "num_input_tokens_seen": 18909536, + "step": 32595 + }, + { + "epoch": 4.855525767053917, + "grad_norm": 38.7400016784668, + "learning_rate": 4.6959223606822066e-05, + "loss": 0.2473, + "num_input_tokens_seen": 18912096, + "step": 32600 + }, + { + "epoch": 4.856270479594876, + "grad_norm": 24.147621154785156, + "learning_rate": 4.695767025227376e-05, + "loss": 0.2993, + "num_input_tokens_seen": 18915104, + "step": 32605 + }, + { + "epoch": 4.857015192135836, + "grad_norm": 10.088709831237793, + "learning_rate": 4.6956116526772934e-05, + "loss": 0.1625, + "num_input_tokens_seen": 18917920, + "step": 32610 + }, + { + "epoch": 4.8577599046767945, + "grad_norm": 5.45865535736084, + "learning_rate": 4.6954562430345825e-05, + "loss": 0.3931, + "num_input_tokens_seen": 18921120, + "step": 32615 + }, + { + "epoch": 4.858504617217754, + "grad_norm": 25.41695213317871, + "learning_rate": 4.695300796301871e-05, + "loss": 0.6473, + "num_input_tokens_seen": 18924256, + "step": 32620 + }, + { + "epoch": 4.859249329758713, + "grad_norm": 21.12734031677246, + "learning_rate": 4.695145312481783e-05, + "loss": 0.4102, + "num_input_tokens_seen": 18927168, + "step": 32625 + }, + { + "epoch": 4.859994042299673, + "grad_norm": 0.048753224313259125, + "learning_rate": 4.694989791576946e-05, + "loss": 0.3474, + "num_input_tokens_seen": 18930080, + "step": 32630 + }, + { + "epoch": 4.860738754840631, + "grad_norm": 15.844785690307617, + "learning_rate": 4.6948342335899874e-05, + "loss": 0.6431, + "num_input_tokens_seen": 18933120, + "step": 32635 + }, + { + "epoch": 4.861483467381591, + "grad_norm": 4.13828182220459, + "learning_rate": 4.694678638523535e-05, + "loss": 0.2483, + "num_input_tokens_seen": 18935904, + "step": 32640 + }, + { + "epoch": 4.86222817992255, + "grad_norm": 12.531612396240234, + "learning_rate": 4.694523006380218e-05, + "loss": 0.1894, + "num_input_tokens_seen": 18939104, + "step": 32645 + }, + { + "epoch": 4.862972892463509, + "grad_norm": 35.07682418823242, + "learning_rate": 4.694367337162665e-05, + "loss": 0.5022, + "num_input_tokens_seen": 18941696, + "step": 32650 + }, + { + "epoch": 4.863717605004468, + "grad_norm": 5.490697860717773, + "learning_rate": 4.694211630873506e-05, + "loss": 0.3444, + "num_input_tokens_seen": 18944800, + "step": 32655 + }, + { + "epoch": 4.864462317545428, + "grad_norm": 1.9260783195495605, + "learning_rate": 4.694055887515372e-05, + "loss": 0.6346, + "num_input_tokens_seen": 18947936, + "step": 32660 + }, + { + "epoch": 4.8652070300863866, + "grad_norm": 17.8286075592041, + "learning_rate": 4.6939001070908925e-05, + "loss": 0.2945, + "num_input_tokens_seen": 18950912, + "step": 32665 + }, + { + "epoch": 4.865951742627346, + "grad_norm": 0.10809989273548126, + "learning_rate": 4.693744289602702e-05, + "loss": 0.3883, + "num_input_tokens_seen": 18953760, + "step": 32670 + }, + { + "epoch": 4.866696455168305, + "grad_norm": 25.010787963867188, + "learning_rate": 4.693588435053431e-05, + "loss": 0.6619, + "num_input_tokens_seen": 18956768, + "step": 32675 + }, + { + "epoch": 4.867441167709265, + "grad_norm": 9.208076477050781, + "learning_rate": 4.693432543445712e-05, + "loss": 0.7588, + "num_input_tokens_seen": 18959584, + "step": 32680 + }, + { + "epoch": 4.868185880250223, + "grad_norm": 6.354979515075684, + "learning_rate": 4.6932766147821804e-05, + "loss": 0.3433, + "num_input_tokens_seen": 18962208, + "step": 32685 + }, + { + "epoch": 4.868930592791182, + "grad_norm": 14.853206634521484, + "learning_rate": 4.693120649065469e-05, + "loss": 0.3135, + "num_input_tokens_seen": 18965408, + "step": 32690 + }, + { + "epoch": 4.869675305332142, + "grad_norm": 48.87890625, + "learning_rate": 4.6929646462982135e-05, + "loss": 0.1627, + "num_input_tokens_seen": 18968160, + "step": 32695 + }, + { + "epoch": 4.870420017873101, + "grad_norm": 4.601415157318115, + "learning_rate": 4.692808606483049e-05, + "loss": 0.5337, + "num_input_tokens_seen": 18971552, + "step": 32700 + }, + { + "epoch": 4.87116473041406, + "grad_norm": 27.555604934692383, + "learning_rate": 4.692652529622612e-05, + "loss": 0.4218, + "num_input_tokens_seen": 18974688, + "step": 32705 + }, + { + "epoch": 4.871909442955019, + "grad_norm": 31.300580978393555, + "learning_rate": 4.692496415719539e-05, + "loss": 0.5321, + "num_input_tokens_seen": 18977504, + "step": 32710 + }, + { + "epoch": 4.872654155495979, + "grad_norm": 33.40978240966797, + "learning_rate": 4.692340264776467e-05, + "loss": 0.1623, + "num_input_tokens_seen": 18980288, + "step": 32715 + }, + { + "epoch": 4.873398868036938, + "grad_norm": 25.48238182067871, + "learning_rate": 4.6921840767960346e-05, + "loss": 0.5362, + "num_input_tokens_seen": 18982976, + "step": 32720 + }, + { + "epoch": 4.874143580577897, + "grad_norm": 12.847367286682129, + "learning_rate": 4.69202785178088e-05, + "loss": 0.4952, + "num_input_tokens_seen": 18985856, + "step": 32725 + }, + { + "epoch": 4.874888293118856, + "grad_norm": 0.09855975210666656, + "learning_rate": 4.6918715897336434e-05, + "loss": 0.2113, + "num_input_tokens_seen": 18988608, + "step": 32730 + }, + { + "epoch": 4.875633005659815, + "grad_norm": 15.372233390808105, + "learning_rate": 4.691715290656964e-05, + "loss": 0.4268, + "num_input_tokens_seen": 18991200, + "step": 32735 + }, + { + "epoch": 4.876377718200774, + "grad_norm": 6.477662563323975, + "learning_rate": 4.6915589545534814e-05, + "loss": 0.6186, + "num_input_tokens_seen": 18994272, + "step": 32740 + }, + { + "epoch": 4.877122430741734, + "grad_norm": 12.79367446899414, + "learning_rate": 4.691402581425839e-05, + "loss": 0.2485, + "num_input_tokens_seen": 18997152, + "step": 32745 + }, + { + "epoch": 4.8778671432826926, + "grad_norm": 13.753011703491211, + "learning_rate": 4.691246171276676e-05, + "loss": 0.3855, + "num_input_tokens_seen": 19000096, + "step": 32750 + }, + { + "epoch": 4.878611855823652, + "grad_norm": 75.33065795898438, + "learning_rate": 4.691089724108636e-05, + "loss": 0.2572, + "num_input_tokens_seen": 19003424, + "step": 32755 + }, + { + "epoch": 4.879356568364611, + "grad_norm": 36.221920013427734, + "learning_rate": 4.6909332399243636e-05, + "loss": 0.2687, + "num_input_tokens_seen": 19006304, + "step": 32760 + }, + { + "epoch": 4.880101280905571, + "grad_norm": 10.261853218078613, + "learning_rate": 4.690776718726499e-05, + "loss": 0.2839, + "num_input_tokens_seen": 19009248, + "step": 32765 + }, + { + "epoch": 4.880845993446529, + "grad_norm": 5.102478981018066, + "learning_rate": 4.690620160517689e-05, + "loss": 0.4333, + "num_input_tokens_seen": 19012000, + "step": 32770 + }, + { + "epoch": 4.881590705987489, + "grad_norm": 1.4976495504379272, + "learning_rate": 4.690463565300579e-05, + "loss": 0.2481, + "num_input_tokens_seen": 19014816, + "step": 32775 + }, + { + "epoch": 4.882335418528448, + "grad_norm": 13.530123710632324, + "learning_rate": 4.690306933077811e-05, + "loss": 0.1395, + "num_input_tokens_seen": 19017664, + "step": 32780 + }, + { + "epoch": 4.883080131069407, + "grad_norm": 10.324929237365723, + "learning_rate": 4.6901502638520355e-05, + "loss": 0.6201, + "num_input_tokens_seen": 19020256, + "step": 32785 + }, + { + "epoch": 4.883824843610366, + "grad_norm": 1.5509774684906006, + "learning_rate": 4.689993557625897e-05, + "loss": 0.3982, + "num_input_tokens_seen": 19023264, + "step": 32790 + }, + { + "epoch": 4.884569556151326, + "grad_norm": 29.243675231933594, + "learning_rate": 4.689836814402042e-05, + "loss": 0.277, + "num_input_tokens_seen": 19025984, + "step": 32795 + }, + { + "epoch": 4.885314268692285, + "grad_norm": 40.48643493652344, + "learning_rate": 4.689680034183121e-05, + "loss": 0.3673, + "num_input_tokens_seen": 19028672, + "step": 32800 + }, + { + "epoch": 4.886058981233244, + "grad_norm": 20.45671272277832, + "learning_rate": 4.689523216971781e-05, + "loss": 0.4492, + "num_input_tokens_seen": 19031456, + "step": 32805 + }, + { + "epoch": 4.886803693774203, + "grad_norm": 28.54245376586914, + "learning_rate": 4.689366362770671e-05, + "loss": 0.2973, + "num_input_tokens_seen": 19034240, + "step": 32810 + }, + { + "epoch": 4.887548406315163, + "grad_norm": 33.58824157714844, + "learning_rate": 4.689209471582442e-05, + "loss": 0.4885, + "num_input_tokens_seen": 19036928, + "step": 32815 + }, + { + "epoch": 4.888293118856121, + "grad_norm": 27.241987228393555, + "learning_rate": 4.689052543409743e-05, + "loss": 0.3628, + "num_input_tokens_seen": 19039584, + "step": 32820 + }, + { + "epoch": 4.889037831397081, + "grad_norm": 0.28417837619781494, + "learning_rate": 4.6888955782552274e-05, + "loss": 0.3917, + "num_input_tokens_seen": 19042560, + "step": 32825 + }, + { + "epoch": 4.88978254393804, + "grad_norm": 15.651469230651855, + "learning_rate": 4.688738576121545e-05, + "loss": 0.6539, + "num_input_tokens_seen": 19045408, + "step": 32830 + }, + { + "epoch": 4.890527256478999, + "grad_norm": 23.739675521850586, + "learning_rate": 4.68858153701135e-05, + "loss": 0.2894, + "num_input_tokens_seen": 19048224, + "step": 32835 + }, + { + "epoch": 4.891271969019958, + "grad_norm": 11.017251014709473, + "learning_rate": 4.688424460927293e-05, + "loss": 0.3803, + "num_input_tokens_seen": 19051168, + "step": 32840 + }, + { + "epoch": 4.892016681560918, + "grad_norm": 9.986224174499512, + "learning_rate": 4.688267347872029e-05, + "loss": 0.325, + "num_input_tokens_seen": 19054080, + "step": 32845 + }, + { + "epoch": 4.892761394101877, + "grad_norm": 1.1326850652694702, + "learning_rate": 4.6881101978482124e-05, + "loss": 0.2066, + "num_input_tokens_seen": 19056768, + "step": 32850 + }, + { + "epoch": 4.893506106642836, + "grad_norm": 15.070255279541016, + "learning_rate": 4.687953010858498e-05, + "loss": 0.3381, + "num_input_tokens_seen": 19059584, + "step": 32855 + }, + { + "epoch": 4.894250819183795, + "grad_norm": 9.273098945617676, + "learning_rate": 4.6877957869055414e-05, + "loss": 0.2519, + "num_input_tokens_seen": 19062304, + "step": 32860 + }, + { + "epoch": 4.894995531724755, + "grad_norm": 11.167753219604492, + "learning_rate": 4.6876385259919984e-05, + "loss": 0.3793, + "num_input_tokens_seen": 19065184, + "step": 32865 + }, + { + "epoch": 4.895740244265713, + "grad_norm": 31.53228187561035, + "learning_rate": 4.687481228120526e-05, + "loss": 0.6348, + "num_input_tokens_seen": 19068256, + "step": 32870 + }, + { + "epoch": 4.896484956806672, + "grad_norm": 23.01788902282715, + "learning_rate": 4.687323893293781e-05, + "loss": 0.2722, + "num_input_tokens_seen": 19071104, + "step": 32875 + }, + { + "epoch": 4.897229669347632, + "grad_norm": 78.77637481689453, + "learning_rate": 4.687166521514423e-05, + "loss": 0.4075, + "num_input_tokens_seen": 19074144, + "step": 32880 + }, + { + "epoch": 4.8979743818885915, + "grad_norm": 2.671935558319092, + "learning_rate": 4.687009112785109e-05, + "loss": 0.5733, + "num_input_tokens_seen": 19077184, + "step": 32885 + }, + { + "epoch": 4.89871909442955, + "grad_norm": 11.612462043762207, + "learning_rate": 4.686851667108499e-05, + "loss": 0.1607, + "num_input_tokens_seen": 19080128, + "step": 32890 + }, + { + "epoch": 4.899463806970509, + "grad_norm": 15.786680221557617, + "learning_rate": 4.686694184487253e-05, + "loss": 0.1852, + "num_input_tokens_seen": 19083104, + "step": 32895 + }, + { + "epoch": 4.900208519511469, + "grad_norm": 42.87122344970703, + "learning_rate": 4.68653666492403e-05, + "loss": 0.5289, + "num_input_tokens_seen": 19085856, + "step": 32900 + }, + { + "epoch": 4.900953232052427, + "grad_norm": 11.996088027954102, + "learning_rate": 4.686379108421493e-05, + "loss": 0.2182, + "num_input_tokens_seen": 19088704, + "step": 32905 + }, + { + "epoch": 4.901697944593387, + "grad_norm": 16.525318145751953, + "learning_rate": 4.686221514982303e-05, + "loss": 0.6452, + "num_input_tokens_seen": 19091776, + "step": 32910 + }, + { + "epoch": 4.902442657134346, + "grad_norm": 11.60373592376709, + "learning_rate": 4.686063884609122e-05, + "loss": 0.4634, + "num_input_tokens_seen": 19094976, + "step": 32915 + }, + { + "epoch": 4.903187369675305, + "grad_norm": 30.640939712524414, + "learning_rate": 4.685906217304615e-05, + "loss": 0.3569, + "num_input_tokens_seen": 19097856, + "step": 32920 + }, + { + "epoch": 4.903932082216264, + "grad_norm": 3.4244892597198486, + "learning_rate": 4.685748513071443e-05, + "loss": 0.4837, + "num_input_tokens_seen": 19100672, + "step": 32925 + }, + { + "epoch": 4.904676794757224, + "grad_norm": 5.5495285987854, + "learning_rate": 4.685590771912272e-05, + "loss": 0.4213, + "num_input_tokens_seen": 19103680, + "step": 32930 + }, + { + "epoch": 4.905421507298183, + "grad_norm": 10.762871742248535, + "learning_rate": 4.685432993829765e-05, + "loss": 0.3615, + "num_input_tokens_seen": 19106464, + "step": 32935 + }, + { + "epoch": 4.906166219839142, + "grad_norm": 11.755558013916016, + "learning_rate": 4.6852751788265895e-05, + "loss": 0.2809, + "num_input_tokens_seen": 19109632, + "step": 32940 + }, + { + "epoch": 4.906910932380101, + "grad_norm": 8.038955688476562, + "learning_rate": 4.6851173269054116e-05, + "loss": 0.3639, + "num_input_tokens_seen": 19112416, + "step": 32945 + }, + { + "epoch": 4.907655644921061, + "grad_norm": 11.425459861755371, + "learning_rate": 4.6849594380688966e-05, + "loss": 0.2743, + "num_input_tokens_seen": 19115552, + "step": 32950 + }, + { + "epoch": 4.908400357462019, + "grad_norm": 3.9291810989379883, + "learning_rate": 4.684801512319712e-05, + "loss": 0.1215, + "num_input_tokens_seen": 19118368, + "step": 32955 + }, + { + "epoch": 4.909145070002979, + "grad_norm": 17.147232055664062, + "learning_rate": 4.6846435496605275e-05, + "loss": 0.4311, + "num_input_tokens_seen": 19121184, + "step": 32960 + }, + { + "epoch": 4.909889782543938, + "grad_norm": 7.2294602394104, + "learning_rate": 4.6844855500940096e-05, + "loss": 0.4705, + "num_input_tokens_seen": 19123968, + "step": 32965 + }, + { + "epoch": 4.9106344950848975, + "grad_norm": 33.69186019897461, + "learning_rate": 4.684327513622829e-05, + "loss": 0.329, + "num_input_tokens_seen": 19126976, + "step": 32970 + }, + { + "epoch": 4.911379207625856, + "grad_norm": 32.730770111083984, + "learning_rate": 4.684169440249656e-05, + "loss": 0.2088, + "num_input_tokens_seen": 19130016, + "step": 32975 + }, + { + "epoch": 4.912123920166816, + "grad_norm": 10.117905616760254, + "learning_rate": 4.684011329977159e-05, + "loss": 0.4278, + "num_input_tokens_seen": 19132928, + "step": 32980 + }, + { + "epoch": 4.912868632707775, + "grad_norm": 10.503634452819824, + "learning_rate": 4.6838531828080104e-05, + "loss": 0.3627, + "num_input_tokens_seen": 19136192, + "step": 32985 + }, + { + "epoch": 4.913613345248734, + "grad_norm": 7.330177307128906, + "learning_rate": 4.6836949987448824e-05, + "loss": 0.2759, + "num_input_tokens_seen": 19139296, + "step": 32990 + }, + { + "epoch": 4.914358057789693, + "grad_norm": 8.834185600280762, + "learning_rate": 4.6835367777904466e-05, + "loss": 0.4847, + "num_input_tokens_seen": 19142240, + "step": 32995 + }, + { + "epoch": 4.915102770330653, + "grad_norm": 4.823785305023193, + "learning_rate": 4.6833785199473756e-05, + "loss": 0.4606, + "num_input_tokens_seen": 19145184, + "step": 33000 + }, + { + "epoch": 4.915847482871611, + "grad_norm": 20.64963150024414, + "learning_rate": 4.683220225218344e-05, + "loss": 0.3895, + "num_input_tokens_seen": 19148032, + "step": 33005 + }, + { + "epoch": 4.916592195412571, + "grad_norm": 17.39266014099121, + "learning_rate": 4.683061893606026e-05, + "loss": 0.352, + "num_input_tokens_seen": 19150720, + "step": 33010 + }, + { + "epoch": 4.91733690795353, + "grad_norm": 7.599581241607666, + "learning_rate": 4.682903525113096e-05, + "loss": 0.1834, + "num_input_tokens_seen": 19153408, + "step": 33015 + }, + { + "epoch": 4.9180816204944895, + "grad_norm": 13.27093505859375, + "learning_rate": 4.682745119742229e-05, + "loss": 0.2505, + "num_input_tokens_seen": 19156544, + "step": 33020 + }, + { + "epoch": 4.918826333035448, + "grad_norm": 26.255582809448242, + "learning_rate": 4.682586677496102e-05, + "loss": 0.3067, + "num_input_tokens_seen": 19159360, + "step": 33025 + }, + { + "epoch": 4.919571045576408, + "grad_norm": 4.5513410568237305, + "learning_rate": 4.6824281983773914e-05, + "loss": 0.1615, + "num_input_tokens_seen": 19162272, + "step": 33030 + }, + { + "epoch": 4.920315758117367, + "grad_norm": 9.489853858947754, + "learning_rate": 4.682269682388775e-05, + "loss": 0.3109, + "num_input_tokens_seen": 19165024, + "step": 33035 + }, + { + "epoch": 4.921060470658325, + "grad_norm": 15.363826751708984, + "learning_rate": 4.6821111295329294e-05, + "loss": 0.5998, + "num_input_tokens_seen": 19167872, + "step": 33040 + }, + { + "epoch": 4.921805183199285, + "grad_norm": 16.38674545288086, + "learning_rate": 4.681952539812534e-05, + "loss": 0.48, + "num_input_tokens_seen": 19170848, + "step": 33045 + }, + { + "epoch": 4.922549895740245, + "grad_norm": 6.006124973297119, + "learning_rate": 4.681793913230269e-05, + "loss": 0.4238, + "num_input_tokens_seen": 19173760, + "step": 33050 + }, + { + "epoch": 4.9232946082812035, + "grad_norm": 1.5294398069381714, + "learning_rate": 4.6816352497888125e-05, + "loss": 0.1213, + "num_input_tokens_seen": 19176672, + "step": 33055 + }, + { + "epoch": 4.924039320822162, + "grad_norm": 31.816987991333008, + "learning_rate": 4.6814765494908465e-05, + "loss": 0.6099, + "num_input_tokens_seen": 19179680, + "step": 33060 + }, + { + "epoch": 4.924784033363122, + "grad_norm": 1.2944657802581787, + "learning_rate": 4.681317812339051e-05, + "loss": 0.3691, + "num_input_tokens_seen": 19182624, + "step": 33065 + }, + { + "epoch": 4.9255287459040815, + "grad_norm": 0.9470286965370178, + "learning_rate": 4.681159038336108e-05, + "loss": 0.3947, + "num_input_tokens_seen": 19185344, + "step": 33070 + }, + { + "epoch": 4.92627345844504, + "grad_norm": 5.734231948852539, + "learning_rate": 4.6810002274847e-05, + "loss": 0.2039, + "num_input_tokens_seen": 19188288, + "step": 33075 + }, + { + "epoch": 4.927018170985999, + "grad_norm": 3.927624464035034, + "learning_rate": 4.680841379787509e-05, + "loss": 0.6047, + "num_input_tokens_seen": 19191264, + "step": 33080 + }, + { + "epoch": 4.927762883526959, + "grad_norm": 18.829504013061523, + "learning_rate": 4.6806824952472204e-05, + "loss": 0.5269, + "num_input_tokens_seen": 19194368, + "step": 33085 + }, + { + "epoch": 4.928507596067917, + "grad_norm": 12.823393821716309, + "learning_rate": 4.6805235738665164e-05, + "loss": 0.4496, + "num_input_tokens_seen": 19197216, + "step": 33090 + }, + { + "epoch": 4.929252308608877, + "grad_norm": 13.294321060180664, + "learning_rate": 4.680364615648084e-05, + "loss": 0.2727, + "num_input_tokens_seen": 19200384, + "step": 33095 + }, + { + "epoch": 4.929997021149836, + "grad_norm": 25.327693939208984, + "learning_rate": 4.680205620594606e-05, + "loss": 0.1997, + "num_input_tokens_seen": 19203488, + "step": 33100 + }, + { + "epoch": 4.9307417336907955, + "grad_norm": 29.120466232299805, + "learning_rate": 4.680046588708772e-05, + "loss": 0.3699, + "num_input_tokens_seen": 19206784, + "step": 33105 + }, + { + "epoch": 4.931486446231754, + "grad_norm": 37.95854949951172, + "learning_rate": 4.679887519993265e-05, + "loss": 0.3402, + "num_input_tokens_seen": 19209696, + "step": 33110 + }, + { + "epoch": 4.932231158772714, + "grad_norm": 1.6396468877792358, + "learning_rate": 4.679728414450774e-05, + "loss": 0.1144, + "num_input_tokens_seen": 19212544, + "step": 33115 + }, + { + "epoch": 4.932975871313673, + "grad_norm": 12.464171409606934, + "learning_rate": 4.679569272083987e-05, + "loss": 0.6895, + "num_input_tokens_seen": 19215648, + "step": 33120 + }, + { + "epoch": 4.933720583854632, + "grad_norm": 0.38219401240348816, + "learning_rate": 4.6794100928955934e-05, + "loss": 0.1192, + "num_input_tokens_seen": 19218208, + "step": 33125 + }, + { + "epoch": 4.934465296395591, + "grad_norm": 8.179093360900879, + "learning_rate": 4.67925087688828e-05, + "loss": 0.343, + "num_input_tokens_seen": 19221376, + "step": 33130 + }, + { + "epoch": 4.935210008936551, + "grad_norm": 11.02768611907959, + "learning_rate": 4.679091624064738e-05, + "loss": 0.1258, + "num_input_tokens_seen": 19224160, + "step": 33135 + }, + { + "epoch": 4.9359547214775095, + "grad_norm": 29.525943756103516, + "learning_rate": 4.678932334427658e-05, + "loss": 0.3618, + "num_input_tokens_seen": 19227136, + "step": 33140 + }, + { + "epoch": 4.936699434018469, + "grad_norm": 5.720949172973633, + "learning_rate": 4.678773007979731e-05, + "loss": 0.199, + "num_input_tokens_seen": 19229856, + "step": 33145 + }, + { + "epoch": 4.937444146559428, + "grad_norm": 18.746274948120117, + "learning_rate": 4.678613644723649e-05, + "loss": 0.5439, + "num_input_tokens_seen": 19232576, + "step": 33150 + }, + { + "epoch": 4.9381888591003875, + "grad_norm": 6.2934980392456055, + "learning_rate": 4.6784542446621026e-05, + "loss": 0.4372, + "num_input_tokens_seen": 19235488, + "step": 33155 + }, + { + "epoch": 4.938933571641346, + "grad_norm": 15.31105899810791, + "learning_rate": 4.678294807797786e-05, + "loss": 0.2661, + "num_input_tokens_seen": 19238304, + "step": 33160 + }, + { + "epoch": 4.939678284182306, + "grad_norm": 12.892080307006836, + "learning_rate": 4.6781353341333926e-05, + "loss": 0.5405, + "num_input_tokens_seen": 19241216, + "step": 33165 + }, + { + "epoch": 4.940422996723265, + "grad_norm": 7.244167327880859, + "learning_rate": 4.6779758236716165e-05, + "loss": 0.4667, + "num_input_tokens_seen": 19244064, + "step": 33170 + }, + { + "epoch": 4.941167709264224, + "grad_norm": 55.48810958862305, + "learning_rate": 4.677816276415153e-05, + "loss": 0.2764, + "num_input_tokens_seen": 19246784, + "step": 33175 + }, + { + "epoch": 4.941912421805183, + "grad_norm": 43.837894439697266, + "learning_rate": 4.677656692366696e-05, + "loss": 0.458, + "num_input_tokens_seen": 19249696, + "step": 33180 + }, + { + "epoch": 4.942657134346143, + "grad_norm": 6.045255184173584, + "learning_rate": 4.677497071528944e-05, + "loss": 0.3457, + "num_input_tokens_seen": 19252704, + "step": 33185 + }, + { + "epoch": 4.9434018468871015, + "grad_norm": 10.349239349365234, + "learning_rate": 4.67733741390459e-05, + "loss": 0.2145, + "num_input_tokens_seen": 19255424, + "step": 33190 + }, + { + "epoch": 4.944146559428061, + "grad_norm": 10.809676170349121, + "learning_rate": 4.677177719496335e-05, + "loss": 0.4306, + "num_input_tokens_seen": 19258400, + "step": 33195 + }, + { + "epoch": 4.94489127196902, + "grad_norm": 10.497895240783691, + "learning_rate": 4.677017988306874e-05, + "loss": 0.1534, + "num_input_tokens_seen": 19261216, + "step": 33200 + }, + { + "epoch": 4.945635984509979, + "grad_norm": 8.27303695678711, + "learning_rate": 4.676858220338908e-05, + "loss": 0.4205, + "num_input_tokens_seen": 19264256, + "step": 33205 + }, + { + "epoch": 4.946380697050938, + "grad_norm": 13.233993530273438, + "learning_rate": 4.676698415595134e-05, + "loss": 0.3032, + "num_input_tokens_seen": 19267360, + "step": 33210 + }, + { + "epoch": 4.947125409591898, + "grad_norm": 15.978385925292969, + "learning_rate": 4.676538574078253e-05, + "loss": 0.2549, + "num_input_tokens_seen": 19270208, + "step": 33215 + }, + { + "epoch": 4.947870122132857, + "grad_norm": 2.226759672164917, + "learning_rate": 4.676378695790964e-05, + "loss": 0.2585, + "num_input_tokens_seen": 19273056, + "step": 33220 + }, + { + "epoch": 4.9486148346738155, + "grad_norm": 4.221363544464111, + "learning_rate": 4.67621878073597e-05, + "loss": 0.3377, + "num_input_tokens_seen": 19276000, + "step": 33225 + }, + { + "epoch": 4.949359547214775, + "grad_norm": 5.279568672180176, + "learning_rate": 4.676058828915971e-05, + "loss": 0.3649, + "num_input_tokens_seen": 19278752, + "step": 33230 + }, + { + "epoch": 4.950104259755735, + "grad_norm": 5.5606689453125, + "learning_rate": 4.67589884033367e-05, + "loss": 0.2396, + "num_input_tokens_seen": 19281632, + "step": 33235 + }, + { + "epoch": 4.9508489722966935, + "grad_norm": 4.354998588562012, + "learning_rate": 4.675738814991769e-05, + "loss": 0.1807, + "num_input_tokens_seen": 19284736, + "step": 33240 + }, + { + "epoch": 4.951593684837652, + "grad_norm": 1.8966702222824097, + "learning_rate": 4.6755787528929726e-05, + "loss": 0.5561, + "num_input_tokens_seen": 19287616, + "step": 33245 + }, + { + "epoch": 4.952338397378612, + "grad_norm": 2.9768331050872803, + "learning_rate": 4.675418654039984e-05, + "loss": 0.1625, + "num_input_tokens_seen": 19290304, + "step": 33250 + }, + { + "epoch": 4.953083109919571, + "grad_norm": 0.3379618525505066, + "learning_rate": 4.6752585184355084e-05, + "loss": 0.2263, + "num_input_tokens_seen": 19293568, + "step": 33255 + }, + { + "epoch": 4.95382782246053, + "grad_norm": 10.701478958129883, + "learning_rate": 4.675098346082251e-05, + "loss": 0.2887, + "num_input_tokens_seen": 19296704, + "step": 33260 + }, + { + "epoch": 4.954572535001489, + "grad_norm": 30.41191864013672, + "learning_rate": 4.674938136982918e-05, + "loss": 0.4493, + "num_input_tokens_seen": 19299680, + "step": 33265 + }, + { + "epoch": 4.955317247542449, + "grad_norm": 1.9188390970230103, + "learning_rate": 4.674777891140215e-05, + "loss": 0.3342, + "num_input_tokens_seen": 19302528, + "step": 33270 + }, + { + "epoch": 4.9560619600834075, + "grad_norm": 4.652761936187744, + "learning_rate": 4.6746176085568506e-05, + "loss": 0.2668, + "num_input_tokens_seen": 19305696, + "step": 33275 + }, + { + "epoch": 4.956806672624367, + "grad_norm": 1.3055684566497803, + "learning_rate": 4.674457289235531e-05, + "loss": 0.4011, + "num_input_tokens_seen": 19308544, + "step": 33280 + }, + { + "epoch": 4.957551385165326, + "grad_norm": 0.4446766674518585, + "learning_rate": 4.674296933178967e-05, + "loss": 0.5241, + "num_input_tokens_seen": 19311680, + "step": 33285 + }, + { + "epoch": 4.9582960977062855, + "grad_norm": 12.55249309539795, + "learning_rate": 4.674136540389864e-05, + "loss": 0.4681, + "num_input_tokens_seen": 19314592, + "step": 33290 + }, + { + "epoch": 4.959040810247244, + "grad_norm": 16.72296142578125, + "learning_rate": 4.6739761108709356e-05, + "loss": 0.4195, + "num_input_tokens_seen": 19317440, + "step": 33295 + }, + { + "epoch": 4.959785522788204, + "grad_norm": 25.915037155151367, + "learning_rate": 4.673815644624889e-05, + "loss": 0.5117, + "num_input_tokens_seen": 19320512, + "step": 33300 + }, + { + "epoch": 4.960530235329163, + "grad_norm": 4.655110836029053, + "learning_rate": 4.673655141654438e-05, + "loss": 0.3155, + "num_input_tokens_seen": 19323488, + "step": 33305 + }, + { + "epoch": 4.961274947870122, + "grad_norm": 10.460533142089844, + "learning_rate": 4.673494601962292e-05, + "loss": 0.3709, + "num_input_tokens_seen": 19326272, + "step": 33310 + }, + { + "epoch": 4.962019660411081, + "grad_norm": 34.10285568237305, + "learning_rate": 4.673334025551164e-05, + "loss": 0.4574, + "num_input_tokens_seen": 19329248, + "step": 33315 + }, + { + "epoch": 4.962764372952041, + "grad_norm": 29.817840576171875, + "learning_rate": 4.6731734124237654e-05, + "loss": 0.3538, + "num_input_tokens_seen": 19332352, + "step": 33320 + }, + { + "epoch": 4.9635090854929995, + "grad_norm": 19.303394317626953, + "learning_rate": 4.6730127625828113e-05, + "loss": 0.2264, + "num_input_tokens_seen": 19335168, + "step": 33325 + }, + { + "epoch": 4.964253798033959, + "grad_norm": 5.18964147567749, + "learning_rate": 4.672852076031015e-05, + "loss": 0.2159, + "num_input_tokens_seen": 19337984, + "step": 33330 + }, + { + "epoch": 4.964998510574918, + "grad_norm": 16.73362922668457, + "learning_rate": 4.6726913527710915e-05, + "loss": 0.2096, + "num_input_tokens_seen": 19340896, + "step": 33335 + }, + { + "epoch": 4.965743223115878, + "grad_norm": 8.505550384521484, + "learning_rate": 4.672530592805756e-05, + "loss": 0.4382, + "num_input_tokens_seen": 19343520, + "step": 33340 + }, + { + "epoch": 4.966487935656836, + "grad_norm": 7.559093952178955, + "learning_rate": 4.672369796137724e-05, + "loss": 0.215, + "num_input_tokens_seen": 19346432, + "step": 33345 + }, + { + "epoch": 4.967232648197796, + "grad_norm": 21.24867820739746, + "learning_rate": 4.672208962769713e-05, + "loss": 0.3831, + "num_input_tokens_seen": 19349312, + "step": 33350 + }, + { + "epoch": 4.967977360738755, + "grad_norm": 9.874444961547852, + "learning_rate": 4.672048092704438e-05, + "loss": 0.199, + "num_input_tokens_seen": 19352128, + "step": 33355 + }, + { + "epoch": 4.968722073279714, + "grad_norm": 1.1319037675857544, + "learning_rate": 4.671887185944618e-05, + "loss": 0.1983, + "num_input_tokens_seen": 19355264, + "step": 33360 + }, + { + "epoch": 4.969466785820673, + "grad_norm": 19.939537048339844, + "learning_rate": 4.671726242492972e-05, + "loss": 0.4629, + "num_input_tokens_seen": 19358368, + "step": 33365 + }, + { + "epoch": 4.970211498361633, + "grad_norm": 36.20964431762695, + "learning_rate": 4.671565262352219e-05, + "loss": 0.5715, + "num_input_tokens_seen": 19361344, + "step": 33370 + }, + { + "epoch": 4.9709562109025915, + "grad_norm": 16.463613510131836, + "learning_rate": 4.671404245525077e-05, + "loss": 0.5942, + "num_input_tokens_seen": 19364224, + "step": 33375 + }, + { + "epoch": 4.971700923443551, + "grad_norm": 7.40077018737793, + "learning_rate": 4.671243192014267e-05, + "loss": 0.4392, + "num_input_tokens_seen": 19367168, + "step": 33380 + }, + { + "epoch": 4.97244563598451, + "grad_norm": 16.4559268951416, + "learning_rate": 4.6710821018225104e-05, + "loss": 0.3411, + "num_input_tokens_seen": 19370144, + "step": 33385 + }, + { + "epoch": 4.973190348525469, + "grad_norm": 6.38187313079834, + "learning_rate": 4.670920974952529e-05, + "loss": 0.1535, + "num_input_tokens_seen": 19372992, + "step": 33390 + }, + { + "epoch": 4.973935061066428, + "grad_norm": 7.294907093048096, + "learning_rate": 4.6707598114070436e-05, + "loss": 0.369, + "num_input_tokens_seen": 19375648, + "step": 33395 + }, + { + "epoch": 4.974679773607388, + "grad_norm": 12.236798286437988, + "learning_rate": 4.6705986111887765e-05, + "loss": 0.2572, + "num_input_tokens_seen": 19378432, + "step": 33400 + }, + { + "epoch": 4.975424486148347, + "grad_norm": 28.530664443969727, + "learning_rate": 4.6704373743004534e-05, + "loss": 0.271, + "num_input_tokens_seen": 19381472, + "step": 33405 + }, + { + "epoch": 4.9761691986893055, + "grad_norm": 2.1617963314056396, + "learning_rate": 4.670276100744796e-05, + "loss": 0.3962, + "num_input_tokens_seen": 19384384, + "step": 33410 + }, + { + "epoch": 4.976913911230265, + "grad_norm": 23.434051513671875, + "learning_rate": 4.67011479052453e-05, + "loss": 0.2444, + "num_input_tokens_seen": 19387200, + "step": 33415 + }, + { + "epoch": 4.977658623771224, + "grad_norm": 4.686889171600342, + "learning_rate": 4.66995344364238e-05, + "loss": 0.2865, + "num_input_tokens_seen": 19390080, + "step": 33420 + }, + { + "epoch": 4.978403336312184, + "grad_norm": 6.7400803565979, + "learning_rate": 4.6697920601010724e-05, + "loss": 0.5766, + "num_input_tokens_seen": 19392896, + "step": 33425 + }, + { + "epoch": 4.979148048853142, + "grad_norm": 11.014707565307617, + "learning_rate": 4.669630639903333e-05, + "loss": 0.2756, + "num_input_tokens_seen": 19395776, + "step": 33430 + }, + { + "epoch": 4.979892761394102, + "grad_norm": 10.60361099243164, + "learning_rate": 4.669469183051889e-05, + "loss": 0.5065, + "num_input_tokens_seen": 19398816, + "step": 33435 + }, + { + "epoch": 4.980637473935061, + "grad_norm": 4.741782188415527, + "learning_rate": 4.669307689549468e-05, + "loss": 0.1304, + "num_input_tokens_seen": 19401760, + "step": 33440 + }, + { + "epoch": 4.98138218647602, + "grad_norm": 12.145779609680176, + "learning_rate": 4.6691461593987985e-05, + "loss": 0.3771, + "num_input_tokens_seen": 19404640, + "step": 33445 + }, + { + "epoch": 4.982126899016979, + "grad_norm": 11.557308197021484, + "learning_rate": 4.668984592602609e-05, + "loss": 0.2179, + "num_input_tokens_seen": 19407328, + "step": 33450 + }, + { + "epoch": 4.982871611557939, + "grad_norm": 22.214374542236328, + "learning_rate": 4.66882298916363e-05, + "loss": 0.4781, + "num_input_tokens_seen": 19410400, + "step": 33455 + }, + { + "epoch": 4.9836163240988975, + "grad_norm": 15.667437553405762, + "learning_rate": 4.66866134908459e-05, + "loss": 0.1922, + "num_input_tokens_seen": 19413536, + "step": 33460 + }, + { + "epoch": 4.984361036639857, + "grad_norm": 8.609918594360352, + "learning_rate": 4.668499672368221e-05, + "loss": 0.4295, + "num_input_tokens_seen": 19416384, + "step": 33465 + }, + { + "epoch": 4.985105749180816, + "grad_norm": 12.413704872131348, + "learning_rate": 4.668337959017254e-05, + "loss": 0.1353, + "num_input_tokens_seen": 19419200, + "step": 33470 + }, + { + "epoch": 4.985850461721776, + "grad_norm": 5.52083683013916, + "learning_rate": 4.668176209034421e-05, + "loss": 0.2201, + "num_input_tokens_seen": 19422464, + "step": 33475 + }, + { + "epoch": 4.986595174262734, + "grad_norm": 15.846487045288086, + "learning_rate": 4.668014422422455e-05, + "loss": 0.2793, + "num_input_tokens_seen": 19425312, + "step": 33480 + }, + { + "epoch": 4.987339886803694, + "grad_norm": 3.672468662261963, + "learning_rate": 4.6678525991840886e-05, + "loss": 0.2999, + "num_input_tokens_seen": 19428096, + "step": 33485 + }, + { + "epoch": 4.988084599344653, + "grad_norm": 10.887043952941895, + "learning_rate": 4.667690739322055e-05, + "loss": 0.373, + "num_input_tokens_seen": 19431104, + "step": 33490 + }, + { + "epoch": 4.988829311885612, + "grad_norm": 1.1425544023513794, + "learning_rate": 4.667528842839091e-05, + "loss": 0.3586, + "num_input_tokens_seen": 19433952, + "step": 33495 + }, + { + "epoch": 4.989574024426571, + "grad_norm": 35.94989776611328, + "learning_rate": 4.6673669097379294e-05, + "loss": 0.5633, + "num_input_tokens_seen": 19437088, + "step": 33500 + }, + { + "epoch": 4.990318736967531, + "grad_norm": 7.51611852645874, + "learning_rate": 4.6672049400213056e-05, + "loss": 0.1742, + "num_input_tokens_seen": 19440000, + "step": 33505 + }, + { + "epoch": 4.99106344950849, + "grad_norm": 9.140088081359863, + "learning_rate": 4.6670429336919585e-05, + "loss": 0.2291, + "num_input_tokens_seen": 19442784, + "step": 33510 + }, + { + "epoch": 4.991808162049449, + "grad_norm": 34.139713287353516, + "learning_rate": 4.666880890752623e-05, + "loss": 0.5288, + "num_input_tokens_seen": 19445664, + "step": 33515 + }, + { + "epoch": 4.992552874590408, + "grad_norm": 60.36414337158203, + "learning_rate": 4.6667188112060365e-05, + "loss": 0.3089, + "num_input_tokens_seen": 19448128, + "step": 33520 + }, + { + "epoch": 4.993297587131368, + "grad_norm": 7.989266872406006, + "learning_rate": 4.666556695054939e-05, + "loss": 0.3046, + "num_input_tokens_seen": 19451040, + "step": 33525 + }, + { + "epoch": 4.994042299672326, + "grad_norm": 24.202051162719727, + "learning_rate": 4.666394542302068e-05, + "loss": 0.4814, + "num_input_tokens_seen": 19453888, + "step": 33530 + }, + { + "epoch": 4.994787012213286, + "grad_norm": 6.0997443199157715, + "learning_rate": 4.6662323529501625e-05, + "loss": 0.2441, + "num_input_tokens_seen": 19457056, + "step": 33535 + }, + { + "epoch": 4.995531724754245, + "grad_norm": 9.504308700561523, + "learning_rate": 4.666070127001963e-05, + "loss": 0.2909, + "num_input_tokens_seen": 19459936, + "step": 33540 + }, + { + "epoch": 4.996276437295204, + "grad_norm": 0.14429530501365662, + "learning_rate": 4.6659078644602103e-05, + "loss": 0.2654, + "num_input_tokens_seen": 19462848, + "step": 33545 + }, + { + "epoch": 4.997021149836163, + "grad_norm": 13.575499534606934, + "learning_rate": 4.665745565327646e-05, + "loss": 0.4306, + "num_input_tokens_seen": 19465696, + "step": 33550 + }, + { + "epoch": 4.997765862377122, + "grad_norm": 8.696622848510742, + "learning_rate": 4.665583229607011e-05, + "loss": 0.1403, + "num_input_tokens_seen": 19468576, + "step": 33555 + }, + { + "epoch": 4.998510574918082, + "grad_norm": 16.75677490234375, + "learning_rate": 4.6654208573010484e-05, + "loss": 0.4694, + "num_input_tokens_seen": 19471424, + "step": 33560 + }, + { + "epoch": 4.999255287459041, + "grad_norm": 6.932664394378662, + "learning_rate": 4.665258448412502e-05, + "loss": 0.4117, + "num_input_tokens_seen": 19474208, + "step": 33565 + }, + { + "epoch": 5.0, + "grad_norm": 4.606500625610352, + "learning_rate": 4.665096002944114e-05, + "loss": 0.3445, + "num_input_tokens_seen": 19476576, + "step": 33570 + }, + { + "epoch": 5.0, + "eval_loss": 0.8774887919425964, + "eval_runtime": 51.2379, + "eval_samples_per_second": 58.238, + "eval_steps_per_second": 14.56, + "num_input_tokens_seen": 19476576, + "step": 33570 + }, + { + "epoch": 5.000744712540959, + "grad_norm": 36.191246032714844, + "learning_rate": 4.6649335208986294e-05, + "loss": 0.129, + "num_input_tokens_seen": 19479616, + "step": 33575 + }, + { + "epoch": 5.001489425081918, + "grad_norm": 10.457266807556152, + "learning_rate": 4.6647710022787935e-05, + "loss": 0.2042, + "num_input_tokens_seen": 19482464, + "step": 33580 + }, + { + "epoch": 5.002234137622877, + "grad_norm": 0.16777317225933075, + "learning_rate": 4.664608447087352e-05, + "loss": 0.0831, + "num_input_tokens_seen": 19485504, + "step": 33585 + }, + { + "epoch": 5.002978850163837, + "grad_norm": 2.151637077331543, + "learning_rate": 4.664445855327051e-05, + "loss": 0.2445, + "num_input_tokens_seen": 19488544, + "step": 33590 + }, + { + "epoch": 5.003723562704796, + "grad_norm": 4.722310543060303, + "learning_rate": 4.664283227000636e-05, + "loss": 0.1902, + "num_input_tokens_seen": 19491520, + "step": 33595 + }, + { + "epoch": 5.004468275245755, + "grad_norm": 28.935773849487305, + "learning_rate": 4.664120562110857e-05, + "loss": 0.2929, + "num_input_tokens_seen": 19494336, + "step": 33600 + }, + { + "epoch": 5.005212987786714, + "grad_norm": 15.439868927001953, + "learning_rate": 4.6639578606604596e-05, + "loss": 0.2646, + "num_input_tokens_seen": 19497248, + "step": 33605 + }, + { + "epoch": 5.005957700327674, + "grad_norm": 9.336736679077148, + "learning_rate": 4.6637951226521935e-05, + "loss": 0.1641, + "num_input_tokens_seen": 19500288, + "step": 33610 + }, + { + "epoch": 5.006702412868632, + "grad_norm": 14.262595176696777, + "learning_rate": 4.663632348088809e-05, + "loss": 0.24, + "num_input_tokens_seen": 19503488, + "step": 33615 + }, + { + "epoch": 5.007447125409592, + "grad_norm": 0.19209223985671997, + "learning_rate": 4.663469536973054e-05, + "loss": 0.4445, + "num_input_tokens_seen": 19506432, + "step": 33620 + }, + { + "epoch": 5.008191837950551, + "grad_norm": 22.62267303466797, + "learning_rate": 4.6633066893076804e-05, + "loss": 0.2892, + "num_input_tokens_seen": 19509248, + "step": 33625 + }, + { + "epoch": 5.00893655049151, + "grad_norm": 6.226907730102539, + "learning_rate": 4.663143805095439e-05, + "loss": 0.279, + "num_input_tokens_seen": 19511936, + "step": 33630 + }, + { + "epoch": 5.009681263032469, + "grad_norm": 1.9125157594680786, + "learning_rate": 4.662980884339081e-05, + "loss": 0.026, + "num_input_tokens_seen": 19514784, + "step": 33635 + }, + { + "epoch": 5.010425975573429, + "grad_norm": 10.307975769042969, + "learning_rate": 4.66281792704136e-05, + "loss": 0.2745, + "num_input_tokens_seen": 19517664, + "step": 33640 + }, + { + "epoch": 5.011170688114388, + "grad_norm": 0.2899531126022339, + "learning_rate": 4.6626549332050284e-05, + "loss": 0.0651, + "num_input_tokens_seen": 19520544, + "step": 33645 + }, + { + "epoch": 5.011915400655347, + "grad_norm": 0.7447577714920044, + "learning_rate": 4.6624919028328394e-05, + "loss": 0.2707, + "num_input_tokens_seen": 19523232, + "step": 33650 + }, + { + "epoch": 5.012660113196306, + "grad_norm": 18.502683639526367, + "learning_rate": 4.6623288359275474e-05, + "loss": 0.285, + "num_input_tokens_seen": 19526272, + "step": 33655 + }, + { + "epoch": 5.013404825737266, + "grad_norm": 8.185242652893066, + "learning_rate": 4.662165732491907e-05, + "loss": 0.1163, + "num_input_tokens_seen": 19529376, + "step": 33660 + }, + { + "epoch": 5.014149538278224, + "grad_norm": 28.593050003051758, + "learning_rate": 4.662002592528675e-05, + "loss": 0.2904, + "num_input_tokens_seen": 19532672, + "step": 33665 + }, + { + "epoch": 5.014894250819184, + "grad_norm": 16.987314224243164, + "learning_rate": 4.661839416040606e-05, + "loss": 0.1315, + "num_input_tokens_seen": 19535456, + "step": 33670 + }, + { + "epoch": 5.015638963360143, + "grad_norm": 15.083044052124023, + "learning_rate": 4.6616762030304576e-05, + "loss": 0.1431, + "num_input_tokens_seen": 19538592, + "step": 33675 + }, + { + "epoch": 5.0163836759011025, + "grad_norm": 5.3267741203308105, + "learning_rate": 4.661512953500987e-05, + "loss": 0.1296, + "num_input_tokens_seen": 19541632, + "step": 33680 + }, + { + "epoch": 5.017128388442061, + "grad_norm": 17.326322555541992, + "learning_rate": 4.661349667454951e-05, + "loss": 0.2, + "num_input_tokens_seen": 19544928, + "step": 33685 + }, + { + "epoch": 5.017873100983021, + "grad_norm": 38.843299865722656, + "learning_rate": 4.6611863448951096e-05, + "loss": 0.0385, + "num_input_tokens_seen": 19547680, + "step": 33690 + }, + { + "epoch": 5.01861781352398, + "grad_norm": 0.24436123669147491, + "learning_rate": 4.661022985824222e-05, + "loss": 0.2066, + "num_input_tokens_seen": 19550656, + "step": 33695 + }, + { + "epoch": 5.019362526064939, + "grad_norm": 25.434663772583008, + "learning_rate": 4.660859590245046e-05, + "loss": 0.2142, + "num_input_tokens_seen": 19553312, + "step": 33700 + }, + { + "epoch": 5.020107238605898, + "grad_norm": 43.395748138427734, + "learning_rate": 4.6606961581603446e-05, + "loss": 0.3301, + "num_input_tokens_seen": 19556480, + "step": 33705 + }, + { + "epoch": 5.020851951146858, + "grad_norm": 0.0051314933225512505, + "learning_rate": 4.6605326895728773e-05, + "loss": 0.2832, + "num_input_tokens_seen": 19559264, + "step": 33710 + }, + { + "epoch": 5.021596663687816, + "grad_norm": 4.780583381652832, + "learning_rate": 4.6603691844854065e-05, + "loss": 0.2256, + "num_input_tokens_seen": 19561984, + "step": 33715 + }, + { + "epoch": 5.022341376228776, + "grad_norm": 2.9576172828674316, + "learning_rate": 4.660205642900693e-05, + "loss": 0.2101, + "num_input_tokens_seen": 19565056, + "step": 33720 + }, + { + "epoch": 5.023086088769735, + "grad_norm": 40.0858154296875, + "learning_rate": 4.660042064821501e-05, + "loss": 0.1317, + "num_input_tokens_seen": 19567872, + "step": 33725 + }, + { + "epoch": 5.0238308013106945, + "grad_norm": 9.405439376831055, + "learning_rate": 4.659878450250595e-05, + "loss": 0.0971, + "num_input_tokens_seen": 19570784, + "step": 33730 + }, + { + "epoch": 5.024575513851653, + "grad_norm": 25.755325317382812, + "learning_rate": 4.6597147991907365e-05, + "loss": 0.242, + "num_input_tokens_seen": 19573472, + "step": 33735 + }, + { + "epoch": 5.025320226392613, + "grad_norm": 2.737074136734009, + "learning_rate": 4.659551111644692e-05, + "loss": 0.177, + "num_input_tokens_seen": 19576512, + "step": 33740 + }, + { + "epoch": 5.026064938933572, + "grad_norm": 0.01090728398412466, + "learning_rate": 4.659387387615226e-05, + "loss": 0.0355, + "num_input_tokens_seen": 19579136, + "step": 33745 + }, + { + "epoch": 5.02680965147453, + "grad_norm": 1.8200602531433105, + "learning_rate": 4.659223627105105e-05, + "loss": 0.5518, + "num_input_tokens_seen": 19581824, + "step": 33750 + }, + { + "epoch": 5.02755436401549, + "grad_norm": 0.014002884738147259, + "learning_rate": 4.659059830117095e-05, + "loss": 0.1274, + "num_input_tokens_seen": 19584832, + "step": 33755 + }, + { + "epoch": 5.028299076556449, + "grad_norm": 5.272397994995117, + "learning_rate": 4.658895996653964e-05, + "loss": 0.1584, + "num_input_tokens_seen": 19587648, + "step": 33760 + }, + { + "epoch": 5.0290437890974085, + "grad_norm": 0.15558059513568878, + "learning_rate": 4.658732126718479e-05, + "loss": 0.194, + "num_input_tokens_seen": 19590400, + "step": 33765 + }, + { + "epoch": 5.029788501638367, + "grad_norm": 19.3903751373291, + "learning_rate": 4.6585682203134094e-05, + "loss": 0.6657, + "num_input_tokens_seen": 19593376, + "step": 33770 + }, + { + "epoch": 5.030533214179327, + "grad_norm": 9.161725997924805, + "learning_rate": 4.658404277441523e-05, + "loss": 0.0257, + "num_input_tokens_seen": 19596064, + "step": 33775 + }, + { + "epoch": 5.031277926720286, + "grad_norm": 0.008873257786035538, + "learning_rate": 4.65824029810559e-05, + "loss": 0.1462, + "num_input_tokens_seen": 19598976, + "step": 33780 + }, + { + "epoch": 5.032022639261245, + "grad_norm": 22.212669372558594, + "learning_rate": 4.658076282308381e-05, + "loss": 0.1222, + "num_input_tokens_seen": 19601856, + "step": 33785 + }, + { + "epoch": 5.032767351802204, + "grad_norm": 37.412471771240234, + "learning_rate": 4.657912230052667e-05, + "loss": 0.1369, + "num_input_tokens_seen": 19604608, + "step": 33790 + }, + { + "epoch": 5.033512064343164, + "grad_norm": 33.001258850097656, + "learning_rate": 4.657748141341218e-05, + "loss": 0.1708, + "num_input_tokens_seen": 19608000, + "step": 33795 + }, + { + "epoch": 5.034256776884122, + "grad_norm": 58.688629150390625, + "learning_rate": 4.657584016176808e-05, + "loss": 0.2742, + "num_input_tokens_seen": 19611008, + "step": 33800 + }, + { + "epoch": 5.035001489425082, + "grad_norm": 11.95629596710205, + "learning_rate": 4.657419854562208e-05, + "loss": 0.3988, + "num_input_tokens_seen": 19614368, + "step": 33805 + }, + { + "epoch": 5.035746201966041, + "grad_norm": 1.1180528402328491, + "learning_rate": 4.657255656500193e-05, + "loss": 0.005, + "num_input_tokens_seen": 19617120, + "step": 33810 + }, + { + "epoch": 5.0364909145070005, + "grad_norm": 56.86683654785156, + "learning_rate": 4.657091421993536e-05, + "loss": 0.1023, + "num_input_tokens_seen": 19620160, + "step": 33815 + }, + { + "epoch": 5.037235627047959, + "grad_norm": 17.519527435302734, + "learning_rate": 4.656927151045012e-05, + "loss": 0.3533, + "num_input_tokens_seen": 19622976, + "step": 33820 + }, + { + "epoch": 5.037980339588919, + "grad_norm": 33.27791213989258, + "learning_rate": 4.656762843657396e-05, + "loss": 0.4052, + "num_input_tokens_seen": 19625952, + "step": 33825 + }, + { + "epoch": 5.038725052129878, + "grad_norm": 2.9224777221679688, + "learning_rate": 4.656598499833463e-05, + "loss": 0.1083, + "num_input_tokens_seen": 19628576, + "step": 33830 + }, + { + "epoch": 5.039469764670837, + "grad_norm": 6.662792682647705, + "learning_rate": 4.6564341195759915e-05, + "loss": 0.1088, + "num_input_tokens_seen": 19631616, + "step": 33835 + }, + { + "epoch": 5.040214477211796, + "grad_norm": 34.873451232910156, + "learning_rate": 4.656269702887757e-05, + "loss": 0.2094, + "num_input_tokens_seen": 19634176, + "step": 33840 + }, + { + "epoch": 5.040959189752756, + "grad_norm": 0.16602873802185059, + "learning_rate": 4.656105249771536e-05, + "loss": 0.3263, + "num_input_tokens_seen": 19636928, + "step": 33845 + }, + { + "epoch": 5.0417039022937145, + "grad_norm": 1.4446077346801758, + "learning_rate": 4.65594076023011e-05, + "loss": 0.2086, + "num_input_tokens_seen": 19639744, + "step": 33850 + }, + { + "epoch": 5.042448614834674, + "grad_norm": 0.03716954588890076, + "learning_rate": 4.655776234266255e-05, + "loss": 0.087, + "num_input_tokens_seen": 19642944, + "step": 33855 + }, + { + "epoch": 5.043193327375633, + "grad_norm": 30.960094451904297, + "learning_rate": 4.655611671882752e-05, + "loss": 0.579, + "num_input_tokens_seen": 19645920, + "step": 33860 + }, + { + "epoch": 5.0439380399165925, + "grad_norm": 0.07367071509361267, + "learning_rate": 4.655447073082381e-05, + "loss": 0.0023, + "num_input_tokens_seen": 19648576, + "step": 33865 + }, + { + "epoch": 5.044682752457551, + "grad_norm": 0.08001315593719482, + "learning_rate": 4.6552824378679216e-05, + "loss": 0.0377, + "num_input_tokens_seen": 19651328, + "step": 33870 + }, + { + "epoch": 5.045427464998511, + "grad_norm": 14.120153427124023, + "learning_rate": 4.655117766242156e-05, + "loss": 0.2065, + "num_input_tokens_seen": 19653952, + "step": 33875 + }, + { + "epoch": 5.04617217753947, + "grad_norm": 0.10497231781482697, + "learning_rate": 4.654953058207866e-05, + "loss": 0.4017, + "num_input_tokens_seen": 19657056, + "step": 33880 + }, + { + "epoch": 5.046916890080429, + "grad_norm": 10.620258331298828, + "learning_rate": 4.654788313767835e-05, + "loss": 0.1125, + "num_input_tokens_seen": 19660480, + "step": 33885 + }, + { + "epoch": 5.047661602621388, + "grad_norm": 0.24974319338798523, + "learning_rate": 4.654623532924845e-05, + "loss": 0.022, + "num_input_tokens_seen": 19663104, + "step": 33890 + }, + { + "epoch": 5.048406315162348, + "grad_norm": 36.664222717285156, + "learning_rate": 4.6544587156816806e-05, + "loss": 0.5595, + "num_input_tokens_seen": 19665888, + "step": 33895 + }, + { + "epoch": 5.0491510277033065, + "grad_norm": 0.05243193730711937, + "learning_rate": 4.6542938620411256e-05, + "loss": 0.2265, + "num_input_tokens_seen": 19668640, + "step": 33900 + }, + { + "epoch": 5.049895740244266, + "grad_norm": 10.715136528015137, + "learning_rate": 4.654128972005966e-05, + "loss": 0.4522, + "num_input_tokens_seen": 19671456, + "step": 33905 + }, + { + "epoch": 5.050640452785225, + "grad_norm": 0.7892820835113525, + "learning_rate": 4.653964045578986e-05, + "loss": 0.1181, + "num_input_tokens_seen": 19674400, + "step": 33910 + }, + { + "epoch": 5.0513851653261845, + "grad_norm": 38.97552490234375, + "learning_rate": 4.6537990827629726e-05, + "loss": 0.1964, + "num_input_tokens_seen": 19677312, + "step": 33915 + }, + { + "epoch": 5.052129877867143, + "grad_norm": 0.08074962347745895, + "learning_rate": 4.653634083560713e-05, + "loss": 0.1999, + "num_input_tokens_seen": 19680544, + "step": 33920 + }, + { + "epoch": 5.052874590408102, + "grad_norm": 194.81358337402344, + "learning_rate": 4.653469047974994e-05, + "loss": 0.3336, + "num_input_tokens_seen": 19683264, + "step": 33925 + }, + { + "epoch": 5.053619302949062, + "grad_norm": 8.592297554016113, + "learning_rate": 4.653303976008604e-05, + "loss": 0.3005, + "num_input_tokens_seen": 19686080, + "step": 33930 + }, + { + "epoch": 5.0543640154900205, + "grad_norm": 19.34542465209961, + "learning_rate": 4.6531388676643325e-05, + "loss": 0.291, + "num_input_tokens_seen": 19688896, + "step": 33935 + }, + { + "epoch": 5.05510872803098, + "grad_norm": 7.978439807891846, + "learning_rate": 4.6529737229449676e-05, + "loss": 0.3025, + "num_input_tokens_seen": 19691616, + "step": 33940 + }, + { + "epoch": 5.055853440571939, + "grad_norm": 13.954792022705078, + "learning_rate": 4.6528085418533004e-05, + "loss": 0.1657, + "num_input_tokens_seen": 19694784, + "step": 33945 + }, + { + "epoch": 5.0565981531128985, + "grad_norm": 0.33356598019599915, + "learning_rate": 4.652643324392121e-05, + "loss": 0.066, + "num_input_tokens_seen": 19697888, + "step": 33950 + }, + { + "epoch": 5.057342865653857, + "grad_norm": 29.50855255126953, + "learning_rate": 4.65247807056422e-05, + "loss": 0.1519, + "num_input_tokens_seen": 19700992, + "step": 33955 + }, + { + "epoch": 5.058087578194817, + "grad_norm": 12.978734970092773, + "learning_rate": 4.65231278037239e-05, + "loss": 0.2497, + "num_input_tokens_seen": 19703872, + "step": 33960 + }, + { + "epoch": 5.058832290735776, + "grad_norm": 0.20814716815948486, + "learning_rate": 4.652147453819423e-05, + "loss": 0.6584, + "num_input_tokens_seen": 19706944, + "step": 33965 + }, + { + "epoch": 5.059577003276735, + "grad_norm": 43.10578155517578, + "learning_rate": 4.651982090908112e-05, + "loss": 0.1413, + "num_input_tokens_seen": 19709664, + "step": 33970 + }, + { + "epoch": 5.060321715817694, + "grad_norm": 21.304149627685547, + "learning_rate": 4.6518166916412506e-05, + "loss": 0.0674, + "num_input_tokens_seen": 19712416, + "step": 33975 + }, + { + "epoch": 5.061066428358654, + "grad_norm": 54.12159729003906, + "learning_rate": 4.651651256021634e-05, + "loss": 0.3866, + "num_input_tokens_seen": 19715360, + "step": 33980 + }, + { + "epoch": 5.0618111408996125, + "grad_norm": 0.07436283677816391, + "learning_rate": 4.651485784052055e-05, + "loss": 0.0247, + "num_input_tokens_seen": 19718240, + "step": 33985 + }, + { + "epoch": 5.062555853440572, + "grad_norm": 0.024482935667037964, + "learning_rate": 4.6513202757353116e-05, + "loss": 0.1519, + "num_input_tokens_seen": 19721216, + "step": 33990 + }, + { + "epoch": 5.063300565981531, + "grad_norm": 17.58629608154297, + "learning_rate": 4.6511547310741984e-05, + "loss": 0.1986, + "num_input_tokens_seen": 19724064, + "step": 33995 + }, + { + "epoch": 5.0640452785224905, + "grad_norm": 7.011094570159912, + "learning_rate": 4.650989150071512e-05, + "loss": 0.1542, + "num_input_tokens_seen": 19726880, + "step": 34000 + }, + { + "epoch": 5.064789991063449, + "grad_norm": 21.883832931518555, + "learning_rate": 4.6508235327300496e-05, + "loss": 0.2758, + "num_input_tokens_seen": 19729664, + "step": 34005 + }, + { + "epoch": 5.065534703604409, + "grad_norm": 13.470619201660156, + "learning_rate": 4.65065787905261e-05, + "loss": 0.3568, + "num_input_tokens_seen": 19732704, + "step": 34010 + }, + { + "epoch": 5.066279416145368, + "grad_norm": 12.4507417678833, + "learning_rate": 4.650492189041992e-05, + "loss": 0.4444, + "num_input_tokens_seen": 19735488, + "step": 34015 + }, + { + "epoch": 5.067024128686327, + "grad_norm": 1.7899980545043945, + "learning_rate": 4.650326462700993e-05, + "loss": 0.063, + "num_input_tokens_seen": 19738144, + "step": 34020 + }, + { + "epoch": 5.067768841227286, + "grad_norm": 1.0361993312835693, + "learning_rate": 4.650160700032416e-05, + "loss": 0.0447, + "num_input_tokens_seen": 19741152, + "step": 34025 + }, + { + "epoch": 5.068513553768246, + "grad_norm": 15.423018455505371, + "learning_rate": 4.649994901039057e-05, + "loss": 0.1491, + "num_input_tokens_seen": 19744352, + "step": 34030 + }, + { + "epoch": 5.0692582663092045, + "grad_norm": 0.1442745476961136, + "learning_rate": 4.6498290657237205e-05, + "loss": 0.1766, + "num_input_tokens_seen": 19747136, + "step": 34035 + }, + { + "epoch": 5.070002978850164, + "grad_norm": 24.009441375732422, + "learning_rate": 4.649663194089207e-05, + "loss": 0.5747, + "num_input_tokens_seen": 19749984, + "step": 34040 + }, + { + "epoch": 5.070747691391123, + "grad_norm": 0.45217588543891907, + "learning_rate": 4.649497286138318e-05, + "loss": 0.1897, + "num_input_tokens_seen": 19753184, + "step": 34045 + }, + { + "epoch": 5.071492403932083, + "grad_norm": 16.322879791259766, + "learning_rate": 4.6493313418738564e-05, + "loss": 0.1208, + "num_input_tokens_seen": 19755776, + "step": 34050 + }, + { + "epoch": 5.072237116473041, + "grad_norm": 21.705698013305664, + "learning_rate": 4.649165361298628e-05, + "loss": 0.3619, + "num_input_tokens_seen": 19758496, + "step": 34055 + }, + { + "epoch": 5.072981829014001, + "grad_norm": 0.06730731576681137, + "learning_rate": 4.6489993444154334e-05, + "loss": 0.1923, + "num_input_tokens_seen": 19761440, + "step": 34060 + }, + { + "epoch": 5.07372654155496, + "grad_norm": 32.10942077636719, + "learning_rate": 4.64883329122708e-05, + "loss": 0.0409, + "num_input_tokens_seen": 19764160, + "step": 34065 + }, + { + "epoch": 5.074471254095919, + "grad_norm": 3.9659605026245117, + "learning_rate": 4.648667201736372e-05, + "loss": 0.0569, + "num_input_tokens_seen": 19767072, + "step": 34070 + }, + { + "epoch": 5.075215966636878, + "grad_norm": 6.792667388916016, + "learning_rate": 4.648501075946116e-05, + "loss": 0.0181, + "num_input_tokens_seen": 19770208, + "step": 34075 + }, + { + "epoch": 5.075960679177838, + "grad_norm": 12.088261604309082, + "learning_rate": 4.648334913859117e-05, + "loss": 0.6856, + "num_input_tokens_seen": 19773056, + "step": 34080 + }, + { + "epoch": 5.0767053917187965, + "grad_norm": 0.07423292100429535, + "learning_rate": 4.648168715478183e-05, + "loss": 0.1754, + "num_input_tokens_seen": 19776448, + "step": 34085 + }, + { + "epoch": 5.077450104259755, + "grad_norm": 32.10308074951172, + "learning_rate": 4.648002480806123e-05, + "loss": 0.3396, + "num_input_tokens_seen": 19779296, + "step": 34090 + }, + { + "epoch": 5.078194816800715, + "grad_norm": 16.777057647705078, + "learning_rate": 4.647836209845744e-05, + "loss": 0.3717, + "num_input_tokens_seen": 19782208, + "step": 34095 + }, + { + "epoch": 5.078939529341674, + "grad_norm": 0.11838080734014511, + "learning_rate": 4.647669902599854e-05, + "loss": 0.1554, + "num_input_tokens_seen": 19785024, + "step": 34100 + }, + { + "epoch": 5.079684241882633, + "grad_norm": 7.1824822425842285, + "learning_rate": 4.6475035590712646e-05, + "loss": 0.0227, + "num_input_tokens_seen": 19788064, + "step": 34105 + }, + { + "epoch": 5.080428954423592, + "grad_norm": 16.338546752929688, + "learning_rate": 4.6473371792627854e-05, + "loss": 0.1533, + "num_input_tokens_seen": 19790816, + "step": 34110 + }, + { + "epoch": 5.081173666964552, + "grad_norm": 33.102516174316406, + "learning_rate": 4.6471707631772267e-05, + "loss": 0.4479, + "num_input_tokens_seen": 19793760, + "step": 34115 + }, + { + "epoch": 5.0819183795055105, + "grad_norm": 23.51821517944336, + "learning_rate": 4.6470043108174e-05, + "loss": 0.7109, + "num_input_tokens_seen": 19796768, + "step": 34120 + }, + { + "epoch": 5.08266309204647, + "grad_norm": 14.018034934997559, + "learning_rate": 4.6468378221861175e-05, + "loss": 0.1841, + "num_input_tokens_seen": 19799456, + "step": 34125 + }, + { + "epoch": 5.083407804587429, + "grad_norm": 1.3196678161621094, + "learning_rate": 4.646671297286193e-05, + "loss": 0.2652, + "num_input_tokens_seen": 19802464, + "step": 34130 + }, + { + "epoch": 5.084152517128389, + "grad_norm": 40.10786437988281, + "learning_rate": 4.646504736120438e-05, + "loss": 0.202, + "num_input_tokens_seen": 19805088, + "step": 34135 + }, + { + "epoch": 5.084897229669347, + "grad_norm": 26.18003273010254, + "learning_rate": 4.646338138691667e-05, + "loss": 0.3019, + "num_input_tokens_seen": 19808000, + "step": 34140 + }, + { + "epoch": 5.085641942210307, + "grad_norm": 12.760704040527344, + "learning_rate": 4.646171505002694e-05, + "loss": 0.1449, + "num_input_tokens_seen": 19811072, + "step": 34145 + }, + { + "epoch": 5.086386654751266, + "grad_norm": 5.626876354217529, + "learning_rate": 4.646004835056336e-05, + "loss": 0.0785, + "num_input_tokens_seen": 19814112, + "step": 34150 + }, + { + "epoch": 5.087131367292225, + "grad_norm": 41.465171813964844, + "learning_rate": 4.645838128855406e-05, + "loss": 0.6093, + "num_input_tokens_seen": 19816992, + "step": 34155 + }, + { + "epoch": 5.087876079833184, + "grad_norm": 9.029282569885254, + "learning_rate": 4.6456713864027234e-05, + "loss": 0.1936, + "num_input_tokens_seen": 19819872, + "step": 34160 + }, + { + "epoch": 5.088620792374144, + "grad_norm": 0.10616593062877655, + "learning_rate": 4.645504607701102e-05, + "loss": 0.3442, + "num_input_tokens_seen": 19822848, + "step": 34165 + }, + { + "epoch": 5.0893655049151025, + "grad_norm": 47.00607681274414, + "learning_rate": 4.645337792753362e-05, + "loss": 0.3213, + "num_input_tokens_seen": 19825664, + "step": 34170 + }, + { + "epoch": 5.090110217456062, + "grad_norm": 0.9623501896858215, + "learning_rate": 4.64517094156232e-05, + "loss": 0.2455, + "num_input_tokens_seen": 19828544, + "step": 34175 + }, + { + "epoch": 5.090854929997021, + "grad_norm": 0.06111603602766991, + "learning_rate": 4.645004054130795e-05, + "loss": 0.2749, + "num_input_tokens_seen": 19831520, + "step": 34180 + }, + { + "epoch": 5.091599642537981, + "grad_norm": 24.553218841552734, + "learning_rate": 4.644837130461607e-05, + "loss": 0.2885, + "num_input_tokens_seen": 19834208, + "step": 34185 + }, + { + "epoch": 5.092344355078939, + "grad_norm": 30.47105598449707, + "learning_rate": 4.644670170557575e-05, + "loss": 0.3587, + "num_input_tokens_seen": 19836992, + "step": 34190 + }, + { + "epoch": 5.093089067619899, + "grad_norm": 0.03693767264485359, + "learning_rate": 4.644503174421521e-05, + "loss": 0.0072, + "num_input_tokens_seen": 19839680, + "step": 34195 + }, + { + "epoch": 5.093833780160858, + "grad_norm": 0.37558630108833313, + "learning_rate": 4.644336142056265e-05, + "loss": 0.3779, + "num_input_tokens_seen": 19842400, + "step": 34200 + }, + { + "epoch": 5.094578492701817, + "grad_norm": 23.922609329223633, + "learning_rate": 4.644169073464629e-05, + "loss": 0.5068, + "num_input_tokens_seen": 19845280, + "step": 34205 + }, + { + "epoch": 5.095323205242776, + "grad_norm": 4.620208740234375, + "learning_rate": 4.644001968649436e-05, + "loss": 0.234, + "num_input_tokens_seen": 19848256, + "step": 34210 + }, + { + "epoch": 5.096067917783736, + "grad_norm": 40.616554260253906, + "learning_rate": 4.643834827613508e-05, + "loss": 0.1232, + "num_input_tokens_seen": 19850944, + "step": 34215 + }, + { + "epoch": 5.096812630324695, + "grad_norm": 0.17543676495552063, + "learning_rate": 4.643667650359671e-05, + "loss": 0.0052, + "num_input_tokens_seen": 19853856, + "step": 34220 + }, + { + "epoch": 5.097557342865654, + "grad_norm": 0.38074690103530884, + "learning_rate": 4.643500436890746e-05, + "loss": 0.104, + "num_input_tokens_seen": 19856896, + "step": 34225 + }, + { + "epoch": 5.098302055406613, + "grad_norm": 12.919092178344727, + "learning_rate": 4.6433331872095615e-05, + "loss": 0.2181, + "num_input_tokens_seen": 19859808, + "step": 34230 + }, + { + "epoch": 5.099046767947573, + "grad_norm": 0.153349831700325, + "learning_rate": 4.643165901318941e-05, + "loss": 0.32, + "num_input_tokens_seen": 19862752, + "step": 34235 + }, + { + "epoch": 5.099791480488531, + "grad_norm": 0.00755751458927989, + "learning_rate": 4.6429985792217095e-05, + "loss": 0.0689, + "num_input_tokens_seen": 19865280, + "step": 34240 + }, + { + "epoch": 5.100536193029491, + "grad_norm": 18.01543426513672, + "learning_rate": 4.642831220920696e-05, + "loss": 0.1564, + "num_input_tokens_seen": 19868128, + "step": 34245 + }, + { + "epoch": 5.10128090557045, + "grad_norm": 12.222707748413086, + "learning_rate": 4.642663826418726e-05, + "loss": 0.3756, + "num_input_tokens_seen": 19871328, + "step": 34250 + }, + { + "epoch": 5.102025618111409, + "grad_norm": 23.16653060913086, + "learning_rate": 4.64249639571863e-05, + "loss": 0.1655, + "num_input_tokens_seen": 19874592, + "step": 34255 + }, + { + "epoch": 5.102770330652368, + "grad_norm": 16.940631866455078, + "learning_rate": 4.642328928823234e-05, + "loss": 0.289, + "num_input_tokens_seen": 19877728, + "step": 34260 + }, + { + "epoch": 5.103515043193327, + "grad_norm": 0.03363337740302086, + "learning_rate": 4.6421614257353676e-05, + "loss": 0.1779, + "num_input_tokens_seen": 19880736, + "step": 34265 + }, + { + "epoch": 5.104259755734287, + "grad_norm": 12.147123336791992, + "learning_rate": 4.6419938864578615e-05, + "loss": 0.3055, + "num_input_tokens_seen": 19883712, + "step": 34270 + }, + { + "epoch": 5.105004468275245, + "grad_norm": 45.59760665893555, + "learning_rate": 4.641826310993546e-05, + "loss": 0.4525, + "num_input_tokens_seen": 19886720, + "step": 34275 + }, + { + "epoch": 5.105749180816205, + "grad_norm": 16.548601150512695, + "learning_rate": 4.641658699345251e-05, + "loss": 0.2011, + "num_input_tokens_seen": 19889568, + "step": 34280 + }, + { + "epoch": 5.106493893357164, + "grad_norm": 2.235203266143799, + "learning_rate": 4.64149105151581e-05, + "loss": 0.1698, + "num_input_tokens_seen": 19892544, + "step": 34285 + }, + { + "epoch": 5.107238605898123, + "grad_norm": 0.02814619243144989, + "learning_rate": 4.641323367508054e-05, + "loss": 0.2613, + "num_input_tokens_seen": 19895168, + "step": 34290 + }, + { + "epoch": 5.107983318439082, + "grad_norm": 0.05503052845597267, + "learning_rate": 4.641155647324816e-05, + "loss": 0.3106, + "num_input_tokens_seen": 19897824, + "step": 34295 + }, + { + "epoch": 5.108728030980042, + "grad_norm": 24.24730110168457, + "learning_rate": 4.6409878909689286e-05, + "loss": 0.2673, + "num_input_tokens_seen": 19900480, + "step": 34300 + }, + { + "epoch": 5.109472743521001, + "grad_norm": 2.3591010570526123, + "learning_rate": 4.6408200984432276e-05, + "loss": 0.1712, + "num_input_tokens_seen": 19903328, + "step": 34305 + }, + { + "epoch": 5.11021745606196, + "grad_norm": 3.497028350830078, + "learning_rate": 4.640652269750547e-05, + "loss": 0.373, + "num_input_tokens_seen": 19906048, + "step": 34310 + }, + { + "epoch": 5.110962168602919, + "grad_norm": 2.470463991165161, + "learning_rate": 4.640484404893722e-05, + "loss": 0.1173, + "num_input_tokens_seen": 19909248, + "step": 34315 + }, + { + "epoch": 5.111706881143879, + "grad_norm": 5.640386581420898, + "learning_rate": 4.640316503875588e-05, + "loss": 0.308, + "num_input_tokens_seen": 19912352, + "step": 34320 + }, + { + "epoch": 5.112451593684837, + "grad_norm": 11.319511413574219, + "learning_rate": 4.640148566698982e-05, + "loss": 0.0778, + "num_input_tokens_seen": 19915168, + "step": 34325 + }, + { + "epoch": 5.113196306225797, + "grad_norm": 11.844778060913086, + "learning_rate": 4.639980593366742e-05, + "loss": 0.3843, + "num_input_tokens_seen": 19917888, + "step": 34330 + }, + { + "epoch": 5.113941018766756, + "grad_norm": 32.51664733886719, + "learning_rate": 4.639812583881704e-05, + "loss": 0.0482, + "num_input_tokens_seen": 19920640, + "step": 34335 + }, + { + "epoch": 5.114685731307715, + "grad_norm": 20.585458755493164, + "learning_rate": 4.6396445382467067e-05, + "loss": 0.3771, + "num_input_tokens_seen": 19923776, + "step": 34340 + }, + { + "epoch": 5.115430443848674, + "grad_norm": 1.6668689250946045, + "learning_rate": 4.639476456464591e-05, + "loss": 0.2382, + "num_input_tokens_seen": 19926720, + "step": 34345 + }, + { + "epoch": 5.116175156389634, + "grad_norm": 0.1530599743127823, + "learning_rate": 4.639308338538194e-05, + "loss": 0.1605, + "num_input_tokens_seen": 19929536, + "step": 34350 + }, + { + "epoch": 5.116919868930593, + "grad_norm": 1.8266247510910034, + "learning_rate": 4.639140184470357e-05, + "loss": 0.2323, + "num_input_tokens_seen": 19932448, + "step": 34355 + }, + { + "epoch": 5.117664581471552, + "grad_norm": 0.8183470368385315, + "learning_rate": 4.638971994263921e-05, + "loss": 0.3146, + "num_input_tokens_seen": 19935232, + "step": 34360 + }, + { + "epoch": 5.118409294012511, + "grad_norm": 0.07723220437765121, + "learning_rate": 4.6388037679217274e-05, + "loss": 0.5721, + "num_input_tokens_seen": 19938464, + "step": 34365 + }, + { + "epoch": 5.119154006553471, + "grad_norm": 0.9850869178771973, + "learning_rate": 4.638635505446617e-05, + "loss": 0.0042, + "num_input_tokens_seen": 19941504, + "step": 34370 + }, + { + "epoch": 5.119898719094429, + "grad_norm": 22.625930786132812, + "learning_rate": 4.638467206841434e-05, + "loss": 0.5395, + "num_input_tokens_seen": 19944096, + "step": 34375 + }, + { + "epoch": 5.120643431635389, + "grad_norm": 36.660186767578125, + "learning_rate": 4.6382988721090214e-05, + "loss": 0.4781, + "num_input_tokens_seen": 19946880, + "step": 34380 + }, + { + "epoch": 5.121388144176348, + "grad_norm": 2.4862828254699707, + "learning_rate": 4.638130501252221e-05, + "loss": 0.2421, + "num_input_tokens_seen": 19949824, + "step": 34385 + }, + { + "epoch": 5.1221328567173074, + "grad_norm": 20.90003776550293, + "learning_rate": 4.6379620942738814e-05, + "loss": 0.2784, + "num_input_tokens_seen": 19952544, + "step": 34390 + }, + { + "epoch": 5.122877569258266, + "grad_norm": 0.5948959589004517, + "learning_rate": 4.637793651176843e-05, + "loss": 0.3648, + "num_input_tokens_seen": 19955680, + "step": 34395 + }, + { + "epoch": 5.123622281799226, + "grad_norm": 0.019397860392928123, + "learning_rate": 4.637625171963954e-05, + "loss": 0.1451, + "num_input_tokens_seen": 19958880, + "step": 34400 + }, + { + "epoch": 5.124366994340185, + "grad_norm": 9.718825340270996, + "learning_rate": 4.63745665663806e-05, + "loss": 0.1414, + "num_input_tokens_seen": 19961856, + "step": 34405 + }, + { + "epoch": 5.125111706881144, + "grad_norm": 9.837430953979492, + "learning_rate": 4.63728810520201e-05, + "loss": 0.2429, + "num_input_tokens_seen": 19964704, + "step": 34410 + }, + { + "epoch": 5.125856419422103, + "grad_norm": 8.505960464477539, + "learning_rate": 4.637119517658648e-05, + "loss": 0.5036, + "num_input_tokens_seen": 19967360, + "step": 34415 + }, + { + "epoch": 5.126601131963063, + "grad_norm": 0.09864376485347748, + "learning_rate": 4.636950894010825e-05, + "loss": 0.1271, + "num_input_tokens_seen": 19970496, + "step": 34420 + }, + { + "epoch": 5.127345844504021, + "grad_norm": 17.481433868408203, + "learning_rate": 4.636782234261388e-05, + "loss": 0.2909, + "num_input_tokens_seen": 19973312, + "step": 34425 + }, + { + "epoch": 5.128090557044981, + "grad_norm": 13.513524055480957, + "learning_rate": 4.6366135384131866e-05, + "loss": 0.3587, + "num_input_tokens_seen": 19976448, + "step": 34430 + }, + { + "epoch": 5.12883526958594, + "grad_norm": 37.5564079284668, + "learning_rate": 4.6364448064690716e-05, + "loss": 0.3355, + "num_input_tokens_seen": 19979296, + "step": 34435 + }, + { + "epoch": 5.129579982126899, + "grad_norm": 6.243585109710693, + "learning_rate": 4.636276038431892e-05, + "loss": 0.3386, + "num_input_tokens_seen": 19982240, + "step": 34440 + }, + { + "epoch": 5.130324694667858, + "grad_norm": 13.007913589477539, + "learning_rate": 4.636107234304501e-05, + "loss": 0.0277, + "num_input_tokens_seen": 19985088, + "step": 34445 + }, + { + "epoch": 5.131069407208817, + "grad_norm": 17.84972381591797, + "learning_rate": 4.635938394089748e-05, + "loss": 0.2011, + "num_input_tokens_seen": 19987936, + "step": 34450 + }, + { + "epoch": 5.131814119749777, + "grad_norm": 9.727328300476074, + "learning_rate": 4.635769517790488e-05, + "loss": 0.2931, + "num_input_tokens_seen": 19991008, + "step": 34455 + }, + { + "epoch": 5.132558832290735, + "grad_norm": 14.732635498046875, + "learning_rate": 4.635600605409572e-05, + "loss": 0.2864, + "num_input_tokens_seen": 19993984, + "step": 34460 + }, + { + "epoch": 5.133303544831695, + "grad_norm": 29.995319366455078, + "learning_rate": 4.6354316569498545e-05, + "loss": 0.4747, + "num_input_tokens_seen": 19997120, + "step": 34465 + }, + { + "epoch": 5.134048257372654, + "grad_norm": 13.577031135559082, + "learning_rate": 4.63526267241419e-05, + "loss": 0.0225, + "num_input_tokens_seen": 20000192, + "step": 34470 + }, + { + "epoch": 5.1347929699136134, + "grad_norm": 0.051398880779743195, + "learning_rate": 4.6350936518054325e-05, + "loss": 0.1807, + "num_input_tokens_seen": 20003168, + "step": 34475 + }, + { + "epoch": 5.135537682454572, + "grad_norm": 0.8629224896430969, + "learning_rate": 4.634924595126437e-05, + "loss": 0.2465, + "num_input_tokens_seen": 20006048, + "step": 34480 + }, + { + "epoch": 5.136282394995532, + "grad_norm": 0.06893142312765121, + "learning_rate": 4.6347555023800616e-05, + "loss": 0.0072, + "num_input_tokens_seen": 20009120, + "step": 34485 + }, + { + "epoch": 5.137027107536491, + "grad_norm": 56.97157669067383, + "learning_rate": 4.634586373569161e-05, + "loss": 0.424, + "num_input_tokens_seen": 20012064, + "step": 34490 + }, + { + "epoch": 5.13777182007745, + "grad_norm": 1.9716534614562988, + "learning_rate": 4.634417208696593e-05, + "loss": 0.1414, + "num_input_tokens_seen": 20015040, + "step": 34495 + }, + { + "epoch": 5.138516532618409, + "grad_norm": 0.4413548707962036, + "learning_rate": 4.634248007765216e-05, + "loss": 0.0712, + "num_input_tokens_seen": 20017920, + "step": 34500 + }, + { + "epoch": 5.139261245159369, + "grad_norm": 31.929893493652344, + "learning_rate": 4.6340787707778874e-05, + "loss": 0.3944, + "num_input_tokens_seen": 20020672, + "step": 34505 + }, + { + "epoch": 5.140005957700327, + "grad_norm": 1.9324977397918701, + "learning_rate": 4.633909497737468e-05, + "loss": 0.1047, + "num_input_tokens_seen": 20023520, + "step": 34510 + }, + { + "epoch": 5.140750670241287, + "grad_norm": 0.09589644521474838, + "learning_rate": 4.6337401886468156e-05, + "loss": 0.6215, + "num_input_tokens_seen": 20026432, + "step": 34515 + }, + { + "epoch": 5.141495382782246, + "grad_norm": 0.14858625829219818, + "learning_rate": 4.633570843508792e-05, + "loss": 0.2653, + "num_input_tokens_seen": 20029056, + "step": 34520 + }, + { + "epoch": 5.1422400953232055, + "grad_norm": 18.80789566040039, + "learning_rate": 4.633401462326257e-05, + "loss": 0.3092, + "num_input_tokens_seen": 20031872, + "step": 34525 + }, + { + "epoch": 5.142984807864164, + "grad_norm": 32.1470947265625, + "learning_rate": 4.633232045102072e-05, + "loss": 0.0841, + "num_input_tokens_seen": 20034560, + "step": 34530 + }, + { + "epoch": 5.143729520405124, + "grad_norm": 7.349623203277588, + "learning_rate": 4.6330625918391e-05, + "loss": 0.3282, + "num_input_tokens_seen": 20037472, + "step": 34535 + }, + { + "epoch": 5.144474232946083, + "grad_norm": 44.3126220703125, + "learning_rate": 4.6328931025402045e-05, + "loss": 0.239, + "num_input_tokens_seen": 20040160, + "step": 34540 + }, + { + "epoch": 5.145218945487042, + "grad_norm": 17.77394676208496, + "learning_rate": 4.6327235772082466e-05, + "loss": 0.3584, + "num_input_tokens_seen": 20043040, + "step": 34545 + }, + { + "epoch": 5.145963658028001, + "grad_norm": 0.20397309958934784, + "learning_rate": 4.632554015846092e-05, + "loss": 0.0651, + "num_input_tokens_seen": 20045856, + "step": 34550 + }, + { + "epoch": 5.146708370568961, + "grad_norm": 0.8794541954994202, + "learning_rate": 4.6323844184566045e-05, + "loss": 0.2683, + "num_input_tokens_seen": 20049056, + "step": 34555 + }, + { + "epoch": 5.1474530831099194, + "grad_norm": 11.6298828125, + "learning_rate": 4.6322147850426504e-05, + "loss": 0.1314, + "num_input_tokens_seen": 20051936, + "step": 34560 + }, + { + "epoch": 5.148197795650879, + "grad_norm": 1.537503719329834, + "learning_rate": 4.6320451156070934e-05, + "loss": 0.2015, + "num_input_tokens_seen": 20054688, + "step": 34565 + }, + { + "epoch": 5.148942508191838, + "grad_norm": 38.545440673828125, + "learning_rate": 4.6318754101528014e-05, + "loss": 0.5525, + "num_input_tokens_seen": 20057696, + "step": 34570 + }, + { + "epoch": 5.1496872207327975, + "grad_norm": 54.85167694091797, + "learning_rate": 4.631705668682641e-05, + "loss": 0.5669, + "num_input_tokens_seen": 20060576, + "step": 34575 + }, + { + "epoch": 5.150431933273756, + "grad_norm": 0.3205476701259613, + "learning_rate": 4.631535891199481e-05, + "loss": 0.0708, + "num_input_tokens_seen": 20063552, + "step": 34580 + }, + { + "epoch": 5.151176645814716, + "grad_norm": 6.12977933883667, + "learning_rate": 4.6313660777061874e-05, + "loss": 0.3336, + "num_input_tokens_seen": 20066944, + "step": 34585 + }, + { + "epoch": 5.151921358355675, + "grad_norm": 18.295034408569336, + "learning_rate": 4.63119622820563e-05, + "loss": 0.2043, + "num_input_tokens_seen": 20070112, + "step": 34590 + }, + { + "epoch": 5.152666070896634, + "grad_norm": 18.605222702026367, + "learning_rate": 4.6310263427006786e-05, + "loss": 0.1304, + "num_input_tokens_seen": 20072928, + "step": 34595 + }, + { + "epoch": 5.153410783437593, + "grad_norm": 13.78018569946289, + "learning_rate": 4.6308564211942044e-05, + "loss": 0.3341, + "num_input_tokens_seen": 20075712, + "step": 34600 + }, + { + "epoch": 5.154155495978552, + "grad_norm": 13.88293743133545, + "learning_rate": 4.6306864636890745e-05, + "loss": 0.3667, + "num_input_tokens_seen": 20078432, + "step": 34605 + }, + { + "epoch": 5.1549002085195115, + "grad_norm": 0.8890438675880432, + "learning_rate": 4.6305164701881634e-05, + "loss": 0.3487, + "num_input_tokens_seen": 20081248, + "step": 34610 + }, + { + "epoch": 5.15564492106047, + "grad_norm": 16.589313507080078, + "learning_rate": 4.6303464406943416e-05, + "loss": 0.2967, + "num_input_tokens_seen": 20084256, + "step": 34615 + }, + { + "epoch": 5.15638963360143, + "grad_norm": 16.091991424560547, + "learning_rate": 4.630176375210482e-05, + "loss": 0.1573, + "num_input_tokens_seen": 20086976, + "step": 34620 + }, + { + "epoch": 5.157134346142389, + "grad_norm": 5.305433750152588, + "learning_rate": 4.630006273739458e-05, + "loss": 0.0545, + "num_input_tokens_seen": 20089824, + "step": 34625 + }, + { + "epoch": 5.157879058683348, + "grad_norm": 0.10220970958471298, + "learning_rate": 4.629836136284143e-05, + "loss": 0.0801, + "num_input_tokens_seen": 20092640, + "step": 34630 + }, + { + "epoch": 5.158623771224307, + "grad_norm": 0.2618373930454254, + "learning_rate": 4.62966596284741e-05, + "loss": 0.3592, + "num_input_tokens_seen": 20095680, + "step": 34635 + }, + { + "epoch": 5.159368483765267, + "grad_norm": 9.174818992614746, + "learning_rate": 4.629495753432136e-05, + "loss": 0.3333, + "num_input_tokens_seen": 20098560, + "step": 34640 + }, + { + "epoch": 5.1601131963062254, + "grad_norm": 13.709518432617188, + "learning_rate": 4.629325508041195e-05, + "loss": 0.1778, + "num_input_tokens_seen": 20101536, + "step": 34645 + }, + { + "epoch": 5.160857908847185, + "grad_norm": 36.152862548828125, + "learning_rate": 4.629155226677464e-05, + "loss": 0.1217, + "num_input_tokens_seen": 20104480, + "step": 34650 + }, + { + "epoch": 5.161602621388144, + "grad_norm": 47.03950119018555, + "learning_rate": 4.628984909343819e-05, + "loss": 0.1934, + "num_input_tokens_seen": 20107136, + "step": 34655 + }, + { + "epoch": 5.1623473339291035, + "grad_norm": 11.225157737731934, + "learning_rate": 4.6288145560431385e-05, + "loss": 0.1024, + "num_input_tokens_seen": 20110336, + "step": 34660 + }, + { + "epoch": 5.163092046470062, + "grad_norm": 0.02031974121928215, + "learning_rate": 4.628644166778299e-05, + "loss": 0.1927, + "num_input_tokens_seen": 20113504, + "step": 34665 + }, + { + "epoch": 5.163836759011022, + "grad_norm": 5.746897220611572, + "learning_rate": 4.6284737415521806e-05, + "loss": 0.3214, + "num_input_tokens_seen": 20116544, + "step": 34670 + }, + { + "epoch": 5.164581471551981, + "grad_norm": 19.318635940551758, + "learning_rate": 4.628303280367661e-05, + "loss": 0.2816, + "num_input_tokens_seen": 20119424, + "step": 34675 + }, + { + "epoch": 5.16532618409294, + "grad_norm": 35.470890045166016, + "learning_rate": 4.6281327832276204e-05, + "loss": 0.046, + "num_input_tokens_seen": 20122400, + "step": 34680 + }, + { + "epoch": 5.166070896633899, + "grad_norm": 8.531963348388672, + "learning_rate": 4.627962250134939e-05, + "loss": 0.2743, + "num_input_tokens_seen": 20125312, + "step": 34685 + }, + { + "epoch": 5.166815609174859, + "grad_norm": 0.3749838173389435, + "learning_rate": 4.627791681092499e-05, + "loss": 0.289, + "num_input_tokens_seen": 20128064, + "step": 34690 + }, + { + "epoch": 5.1675603217158175, + "grad_norm": 43.06461715698242, + "learning_rate": 4.627621076103181e-05, + "loss": 0.1943, + "num_input_tokens_seen": 20130816, + "step": 34695 + }, + { + "epoch": 5.168305034256777, + "grad_norm": 0.8534278273582458, + "learning_rate": 4.627450435169868e-05, + "loss": 0.1826, + "num_input_tokens_seen": 20133856, + "step": 34700 + }, + { + "epoch": 5.169049746797736, + "grad_norm": 0.5468796491622925, + "learning_rate": 4.627279758295441e-05, + "loss": 0.1448, + "num_input_tokens_seen": 20136704, + "step": 34705 + }, + { + "epoch": 5.1697944593386955, + "grad_norm": 67.33948516845703, + "learning_rate": 4.627109045482785e-05, + "loss": 0.4708, + "num_input_tokens_seen": 20139456, + "step": 34710 + }, + { + "epoch": 5.170539171879654, + "grad_norm": 61.581661224365234, + "learning_rate": 4.626938296734784e-05, + "loss": 0.5385, + "num_input_tokens_seen": 20142144, + "step": 34715 + }, + { + "epoch": 5.171283884420614, + "grad_norm": 12.945013999938965, + "learning_rate": 4.626767512054321e-05, + "loss": 0.1581, + "num_input_tokens_seen": 20144896, + "step": 34720 + }, + { + "epoch": 5.172028596961573, + "grad_norm": 0.04941834136843681, + "learning_rate": 4.626596691444284e-05, + "loss": 0.3365, + "num_input_tokens_seen": 20147488, + "step": 34725 + }, + { + "epoch": 5.172773309502532, + "grad_norm": 0.19021527469158173, + "learning_rate": 4.626425834907556e-05, + "loss": 0.3849, + "num_input_tokens_seen": 20150528, + "step": 34730 + }, + { + "epoch": 5.173518022043491, + "grad_norm": 0.03142204135656357, + "learning_rate": 4.6262549424470253e-05, + "loss": 0.1448, + "num_input_tokens_seen": 20153664, + "step": 34735 + }, + { + "epoch": 5.174262734584451, + "grad_norm": 104.72578430175781, + "learning_rate": 4.626084014065578e-05, + "loss": 0.3182, + "num_input_tokens_seen": 20156128, + "step": 34740 + }, + { + "epoch": 5.1750074471254095, + "grad_norm": 0.3587518036365509, + "learning_rate": 4.625913049766103e-05, + "loss": 0.6005, + "num_input_tokens_seen": 20159008, + "step": 34745 + }, + { + "epoch": 5.175752159666369, + "grad_norm": 15.150104522705078, + "learning_rate": 4.625742049551487e-05, + "loss": 0.2216, + "num_input_tokens_seen": 20162080, + "step": 34750 + }, + { + "epoch": 5.176496872207328, + "grad_norm": 19.8234920501709, + "learning_rate": 4.6255710134246197e-05, + "loss": 0.1847, + "num_input_tokens_seen": 20165088, + "step": 34755 + }, + { + "epoch": 5.1772415847482876, + "grad_norm": 19.104328155517578, + "learning_rate": 4.6253999413883905e-05, + "loss": 0.2438, + "num_input_tokens_seen": 20167904, + "step": 34760 + }, + { + "epoch": 5.177986297289246, + "grad_norm": 19.95793914794922, + "learning_rate": 4.6252288334456887e-05, + "loss": 0.5779, + "num_input_tokens_seen": 20170880, + "step": 34765 + }, + { + "epoch": 5.178731009830206, + "grad_norm": 0.14991158246994019, + "learning_rate": 4.625057689599407e-05, + "loss": 0.2707, + "num_input_tokens_seen": 20173568, + "step": 34770 + }, + { + "epoch": 5.179475722371165, + "grad_norm": 2.8078267574310303, + "learning_rate": 4.6248865098524346e-05, + "loss": 0.1593, + "num_input_tokens_seen": 20176320, + "step": 34775 + }, + { + "epoch": 5.180220434912124, + "grad_norm": 9.840442657470703, + "learning_rate": 4.6247152942076646e-05, + "loss": 0.1029, + "num_input_tokens_seen": 20179040, + "step": 34780 + }, + { + "epoch": 5.180965147453083, + "grad_norm": 13.138562202453613, + "learning_rate": 4.624544042667989e-05, + "loss": 0.1846, + "num_input_tokens_seen": 20181920, + "step": 34785 + }, + { + "epoch": 5.181709859994042, + "grad_norm": 20.167600631713867, + "learning_rate": 4.624372755236301e-05, + "loss": 0.2228, + "num_input_tokens_seen": 20184576, + "step": 34790 + }, + { + "epoch": 5.1824545725350015, + "grad_norm": 15.437115669250488, + "learning_rate": 4.624201431915495e-05, + "loss": 0.4069, + "num_input_tokens_seen": 20187840, + "step": 34795 + }, + { + "epoch": 5.18319928507596, + "grad_norm": 0.38447073101997375, + "learning_rate": 4.624030072708464e-05, + "loss": 0.1704, + "num_input_tokens_seen": 20190944, + "step": 34800 + }, + { + "epoch": 5.18394399761692, + "grad_norm": 52.61514663696289, + "learning_rate": 4.623858677618104e-05, + "loss": 0.3197, + "num_input_tokens_seen": 20194016, + "step": 34805 + }, + { + "epoch": 5.184688710157879, + "grad_norm": 11.093082427978516, + "learning_rate": 4.62368724664731e-05, + "loss": 0.1595, + "num_input_tokens_seen": 20197024, + "step": 34810 + }, + { + "epoch": 5.185433422698838, + "grad_norm": 47.49955368041992, + "learning_rate": 4.623515779798979e-05, + "loss": 0.3966, + "num_input_tokens_seen": 20199968, + "step": 34815 + }, + { + "epoch": 5.186178135239797, + "grad_norm": 3.3816633224487305, + "learning_rate": 4.623344277076007e-05, + "loss": 0.5157, + "num_input_tokens_seen": 20202688, + "step": 34820 + }, + { + "epoch": 5.186922847780757, + "grad_norm": 0.08207728713750839, + "learning_rate": 4.6231727384812916e-05, + "loss": 0.4459, + "num_input_tokens_seen": 20205600, + "step": 34825 + }, + { + "epoch": 5.1876675603217155, + "grad_norm": 10.179476737976074, + "learning_rate": 4.62300116401773e-05, + "loss": 0.3235, + "num_input_tokens_seen": 20208288, + "step": 34830 + }, + { + "epoch": 5.188412272862675, + "grad_norm": 4.673048496246338, + "learning_rate": 4.622829553688222e-05, + "loss": 0.0569, + "num_input_tokens_seen": 20211136, + "step": 34835 + }, + { + "epoch": 5.189156985403634, + "grad_norm": 25.07379913330078, + "learning_rate": 4.622657907495667e-05, + "loss": 0.2297, + "num_input_tokens_seen": 20213920, + "step": 34840 + }, + { + "epoch": 5.1899016979445936, + "grad_norm": 0.03813270479440689, + "learning_rate": 4.6224862254429623e-05, + "loss": 0.2435, + "num_input_tokens_seen": 20216896, + "step": 34845 + }, + { + "epoch": 5.190646410485552, + "grad_norm": 1.3170099258422852, + "learning_rate": 4.622314507533011e-05, + "loss": 0.2081, + "num_input_tokens_seen": 20219808, + "step": 34850 + }, + { + "epoch": 5.191391123026512, + "grad_norm": 0.3553301692008972, + "learning_rate": 4.622142753768713e-05, + "loss": 0.0838, + "num_input_tokens_seen": 20222560, + "step": 34855 + }, + { + "epoch": 5.192135835567471, + "grad_norm": 1.8522473573684692, + "learning_rate": 4.6219709641529695e-05, + "loss": 0.1708, + "num_input_tokens_seen": 20225376, + "step": 34860 + }, + { + "epoch": 5.19288054810843, + "grad_norm": 0.23509319126605988, + "learning_rate": 4.621799138688684e-05, + "loss": 0.1461, + "num_input_tokens_seen": 20227936, + "step": 34865 + }, + { + "epoch": 5.193625260649389, + "grad_norm": 0.02556014060974121, + "learning_rate": 4.6216272773787586e-05, + "loss": 0.2277, + "num_input_tokens_seen": 20230624, + "step": 34870 + }, + { + "epoch": 5.194369973190349, + "grad_norm": 2.274070978164673, + "learning_rate": 4.621455380226096e-05, + "loss": 0.1151, + "num_input_tokens_seen": 20233504, + "step": 34875 + }, + { + "epoch": 5.1951146857313075, + "grad_norm": 0.05215610936284065, + "learning_rate": 4.6212834472336016e-05, + "loss": 0.4761, + "num_input_tokens_seen": 20236448, + "step": 34880 + }, + { + "epoch": 5.195859398272267, + "grad_norm": 0.09562011808156967, + "learning_rate": 4.6211114784041784e-05, + "loss": 0.0269, + "num_input_tokens_seen": 20239552, + "step": 34885 + }, + { + "epoch": 5.196604110813226, + "grad_norm": 54.286895751953125, + "learning_rate": 4.620939473740733e-05, + "loss": 0.2766, + "num_input_tokens_seen": 20242336, + "step": 34890 + }, + { + "epoch": 5.197348823354186, + "grad_norm": 16.088552474975586, + "learning_rate": 4.6207674332461716e-05, + "loss": 0.2225, + "num_input_tokens_seen": 20244992, + "step": 34895 + }, + { + "epoch": 5.198093535895144, + "grad_norm": 32.31837463378906, + "learning_rate": 4.620595356923399e-05, + "loss": 0.4908, + "num_input_tokens_seen": 20248064, + "step": 34900 + }, + { + "epoch": 5.198838248436104, + "grad_norm": 4.055009841918945, + "learning_rate": 4.620423244775323e-05, + "loss": 0.3181, + "num_input_tokens_seen": 20250976, + "step": 34905 + }, + { + "epoch": 5.199582960977063, + "grad_norm": 47.40503692626953, + "learning_rate": 4.6202510968048515e-05, + "loss": 0.5656, + "num_input_tokens_seen": 20253792, + "step": 34910 + }, + { + "epoch": 5.200327673518022, + "grad_norm": 42.790870666503906, + "learning_rate": 4.620078913014893e-05, + "loss": 0.6923, + "num_input_tokens_seen": 20257120, + "step": 34915 + }, + { + "epoch": 5.201072386058981, + "grad_norm": 61.409725189208984, + "learning_rate": 4.619906693408357e-05, + "loss": 0.2579, + "num_input_tokens_seen": 20260128, + "step": 34920 + }, + { + "epoch": 5.201817098599941, + "grad_norm": 10.24372673034668, + "learning_rate": 4.61973443798815e-05, + "loss": 0.0929, + "num_input_tokens_seen": 20263008, + "step": 34925 + }, + { + "epoch": 5.2025618111408996, + "grad_norm": 4.3328776359558105, + "learning_rate": 4.6195621467571856e-05, + "loss": 0.4837, + "num_input_tokens_seen": 20265760, + "step": 34930 + }, + { + "epoch": 5.203306523681859, + "grad_norm": 3.909020185470581, + "learning_rate": 4.619389819718371e-05, + "loss": 0.214, + "num_input_tokens_seen": 20268736, + "step": 34935 + }, + { + "epoch": 5.204051236222818, + "grad_norm": 0.22865894436836243, + "learning_rate": 4.619217456874622e-05, + "loss": 0.4072, + "num_input_tokens_seen": 20271616, + "step": 34940 + }, + { + "epoch": 5.204795948763778, + "grad_norm": 15.89664077758789, + "learning_rate": 4.619045058228847e-05, + "loss": 0.1479, + "num_input_tokens_seen": 20274304, + "step": 34945 + }, + { + "epoch": 5.205540661304736, + "grad_norm": 36.12424850463867, + "learning_rate": 4.6188726237839586e-05, + "loss": 0.2206, + "num_input_tokens_seen": 20276864, + "step": 34950 + }, + { + "epoch": 5.206285373845695, + "grad_norm": 0.9121783971786499, + "learning_rate": 4.6187001535428716e-05, + "loss": 0.3539, + "num_input_tokens_seen": 20280096, + "step": 34955 + }, + { + "epoch": 5.207030086386655, + "grad_norm": 32.38032150268555, + "learning_rate": 4.618527647508498e-05, + "loss": 0.1593, + "num_input_tokens_seen": 20283296, + "step": 34960 + }, + { + "epoch": 5.2077747989276135, + "grad_norm": 0.3194132149219513, + "learning_rate": 4.618355105683754e-05, + "loss": 0.1075, + "num_input_tokens_seen": 20286368, + "step": 34965 + }, + { + "epoch": 5.208519511468573, + "grad_norm": 33.06573486328125, + "learning_rate": 4.618182528071553e-05, + "loss": 0.2702, + "num_input_tokens_seen": 20289248, + "step": 34970 + }, + { + "epoch": 5.209264224009532, + "grad_norm": 21.185068130493164, + "learning_rate": 4.618009914674811e-05, + "loss": 0.35, + "num_input_tokens_seen": 20291872, + "step": 34975 + }, + { + "epoch": 5.210008936550492, + "grad_norm": 50.333961486816406, + "learning_rate": 4.617837265496444e-05, + "loss": 0.4703, + "num_input_tokens_seen": 20294944, + "step": 34980 + }, + { + "epoch": 5.21075364909145, + "grad_norm": 24.48660659790039, + "learning_rate": 4.617664580539369e-05, + "loss": 0.3579, + "num_input_tokens_seen": 20297792, + "step": 34985 + }, + { + "epoch": 5.21149836163241, + "grad_norm": 19.166444778442383, + "learning_rate": 4.6174918598065034e-05, + "loss": 0.615, + "num_input_tokens_seen": 20300640, + "step": 34990 + }, + { + "epoch": 5.212243074173369, + "grad_norm": 15.067439079284668, + "learning_rate": 4.617319103300764e-05, + "loss": 0.1542, + "num_input_tokens_seen": 20303648, + "step": 34995 + }, + { + "epoch": 5.212987786714328, + "grad_norm": 9.905097007751465, + "learning_rate": 4.6171463110250725e-05, + "loss": 0.2905, + "num_input_tokens_seen": 20306784, + "step": 35000 + }, + { + "epoch": 5.213732499255287, + "grad_norm": 0.22127075493335724, + "learning_rate": 4.616973482982344e-05, + "loss": 0.1788, + "num_input_tokens_seen": 20309632, + "step": 35005 + }, + { + "epoch": 5.214477211796247, + "grad_norm": 33.134002685546875, + "learning_rate": 4.616800619175501e-05, + "loss": 0.3773, + "num_input_tokens_seen": 20312352, + "step": 35010 + }, + { + "epoch": 5.2152219243372056, + "grad_norm": 14.637102127075195, + "learning_rate": 4.616627719607462e-05, + "loss": 0.2586, + "num_input_tokens_seen": 20315552, + "step": 35015 + }, + { + "epoch": 5.215966636878165, + "grad_norm": 12.859007835388184, + "learning_rate": 4.6164547842811494e-05, + "loss": 0.2484, + "num_input_tokens_seen": 20318656, + "step": 35020 + }, + { + "epoch": 5.216711349419124, + "grad_norm": 19.640125274658203, + "learning_rate": 4.6162818131994845e-05, + "loss": 0.1392, + "num_input_tokens_seen": 20321440, + "step": 35025 + }, + { + "epoch": 5.217456061960084, + "grad_norm": 6.715107440948486, + "learning_rate": 4.616108806365389e-05, + "loss": 0.2336, + "num_input_tokens_seen": 20324608, + "step": 35030 + }, + { + "epoch": 5.218200774501042, + "grad_norm": 0.27350831031799316, + "learning_rate": 4.6159357637817855e-05, + "loss": 0.2323, + "num_input_tokens_seen": 20327424, + "step": 35035 + }, + { + "epoch": 5.218945487042002, + "grad_norm": 9.715246200561523, + "learning_rate": 4.6157626854515986e-05, + "loss": 0.1022, + "num_input_tokens_seen": 20330336, + "step": 35040 + }, + { + "epoch": 5.219690199582961, + "grad_norm": 14.760388374328613, + "learning_rate": 4.615589571377752e-05, + "loss": 0.1841, + "num_input_tokens_seen": 20333376, + "step": 35045 + }, + { + "epoch": 5.22043491212392, + "grad_norm": 19.33505630493164, + "learning_rate": 4.6154164215631685e-05, + "loss": 0.3598, + "num_input_tokens_seen": 20336224, + "step": 35050 + }, + { + "epoch": 5.221179624664879, + "grad_norm": 43.469383239746094, + "learning_rate": 4.615243236010775e-05, + "loss": 0.4362, + "num_input_tokens_seen": 20339072, + "step": 35055 + }, + { + "epoch": 5.221924337205839, + "grad_norm": 31.023664474487305, + "learning_rate": 4.615070014723497e-05, + "loss": 0.1146, + "num_input_tokens_seen": 20341824, + "step": 35060 + }, + { + "epoch": 5.222669049746798, + "grad_norm": 12.679481506347656, + "learning_rate": 4.614896757704261e-05, + "loss": 0.2272, + "num_input_tokens_seen": 20344864, + "step": 35065 + }, + { + "epoch": 5.223413762287757, + "grad_norm": 16.460845947265625, + "learning_rate": 4.614723464955993e-05, + "loss": 0.1607, + "num_input_tokens_seen": 20347808, + "step": 35070 + }, + { + "epoch": 5.224158474828716, + "grad_norm": 0.6609076857566833, + "learning_rate": 4.6145501364816226e-05, + "loss": 0.1869, + "num_input_tokens_seen": 20350560, + "step": 35075 + }, + { + "epoch": 5.224903187369676, + "grad_norm": 16.907089233398438, + "learning_rate": 4.614376772284075e-05, + "loss": 0.3275, + "num_input_tokens_seen": 20353824, + "step": 35080 + }, + { + "epoch": 5.225647899910634, + "grad_norm": 29.516895294189453, + "learning_rate": 4.6142033723662825e-05, + "loss": 0.438, + "num_input_tokens_seen": 20356704, + "step": 35085 + }, + { + "epoch": 5.226392612451594, + "grad_norm": 0.6771880388259888, + "learning_rate": 4.614029936731172e-05, + "loss": 0.0767, + "num_input_tokens_seen": 20359808, + "step": 35090 + }, + { + "epoch": 5.227137324992553, + "grad_norm": 26.161518096923828, + "learning_rate": 4.613856465381674e-05, + "loss": 0.2434, + "num_input_tokens_seen": 20362880, + "step": 35095 + }, + { + "epoch": 5.227882037533512, + "grad_norm": 3.142190456390381, + "learning_rate": 4.6136829583207197e-05, + "loss": 0.2171, + "num_input_tokens_seen": 20365728, + "step": 35100 + }, + { + "epoch": 5.228626750074471, + "grad_norm": 0.2581630051136017, + "learning_rate": 4.6135094155512405e-05, + "loss": 0.1761, + "num_input_tokens_seen": 20368416, + "step": 35105 + }, + { + "epoch": 5.229371462615431, + "grad_norm": 31.267839431762695, + "learning_rate": 4.613335837076168e-05, + "loss": 0.2733, + "num_input_tokens_seen": 20371008, + "step": 35110 + }, + { + "epoch": 5.23011617515639, + "grad_norm": 0.7484513521194458, + "learning_rate": 4.6131622228984336e-05, + "loss": 0.2624, + "num_input_tokens_seen": 20374112, + "step": 35115 + }, + { + "epoch": 5.230860887697349, + "grad_norm": 8.940004348754883, + "learning_rate": 4.6129885730209715e-05, + "loss": 0.1788, + "num_input_tokens_seen": 20377056, + "step": 35120 + }, + { + "epoch": 5.231605600238308, + "grad_norm": 4.341316223144531, + "learning_rate": 4.612814887446715e-05, + "loss": 0.138, + "num_input_tokens_seen": 20380064, + "step": 35125 + }, + { + "epoch": 5.232350312779268, + "grad_norm": 24.79371452331543, + "learning_rate": 4.6126411661785984e-05, + "loss": 0.3021, + "num_input_tokens_seen": 20383104, + "step": 35130 + }, + { + "epoch": 5.233095025320226, + "grad_norm": 25.426067352294922, + "learning_rate": 4.612467409219556e-05, + "loss": 0.5248, + "num_input_tokens_seen": 20386048, + "step": 35135 + }, + { + "epoch": 5.233839737861185, + "grad_norm": 20.752281188964844, + "learning_rate": 4.612293616572525e-05, + "loss": 0.2429, + "num_input_tokens_seen": 20388800, + "step": 35140 + }, + { + "epoch": 5.234584450402145, + "grad_norm": 0.01706025004386902, + "learning_rate": 4.612119788240439e-05, + "loss": 0.2189, + "num_input_tokens_seen": 20391616, + "step": 35145 + }, + { + "epoch": 5.235329162943104, + "grad_norm": 32.227577209472656, + "learning_rate": 4.6119459242262366e-05, + "loss": 0.4396, + "num_input_tokens_seen": 20394368, + "step": 35150 + }, + { + "epoch": 5.236073875484063, + "grad_norm": 2.000577688217163, + "learning_rate": 4.611772024532854e-05, + "loss": 0.2401, + "num_input_tokens_seen": 20397344, + "step": 35155 + }, + { + "epoch": 5.236818588025022, + "grad_norm": 6.5396575927734375, + "learning_rate": 4.611598089163229e-05, + "loss": 0.2981, + "num_input_tokens_seen": 20400576, + "step": 35160 + }, + { + "epoch": 5.237563300565982, + "grad_norm": 0.19191011786460876, + "learning_rate": 4.611424118120301e-05, + "loss": 0.0686, + "num_input_tokens_seen": 20403680, + "step": 35165 + }, + { + "epoch": 5.23830801310694, + "grad_norm": 11.1105375289917, + "learning_rate": 4.611250111407008e-05, + "loss": 0.3907, + "num_input_tokens_seen": 20406464, + "step": 35170 + }, + { + "epoch": 5.2390527256479, + "grad_norm": 32.90972137451172, + "learning_rate": 4.6110760690262907e-05, + "loss": 0.1762, + "num_input_tokens_seen": 20409312, + "step": 35175 + }, + { + "epoch": 5.239797438188859, + "grad_norm": 0.23911401629447937, + "learning_rate": 4.610901990981088e-05, + "loss": 0.5969, + "num_input_tokens_seen": 20412384, + "step": 35180 + }, + { + "epoch": 5.240542150729818, + "grad_norm": 0.42842376232147217, + "learning_rate": 4.6107278772743426e-05, + "loss": 0.1991, + "num_input_tokens_seen": 20415072, + "step": 35185 + }, + { + "epoch": 5.241286863270777, + "grad_norm": 10.715120315551758, + "learning_rate": 4.610553727908994e-05, + "loss": 0.1702, + "num_input_tokens_seen": 20417984, + "step": 35190 + }, + { + "epoch": 5.242031575811737, + "grad_norm": 0.4533386528491974, + "learning_rate": 4.6103795428879856e-05, + "loss": 0.0648, + "num_input_tokens_seen": 20420864, + "step": 35195 + }, + { + "epoch": 5.242776288352696, + "grad_norm": 5.683210372924805, + "learning_rate": 4.6102053222142595e-05, + "loss": 0.4371, + "num_input_tokens_seen": 20423712, + "step": 35200 + }, + { + "epoch": 5.243521000893655, + "grad_norm": 97.04120635986328, + "learning_rate": 4.61003106589076e-05, + "loss": 0.3447, + "num_input_tokens_seen": 20426816, + "step": 35205 + }, + { + "epoch": 5.244265713434614, + "grad_norm": 0.5320619940757751, + "learning_rate": 4.6098567739204294e-05, + "loss": 0.2902, + "num_input_tokens_seen": 20429792, + "step": 35210 + }, + { + "epoch": 5.245010425975574, + "grad_norm": 115.078857421875, + "learning_rate": 4.609682446306213e-05, + "loss": 0.1626, + "num_input_tokens_seen": 20432576, + "step": 35215 + }, + { + "epoch": 5.245755138516532, + "grad_norm": 30.80946159362793, + "learning_rate": 4.6095080830510564e-05, + "loss": 0.2697, + "num_input_tokens_seen": 20435328, + "step": 35220 + }, + { + "epoch": 5.246499851057492, + "grad_norm": 46.163597106933594, + "learning_rate": 4.6093336841579044e-05, + "loss": 0.3222, + "num_input_tokens_seen": 20438176, + "step": 35225 + }, + { + "epoch": 5.247244563598451, + "grad_norm": 0.6700186729431152, + "learning_rate": 4.609159249629704e-05, + "loss": 0.2099, + "num_input_tokens_seen": 20441088, + "step": 35230 + }, + { + "epoch": 5.2479892761394105, + "grad_norm": 0.010172856971621513, + "learning_rate": 4.6089847794694005e-05, + "loss": 0.1578, + "num_input_tokens_seen": 20443808, + "step": 35235 + }, + { + "epoch": 5.248733988680369, + "grad_norm": 20.726469039916992, + "learning_rate": 4.6088102736799445e-05, + "loss": 0.503, + "num_input_tokens_seen": 20446976, + "step": 35240 + }, + { + "epoch": 5.249478701221329, + "grad_norm": 0.6181215643882751, + "learning_rate": 4.60863573226428e-05, + "loss": 0.2048, + "num_input_tokens_seen": 20449856, + "step": 35245 + }, + { + "epoch": 5.250223413762288, + "grad_norm": 123.81629180908203, + "learning_rate": 4.6084611552253595e-05, + "loss": 0.4355, + "num_input_tokens_seen": 20452640, + "step": 35250 + }, + { + "epoch": 5.250968126303247, + "grad_norm": 13.555057525634766, + "learning_rate": 4.6082865425661307e-05, + "loss": 0.6094, + "num_input_tokens_seen": 20455584, + "step": 35255 + }, + { + "epoch": 5.251712838844206, + "grad_norm": 19.069499969482422, + "learning_rate": 4.608111894289543e-05, + "loss": 0.2735, + "num_input_tokens_seen": 20458624, + "step": 35260 + }, + { + "epoch": 5.252457551385166, + "grad_norm": 23.092775344848633, + "learning_rate": 4.607937210398548e-05, + "loss": 0.2524, + "num_input_tokens_seen": 20461536, + "step": 35265 + }, + { + "epoch": 5.253202263926124, + "grad_norm": 4.959643840789795, + "learning_rate": 4.607762490896096e-05, + "loss": 0.0526, + "num_input_tokens_seen": 20464320, + "step": 35270 + }, + { + "epoch": 5.253946976467084, + "grad_norm": 22.87176513671875, + "learning_rate": 4.6075877357851384e-05, + "loss": 0.3597, + "num_input_tokens_seen": 20467200, + "step": 35275 + }, + { + "epoch": 5.254691689008043, + "grad_norm": 30.91961097717285, + "learning_rate": 4.607412945068629e-05, + "loss": 0.1578, + "num_input_tokens_seen": 20470048, + "step": 35280 + }, + { + "epoch": 5.2554364015490025, + "grad_norm": 40.5699348449707, + "learning_rate": 4.607238118749519e-05, + "loss": 0.4213, + "num_input_tokens_seen": 20472896, + "step": 35285 + }, + { + "epoch": 5.256181114089961, + "grad_norm": 54.031856536865234, + "learning_rate": 4.607063256830763e-05, + "loss": 0.1089, + "num_input_tokens_seen": 20475776, + "step": 35290 + }, + { + "epoch": 5.256925826630921, + "grad_norm": 12.006223678588867, + "learning_rate": 4.6068883593153147e-05, + "loss": 0.1601, + "num_input_tokens_seen": 20479168, + "step": 35295 + }, + { + "epoch": 5.25767053917188, + "grad_norm": 28.02248764038086, + "learning_rate": 4.606713426206129e-05, + "loss": 0.2297, + "num_input_tokens_seen": 20481984, + "step": 35300 + }, + { + "epoch": 5.258415251712838, + "grad_norm": 0.4229283928871155, + "learning_rate": 4.606538457506161e-05, + "loss": 0.1616, + "num_input_tokens_seen": 20484864, + "step": 35305 + }, + { + "epoch": 5.259159964253798, + "grad_norm": 31.752708435058594, + "learning_rate": 4.606363453218367e-05, + "loss": 0.1677, + "num_input_tokens_seen": 20487712, + "step": 35310 + }, + { + "epoch": 5.259904676794757, + "grad_norm": 0.6299256086349487, + "learning_rate": 4.606188413345704e-05, + "loss": 0.1279, + "num_input_tokens_seen": 20490592, + "step": 35315 + }, + { + "epoch": 5.2606493893357165, + "grad_norm": 69.10173034667969, + "learning_rate": 4.6060133378911265e-05, + "loss": 0.2309, + "num_input_tokens_seen": 20493824, + "step": 35320 + }, + { + "epoch": 5.261394101876675, + "grad_norm": 0.01003726664930582, + "learning_rate": 4.605838226857595e-05, + "loss": 0.2038, + "num_input_tokens_seen": 20496704, + "step": 35325 + }, + { + "epoch": 5.262138814417635, + "grad_norm": 61.5126838684082, + "learning_rate": 4.605663080248067e-05, + "loss": 0.0779, + "num_input_tokens_seen": 20499584, + "step": 35330 + }, + { + "epoch": 5.262883526958594, + "grad_norm": 7.81167459487915, + "learning_rate": 4.6054878980655015e-05, + "loss": 0.464, + "num_input_tokens_seen": 20502592, + "step": 35335 + }, + { + "epoch": 5.263628239499553, + "grad_norm": 53.38200759887695, + "learning_rate": 4.605312680312858e-05, + "loss": 0.4801, + "num_input_tokens_seen": 20505696, + "step": 35340 + }, + { + "epoch": 5.264372952040512, + "grad_norm": 22.515169143676758, + "learning_rate": 4.605137426993096e-05, + "loss": 0.5501, + "num_input_tokens_seen": 20508608, + "step": 35345 + }, + { + "epoch": 5.265117664581472, + "grad_norm": 22.636608123779297, + "learning_rate": 4.6049621381091776e-05, + "loss": 0.5586, + "num_input_tokens_seen": 20512000, + "step": 35350 + }, + { + "epoch": 5.26586237712243, + "grad_norm": 44.51664733886719, + "learning_rate": 4.604786813664063e-05, + "loss": 0.1894, + "num_input_tokens_seen": 20514720, + "step": 35355 + }, + { + "epoch": 5.26660708966339, + "grad_norm": 40.643333435058594, + "learning_rate": 4.6046114536607133e-05, + "loss": 0.3939, + "num_input_tokens_seen": 20517248, + "step": 35360 + }, + { + "epoch": 5.267351802204349, + "grad_norm": 0.16104401648044586, + "learning_rate": 4.604436058102093e-05, + "loss": 0.0298, + "num_input_tokens_seen": 20520096, + "step": 35365 + }, + { + "epoch": 5.2680965147453085, + "grad_norm": 0.11829208582639694, + "learning_rate": 4.6042606269911645e-05, + "loss": 0.2968, + "num_input_tokens_seen": 20523136, + "step": 35370 + }, + { + "epoch": 5.268841227286267, + "grad_norm": 23.223377227783203, + "learning_rate": 4.604085160330891e-05, + "loss": 0.3314, + "num_input_tokens_seen": 20526048, + "step": 35375 + }, + { + "epoch": 5.269585939827227, + "grad_norm": 37.503849029541016, + "learning_rate": 4.603909658124238e-05, + "loss": 0.1619, + "num_input_tokens_seen": 20528928, + "step": 35380 + }, + { + "epoch": 5.270330652368186, + "grad_norm": 39.047454833984375, + "learning_rate": 4.6037341203741686e-05, + "loss": 0.0747, + "num_input_tokens_seen": 20531808, + "step": 35385 + }, + { + "epoch": 5.271075364909145, + "grad_norm": 27.089265823364258, + "learning_rate": 4.6035585470836494e-05, + "loss": 0.3051, + "num_input_tokens_seen": 20534464, + "step": 35390 + }, + { + "epoch": 5.271820077450104, + "grad_norm": 1.6654751300811768, + "learning_rate": 4.603382938255647e-05, + "loss": 0.2327, + "num_input_tokens_seen": 20537376, + "step": 35395 + }, + { + "epoch": 5.272564789991064, + "grad_norm": 7.840292453765869, + "learning_rate": 4.603207293893128e-05, + "loss": 0.1429, + "num_input_tokens_seen": 20540320, + "step": 35400 + }, + { + "epoch": 5.2733095025320225, + "grad_norm": 7.619176387786865, + "learning_rate": 4.6030316139990595e-05, + "loss": 0.1142, + "num_input_tokens_seen": 20543264, + "step": 35405 + }, + { + "epoch": 5.274054215072982, + "grad_norm": 4.428953170776367, + "learning_rate": 4.602855898576408e-05, + "loss": 0.0499, + "num_input_tokens_seen": 20545952, + "step": 35410 + }, + { + "epoch": 5.274798927613941, + "grad_norm": 72.53948974609375, + "learning_rate": 4.6026801476281436e-05, + "loss": 0.1759, + "num_input_tokens_seen": 20548736, + "step": 35415 + }, + { + "epoch": 5.2755436401549005, + "grad_norm": 20.535036087036133, + "learning_rate": 4.602504361157236e-05, + "loss": 0.1077, + "num_input_tokens_seen": 20551392, + "step": 35420 + }, + { + "epoch": 5.276288352695859, + "grad_norm": 73.77420806884766, + "learning_rate": 4.602328539166654e-05, + "loss": 0.0989, + "num_input_tokens_seen": 20554048, + "step": 35425 + }, + { + "epoch": 5.277033065236819, + "grad_norm": 18.5538387298584, + "learning_rate": 4.602152681659368e-05, + "loss": 0.6081, + "num_input_tokens_seen": 20557088, + "step": 35430 + }, + { + "epoch": 5.277777777777778, + "grad_norm": 8.958171844482422, + "learning_rate": 4.601976788638349e-05, + "loss": 0.3147, + "num_input_tokens_seen": 20560032, + "step": 35435 + }, + { + "epoch": 5.278522490318737, + "grad_norm": 29.578737258911133, + "learning_rate": 4.601800860106568e-05, + "loss": 0.0929, + "num_input_tokens_seen": 20563008, + "step": 35440 + }, + { + "epoch": 5.279267202859696, + "grad_norm": 73.80816650390625, + "learning_rate": 4.601624896066998e-05, + "loss": 0.2639, + "num_input_tokens_seen": 20565920, + "step": 35445 + }, + { + "epoch": 5.280011915400656, + "grad_norm": 0.050410978496074677, + "learning_rate": 4.601448896522611e-05, + "loss": 0.1828, + "num_input_tokens_seen": 20568768, + "step": 35450 + }, + { + "epoch": 5.2807566279416145, + "grad_norm": 0.013724016025662422, + "learning_rate": 4.601272861476381e-05, + "loss": 0.1726, + "num_input_tokens_seen": 20571648, + "step": 35455 + }, + { + "epoch": 5.281501340482574, + "grad_norm": 0.040842391550540924, + "learning_rate": 4.601096790931282e-05, + "loss": 0.1815, + "num_input_tokens_seen": 20574464, + "step": 35460 + }, + { + "epoch": 5.282246053023533, + "grad_norm": 20.13567352294922, + "learning_rate": 4.6009206848902874e-05, + "loss": 0.2582, + "num_input_tokens_seen": 20577216, + "step": 35465 + }, + { + "epoch": 5.282990765564492, + "grad_norm": 37.8677978515625, + "learning_rate": 4.6007445433563734e-05, + "loss": 0.4459, + "num_input_tokens_seen": 20579936, + "step": 35470 + }, + { + "epoch": 5.283735478105451, + "grad_norm": 24.855876922607422, + "learning_rate": 4.600568366332516e-05, + "loss": 0.3678, + "num_input_tokens_seen": 20582784, + "step": 35475 + }, + { + "epoch": 5.284480190646411, + "grad_norm": 1.6485532522201538, + "learning_rate": 4.60039215382169e-05, + "loss": 0.2118, + "num_input_tokens_seen": 20585632, + "step": 35480 + }, + { + "epoch": 5.28522490318737, + "grad_norm": 24.178815841674805, + "learning_rate": 4.6002159058268744e-05, + "loss": 0.2594, + "num_input_tokens_seen": 20588992, + "step": 35485 + }, + { + "epoch": 5.2859696157283285, + "grad_norm": 11.401144027709961, + "learning_rate": 4.600039622351045e-05, + "loss": 0.1927, + "num_input_tokens_seen": 20591872, + "step": 35490 + }, + { + "epoch": 5.286714328269288, + "grad_norm": 2.577812433242798, + "learning_rate": 4.59986330339718e-05, + "loss": 0.4036, + "num_input_tokens_seen": 20594624, + "step": 35495 + }, + { + "epoch": 5.287459040810247, + "grad_norm": 6.2720255851745605, + "learning_rate": 4.59968694896826e-05, + "loss": 0.3916, + "num_input_tokens_seen": 20597408, + "step": 35500 + }, + { + "epoch": 5.2882037533512065, + "grad_norm": 27.48726463317871, + "learning_rate": 4.599510559067263e-05, + "loss": 0.6606, + "num_input_tokens_seen": 20600320, + "step": 35505 + }, + { + "epoch": 5.288948465892165, + "grad_norm": 36.71559524536133, + "learning_rate": 4.599334133697167e-05, + "loss": 0.0828, + "num_input_tokens_seen": 20603168, + "step": 35510 + }, + { + "epoch": 5.289693178433125, + "grad_norm": 0.14847272634506226, + "learning_rate": 4.5991576728609565e-05, + "loss": 0.1777, + "num_input_tokens_seen": 20606208, + "step": 35515 + }, + { + "epoch": 5.290437890974084, + "grad_norm": 25.377817153930664, + "learning_rate": 4.5989811765616094e-05, + "loss": 0.3593, + "num_input_tokens_seen": 20609088, + "step": 35520 + }, + { + "epoch": 5.291182603515043, + "grad_norm": 53.71272659301758, + "learning_rate": 4.5988046448021096e-05, + "loss": 0.0823, + "num_input_tokens_seen": 20612064, + "step": 35525 + }, + { + "epoch": 5.291927316056002, + "grad_norm": 15.763373374938965, + "learning_rate": 4.598628077585438e-05, + "loss": 0.2574, + "num_input_tokens_seen": 20614880, + "step": 35530 + }, + { + "epoch": 5.292672028596962, + "grad_norm": 0.2946057915687561, + "learning_rate": 4.598451474914578e-05, + "loss": 0.3888, + "num_input_tokens_seen": 20617728, + "step": 35535 + }, + { + "epoch": 5.2934167411379205, + "grad_norm": 32.64773941040039, + "learning_rate": 4.598274836792513e-05, + "loss": 0.1838, + "num_input_tokens_seen": 20620576, + "step": 35540 + }, + { + "epoch": 5.29416145367888, + "grad_norm": 30.04231071472168, + "learning_rate": 4.5980981632222275e-05, + "loss": 0.4962, + "num_input_tokens_seen": 20623488, + "step": 35545 + }, + { + "epoch": 5.294906166219839, + "grad_norm": 0.8333996534347534, + "learning_rate": 4.5979214542067056e-05, + "loss": 0.1161, + "num_input_tokens_seen": 20626528, + "step": 35550 + }, + { + "epoch": 5.2956508787607985, + "grad_norm": 0.09460671991109848, + "learning_rate": 4.597744709748933e-05, + "loss": 0.2259, + "num_input_tokens_seen": 20629344, + "step": 35555 + }, + { + "epoch": 5.296395591301757, + "grad_norm": 0.2571311891078949, + "learning_rate": 4.597567929851896e-05, + "loss": 0.2981, + "num_input_tokens_seen": 20632064, + "step": 35560 + }, + { + "epoch": 5.297140303842717, + "grad_norm": 24.05492401123047, + "learning_rate": 4.59739111451858e-05, + "loss": 0.2926, + "num_input_tokens_seen": 20634816, + "step": 35565 + }, + { + "epoch": 5.297885016383676, + "grad_norm": 22.729860305786133, + "learning_rate": 4.5972142637519735e-05, + "loss": 0.3779, + "num_input_tokens_seen": 20637600, + "step": 35570 + }, + { + "epoch": 5.298629728924635, + "grad_norm": 0.08164916187524796, + "learning_rate": 4.597037377555063e-05, + "loss": 0.335, + "num_input_tokens_seen": 20640224, + "step": 35575 + }, + { + "epoch": 5.299374441465594, + "grad_norm": 110.34099578857422, + "learning_rate": 4.5968604559308374e-05, + "loss": 0.6088, + "num_input_tokens_seen": 20642752, + "step": 35580 + }, + { + "epoch": 5.300119154006554, + "grad_norm": 13.05034351348877, + "learning_rate": 4.596683498882286e-05, + "loss": 0.0816, + "num_input_tokens_seen": 20645600, + "step": 35585 + }, + { + "epoch": 5.3008638665475125, + "grad_norm": 32.70879364013672, + "learning_rate": 4.596506506412398e-05, + "loss": 0.0325, + "num_input_tokens_seen": 20648256, + "step": 35590 + }, + { + "epoch": 5.301608579088472, + "grad_norm": 14.213113784790039, + "learning_rate": 4.596329478524163e-05, + "loss": 0.2729, + "num_input_tokens_seen": 20651136, + "step": 35595 + }, + { + "epoch": 5.302353291629431, + "grad_norm": 23.39799690246582, + "learning_rate": 4.596152415220572e-05, + "loss": 0.2458, + "num_input_tokens_seen": 20654400, + "step": 35600 + }, + { + "epoch": 5.303098004170391, + "grad_norm": 4.792606353759766, + "learning_rate": 4.595975316504616e-05, + "loss": 0.0509, + "num_input_tokens_seen": 20657376, + "step": 35605 + }, + { + "epoch": 5.303842716711349, + "grad_norm": 48.97230911254883, + "learning_rate": 4.595798182379288e-05, + "loss": 0.2709, + "num_input_tokens_seen": 20660288, + "step": 35610 + }, + { + "epoch": 5.304587429252309, + "grad_norm": 4.202892780303955, + "learning_rate": 4.595621012847579e-05, + "loss": 0.2989, + "num_input_tokens_seen": 20663424, + "step": 35615 + }, + { + "epoch": 5.305332141793268, + "grad_norm": 0.062467850744724274, + "learning_rate": 4.5954438079124836e-05, + "loss": 0.2937, + "num_input_tokens_seen": 20666528, + "step": 35620 + }, + { + "epoch": 5.306076854334227, + "grad_norm": 104.84205627441406, + "learning_rate": 4.595266567576995e-05, + "loss": 0.4065, + "num_input_tokens_seen": 20669536, + "step": 35625 + }, + { + "epoch": 5.306821566875186, + "grad_norm": 47.606048583984375, + "learning_rate": 4.595089291844106e-05, + "loss": 0.0983, + "num_input_tokens_seen": 20672544, + "step": 35630 + }, + { + "epoch": 5.307566279416146, + "grad_norm": 1.550770878791809, + "learning_rate": 4.594911980716814e-05, + "loss": 0.2177, + "num_input_tokens_seen": 20675392, + "step": 35635 + }, + { + "epoch": 5.3083109919571045, + "grad_norm": 29.364765167236328, + "learning_rate": 4.594734634198112e-05, + "loss": 0.2573, + "num_input_tokens_seen": 20678432, + "step": 35640 + }, + { + "epoch": 5.309055704498064, + "grad_norm": 38.10824203491211, + "learning_rate": 4.594557252290998e-05, + "loss": 0.1929, + "num_input_tokens_seen": 20680992, + "step": 35645 + }, + { + "epoch": 5.309800417039023, + "grad_norm": 0.1389516443014145, + "learning_rate": 4.594379834998469e-05, + "loss": 0.3046, + "num_input_tokens_seen": 20683616, + "step": 35650 + }, + { + "epoch": 5.310545129579982, + "grad_norm": 62.67139434814453, + "learning_rate": 4.594202382323521e-05, + "loss": 0.3686, + "num_input_tokens_seen": 20686528, + "step": 35655 + }, + { + "epoch": 5.311289842120941, + "grad_norm": 1.4262065887451172, + "learning_rate": 4.594024894269151e-05, + "loss": 0.3693, + "num_input_tokens_seen": 20689312, + "step": 35660 + }, + { + "epoch": 5.3120345546619, + "grad_norm": 0.45780858397483826, + "learning_rate": 4.59384737083836e-05, + "loss": 0.272, + "num_input_tokens_seen": 20692288, + "step": 35665 + }, + { + "epoch": 5.31277926720286, + "grad_norm": 1.465613842010498, + "learning_rate": 4.5936698120341445e-05, + "loss": 0.177, + "num_input_tokens_seen": 20695296, + "step": 35670 + }, + { + "epoch": 5.3135239797438185, + "grad_norm": 11.789778709411621, + "learning_rate": 4.593492217859506e-05, + "loss": 0.1342, + "num_input_tokens_seen": 20698464, + "step": 35675 + }, + { + "epoch": 5.314268692284778, + "grad_norm": 6.565910816192627, + "learning_rate": 4.593314588317445e-05, + "loss": 0.29, + "num_input_tokens_seen": 20701824, + "step": 35680 + }, + { + "epoch": 5.315013404825737, + "grad_norm": 12.485593795776367, + "learning_rate": 4.5931369234109614e-05, + "loss": 0.3624, + "num_input_tokens_seen": 20704992, + "step": 35685 + }, + { + "epoch": 5.315758117366697, + "grad_norm": 0.44490036368370056, + "learning_rate": 4.592959223143056e-05, + "loss": 0.1093, + "num_input_tokens_seen": 20707840, + "step": 35690 + }, + { + "epoch": 5.316502829907655, + "grad_norm": 27.454057693481445, + "learning_rate": 4.592781487516732e-05, + "loss": 0.4453, + "num_input_tokens_seen": 20710816, + "step": 35695 + }, + { + "epoch": 5.317247542448615, + "grad_norm": 16.476228713989258, + "learning_rate": 4.592603716534992e-05, + "loss": 0.2886, + "num_input_tokens_seen": 20713472, + "step": 35700 + }, + { + "epoch": 5.317992254989574, + "grad_norm": 0.02186698466539383, + "learning_rate": 4.5924259102008386e-05, + "loss": 0.1826, + "num_input_tokens_seen": 20716352, + "step": 35705 + }, + { + "epoch": 5.318736967530533, + "grad_norm": 47.299320220947266, + "learning_rate": 4.592248068517276e-05, + "loss": 0.3172, + "num_input_tokens_seen": 20719392, + "step": 35710 + }, + { + "epoch": 5.319481680071492, + "grad_norm": 36.7721061706543, + "learning_rate": 4.59207019148731e-05, + "loss": 0.1945, + "num_input_tokens_seen": 20722528, + "step": 35715 + }, + { + "epoch": 5.320226392612452, + "grad_norm": 13.177299499511719, + "learning_rate": 4.591892279113943e-05, + "loss": 0.1426, + "num_input_tokens_seen": 20725408, + "step": 35720 + }, + { + "epoch": 5.3209711051534105, + "grad_norm": 22.76249122619629, + "learning_rate": 4.591714331400183e-05, + "loss": 0.3294, + "num_input_tokens_seen": 20728064, + "step": 35725 + }, + { + "epoch": 5.32171581769437, + "grad_norm": 4.4936017990112305, + "learning_rate": 4.5915363483490346e-05, + "loss": 0.5007, + "num_input_tokens_seen": 20731072, + "step": 35730 + }, + { + "epoch": 5.322460530235329, + "grad_norm": 5.912650108337402, + "learning_rate": 4.591358329963505e-05, + "loss": 0.1868, + "num_input_tokens_seen": 20733792, + "step": 35735 + }, + { + "epoch": 5.323205242776289, + "grad_norm": 1.335719347000122, + "learning_rate": 4.5911802762466034e-05, + "loss": 0.2633, + "num_input_tokens_seen": 20736768, + "step": 35740 + }, + { + "epoch": 5.323949955317247, + "grad_norm": 9.546452522277832, + "learning_rate": 4.5910021872013355e-05, + "loss": 0.2517, + "num_input_tokens_seen": 20739616, + "step": 35745 + }, + { + "epoch": 5.324694667858207, + "grad_norm": 0.2501820921897888, + "learning_rate": 4.590824062830711e-05, + "loss": 0.1627, + "num_input_tokens_seen": 20742496, + "step": 35750 + }, + { + "epoch": 5.325439380399166, + "grad_norm": 53.12807846069336, + "learning_rate": 4.590645903137739e-05, + "loss": 0.4236, + "num_input_tokens_seen": 20745568, + "step": 35755 + }, + { + "epoch": 5.326184092940125, + "grad_norm": 19.43980598449707, + "learning_rate": 4.590467708125429e-05, + "loss": 0.2859, + "num_input_tokens_seen": 20748448, + "step": 35760 + }, + { + "epoch": 5.326928805481084, + "grad_norm": 0.15858016908168793, + "learning_rate": 4.590289477796792e-05, + "loss": 0.2371, + "num_input_tokens_seen": 20751264, + "step": 35765 + }, + { + "epoch": 5.327673518022044, + "grad_norm": 4.738558769226074, + "learning_rate": 4.590111212154839e-05, + "loss": 0.1565, + "num_input_tokens_seen": 20754400, + "step": 35770 + }, + { + "epoch": 5.328418230563003, + "grad_norm": 0.3585267663002014, + "learning_rate": 4.589932911202581e-05, + "loss": 0.0575, + "num_input_tokens_seen": 20756992, + "step": 35775 + }, + { + "epoch": 5.329162943103962, + "grad_norm": 12.63279914855957, + "learning_rate": 4.5897545749430305e-05, + "loss": 0.2036, + "num_input_tokens_seen": 20759840, + "step": 35780 + }, + { + "epoch": 5.329907655644921, + "grad_norm": 8.69520378112793, + "learning_rate": 4.5895762033792e-05, + "loss": 0.174, + "num_input_tokens_seen": 20762848, + "step": 35785 + }, + { + "epoch": 5.330652368185881, + "grad_norm": 2.7170908451080322, + "learning_rate": 4.589397796514104e-05, + "loss": 0.1054, + "num_input_tokens_seen": 20765728, + "step": 35790 + }, + { + "epoch": 5.331397080726839, + "grad_norm": 26.506359100341797, + "learning_rate": 4.5892193543507556e-05, + "loss": 0.4942, + "num_input_tokens_seen": 20768480, + "step": 35795 + }, + { + "epoch": 5.332141793267799, + "grad_norm": 0.35455864667892456, + "learning_rate": 4.58904087689217e-05, + "loss": 0.3115, + "num_input_tokens_seen": 20771648, + "step": 35800 + }, + { + "epoch": 5.332886505808758, + "grad_norm": 0.23402667045593262, + "learning_rate": 4.5888623641413615e-05, + "loss": 0.6028, + "num_input_tokens_seen": 20774528, + "step": 35805 + }, + { + "epoch": 5.333631218349717, + "grad_norm": 36.125823974609375, + "learning_rate": 4.588683816101347e-05, + "loss": 0.1821, + "num_input_tokens_seen": 20777472, + "step": 35810 + }, + { + "epoch": 5.334375930890676, + "grad_norm": 6.637085914611816, + "learning_rate": 4.588505232775141e-05, + "loss": 0.2192, + "num_input_tokens_seen": 20780320, + "step": 35815 + }, + { + "epoch": 5.335120643431635, + "grad_norm": 78.35055541992188, + "learning_rate": 4.588326614165763e-05, + "loss": 0.1971, + "num_input_tokens_seen": 20783200, + "step": 35820 + }, + { + "epoch": 5.335865355972595, + "grad_norm": 0.45446401834487915, + "learning_rate": 4.5881479602762286e-05, + "loss": 0.2669, + "num_input_tokens_seen": 20786176, + "step": 35825 + }, + { + "epoch": 5.336610068513554, + "grad_norm": 20.495763778686523, + "learning_rate": 4.587969271109557e-05, + "loss": 0.387, + "num_input_tokens_seen": 20788928, + "step": 35830 + }, + { + "epoch": 5.337354781054513, + "grad_norm": 14.664353370666504, + "learning_rate": 4.5877905466687666e-05, + "loss": 0.2512, + "num_input_tokens_seen": 20792032, + "step": 35835 + }, + { + "epoch": 5.338099493595472, + "grad_norm": 8.45852279663086, + "learning_rate": 4.5876117869568766e-05, + "loss": 0.3826, + "num_input_tokens_seen": 20795456, + "step": 35840 + }, + { + "epoch": 5.338844206136431, + "grad_norm": 0.2577592432498932, + "learning_rate": 4.587432991976908e-05, + "loss": 0.0261, + "num_input_tokens_seen": 20798624, + "step": 35845 + }, + { + "epoch": 5.33958891867739, + "grad_norm": 8.714055061340332, + "learning_rate": 4.58725416173188e-05, + "loss": 0.3128, + "num_input_tokens_seen": 20801600, + "step": 35850 + }, + { + "epoch": 5.34033363121835, + "grad_norm": 0.2406923770904541, + "learning_rate": 4.587075296224814e-05, + "loss": 0.361, + "num_input_tokens_seen": 20804416, + "step": 35855 + }, + { + "epoch": 5.341078343759309, + "grad_norm": 7.628427505493164, + "learning_rate": 4.586896395458733e-05, + "loss": 0.1596, + "num_input_tokens_seen": 20807296, + "step": 35860 + }, + { + "epoch": 5.341823056300268, + "grad_norm": 36.69044876098633, + "learning_rate": 4.586717459436658e-05, + "loss": 0.3007, + "num_input_tokens_seen": 20810240, + "step": 35865 + }, + { + "epoch": 5.342567768841227, + "grad_norm": 73.15701293945312, + "learning_rate": 4.586538488161612e-05, + "loss": 0.2667, + "num_input_tokens_seen": 20813056, + "step": 35870 + }, + { + "epoch": 5.343312481382187, + "grad_norm": 0.1810556948184967, + "learning_rate": 4.58635948163662e-05, + "loss": 0.2298, + "num_input_tokens_seen": 20815936, + "step": 35875 + }, + { + "epoch": 5.344057193923145, + "grad_norm": 0.017705701291561127, + "learning_rate": 4.586180439864704e-05, + "loss": 0.2351, + "num_input_tokens_seen": 20818720, + "step": 35880 + }, + { + "epoch": 5.344801906464105, + "grad_norm": 13.674864768981934, + "learning_rate": 4.586001362848889e-05, + "loss": 0.1928, + "num_input_tokens_seen": 20821728, + "step": 35885 + }, + { + "epoch": 5.345546619005064, + "grad_norm": 9.085912704467773, + "learning_rate": 4.5858222505922026e-05, + "loss": 0.2723, + "num_input_tokens_seen": 20824320, + "step": 35890 + }, + { + "epoch": 5.346291331546023, + "grad_norm": 24.926958084106445, + "learning_rate": 4.585643103097669e-05, + "loss": 0.2214, + "num_input_tokens_seen": 20827360, + "step": 35895 + }, + { + "epoch": 5.347036044086982, + "grad_norm": 0.403763085603714, + "learning_rate": 4.5854639203683146e-05, + "loss": 0.373, + "num_input_tokens_seen": 20830208, + "step": 35900 + }, + { + "epoch": 5.347780756627942, + "grad_norm": 0.025672782212495804, + "learning_rate": 4.5852847024071664e-05, + "loss": 0.1578, + "num_input_tokens_seen": 20832864, + "step": 35905 + }, + { + "epoch": 5.348525469168901, + "grad_norm": 0.1577552706003189, + "learning_rate": 4.585105449217253e-05, + "loss": 0.1005, + "num_input_tokens_seen": 20836000, + "step": 35910 + }, + { + "epoch": 5.34927018170986, + "grad_norm": 20.090173721313477, + "learning_rate": 4.5849261608016026e-05, + "loss": 0.4595, + "num_input_tokens_seen": 20838944, + "step": 35915 + }, + { + "epoch": 5.350014894250819, + "grad_norm": 32.620479583740234, + "learning_rate": 4.584746837163243e-05, + "loss": 0.3988, + "num_input_tokens_seen": 20842304, + "step": 35920 + }, + { + "epoch": 5.350759606791779, + "grad_norm": 0.4774439334869385, + "learning_rate": 4.584567478305205e-05, + "loss": 0.0602, + "num_input_tokens_seen": 20844992, + "step": 35925 + }, + { + "epoch": 5.351504319332737, + "grad_norm": 4.7908735275268555, + "learning_rate": 4.584388084230518e-05, + "loss": 0.385, + "num_input_tokens_seen": 20847808, + "step": 35930 + }, + { + "epoch": 5.352249031873697, + "grad_norm": 98.38050079345703, + "learning_rate": 4.584208654942212e-05, + "loss": 0.2943, + "num_input_tokens_seen": 20850752, + "step": 35935 + }, + { + "epoch": 5.352993744414656, + "grad_norm": 16.446487426757812, + "learning_rate": 4.584029190443321e-05, + "loss": 0.0901, + "num_input_tokens_seen": 20853664, + "step": 35940 + }, + { + "epoch": 5.3537384569556155, + "grad_norm": 47.249732971191406, + "learning_rate": 4.583849690736873e-05, + "loss": 0.072, + "num_input_tokens_seen": 20856704, + "step": 35945 + }, + { + "epoch": 5.354483169496574, + "grad_norm": 0.30604222416877747, + "learning_rate": 4.583670155825903e-05, + "loss": 0.0849, + "num_input_tokens_seen": 20859584, + "step": 35950 + }, + { + "epoch": 5.355227882037534, + "grad_norm": 20.281246185302734, + "learning_rate": 4.5834905857134436e-05, + "loss": 0.3209, + "num_input_tokens_seen": 20862464, + "step": 35955 + }, + { + "epoch": 5.355972594578493, + "grad_norm": 13.106483459472656, + "learning_rate": 4.583310980402529e-05, + "loss": 0.1589, + "num_input_tokens_seen": 20865248, + "step": 35960 + }, + { + "epoch": 5.356717307119452, + "grad_norm": 20.65423011779785, + "learning_rate": 4.5831313398961915e-05, + "loss": 0.2183, + "num_input_tokens_seen": 20868064, + "step": 35965 + }, + { + "epoch": 5.357462019660411, + "grad_norm": 38.418846130371094, + "learning_rate": 4.5829516641974676e-05, + "loss": 0.3709, + "num_input_tokens_seen": 20870944, + "step": 35970 + }, + { + "epoch": 5.358206732201371, + "grad_norm": 38.13361358642578, + "learning_rate": 4.582771953309393e-05, + "loss": 0.296, + "num_input_tokens_seen": 20873760, + "step": 35975 + }, + { + "epoch": 5.358951444742329, + "grad_norm": 0.032047804445028305, + "learning_rate": 4.582592207235002e-05, + "loss": 0.3771, + "num_input_tokens_seen": 20876352, + "step": 35980 + }, + { + "epoch": 5.359696157283288, + "grad_norm": 45.138710021972656, + "learning_rate": 4.5824124259773336e-05, + "loss": 0.2606, + "num_input_tokens_seen": 20879104, + "step": 35985 + }, + { + "epoch": 5.360440869824248, + "grad_norm": 17.030576705932617, + "learning_rate": 4.582232609539423e-05, + "loss": 0.1493, + "num_input_tokens_seen": 20881920, + "step": 35990 + }, + { + "epoch": 5.3611855823652075, + "grad_norm": 0.7374563217163086, + "learning_rate": 4.582052757924309e-05, + "loss": 0.0846, + "num_input_tokens_seen": 20884928, + "step": 35995 + }, + { + "epoch": 5.361930294906166, + "grad_norm": 6.063135623931885, + "learning_rate": 4.5818728711350296e-05, + "loss": 0.1331, + "num_input_tokens_seen": 20887616, + "step": 36000 + }, + { + "epoch": 5.362675007447125, + "grad_norm": 24.910390853881836, + "learning_rate": 4.581692949174624e-05, + "loss": 0.567, + "num_input_tokens_seen": 20890624, + "step": 36005 + }, + { + "epoch": 5.363419719988085, + "grad_norm": 1.9250746965408325, + "learning_rate": 4.581512992046132e-05, + "loss": 0.3571, + "num_input_tokens_seen": 20893568, + "step": 36010 + }, + { + "epoch": 5.364164432529043, + "grad_norm": 0.40104684233665466, + "learning_rate": 4.5813329997525925e-05, + "loss": 0.2039, + "num_input_tokens_seen": 20896384, + "step": 36015 + }, + { + "epoch": 5.364909145070003, + "grad_norm": 23.977720260620117, + "learning_rate": 4.5811529722970484e-05, + "loss": 0.2986, + "num_input_tokens_seen": 20899232, + "step": 36020 + }, + { + "epoch": 5.365653857610962, + "grad_norm": 11.580052375793457, + "learning_rate": 4.5809729096825396e-05, + "loss": 0.3137, + "num_input_tokens_seen": 20902272, + "step": 36025 + }, + { + "epoch": 5.3663985701519215, + "grad_norm": 0.06530178338289261, + "learning_rate": 4.580792811912109e-05, + "loss": 0.0458, + "num_input_tokens_seen": 20904864, + "step": 36030 + }, + { + "epoch": 5.36714328269288, + "grad_norm": 5.080191612243652, + "learning_rate": 4.5806126789887984e-05, + "loss": 0.2459, + "num_input_tokens_seen": 20907424, + "step": 36035 + }, + { + "epoch": 5.36788799523384, + "grad_norm": 4.568727970123291, + "learning_rate": 4.580432510915651e-05, + "loss": 0.3209, + "num_input_tokens_seen": 20910112, + "step": 36040 + }, + { + "epoch": 5.368632707774799, + "grad_norm": 4.335169792175293, + "learning_rate": 4.580252307695711e-05, + "loss": 0.3467, + "num_input_tokens_seen": 20913184, + "step": 36045 + }, + { + "epoch": 5.369377420315758, + "grad_norm": 7.992758274078369, + "learning_rate": 4.580072069332022e-05, + "loss": 0.192, + "num_input_tokens_seen": 20916000, + "step": 36050 + }, + { + "epoch": 5.370122132856717, + "grad_norm": 23.407268524169922, + "learning_rate": 4.57989179582763e-05, + "loss": 0.2037, + "num_input_tokens_seen": 20918784, + "step": 36055 + }, + { + "epoch": 5.370866845397677, + "grad_norm": 21.844989776611328, + "learning_rate": 4.57971148718558e-05, + "loss": 0.6425, + "num_input_tokens_seen": 20921696, + "step": 36060 + }, + { + "epoch": 5.371611557938635, + "grad_norm": 12.46193790435791, + "learning_rate": 4.579531143408918e-05, + "loss": 0.0592, + "num_input_tokens_seen": 20924736, + "step": 36065 + }, + { + "epoch": 5.372356270479595, + "grad_norm": 0.31596365571022034, + "learning_rate": 4.579350764500691e-05, + "loss": 0.0649, + "num_input_tokens_seen": 20927808, + "step": 36070 + }, + { + "epoch": 5.373100983020554, + "grad_norm": 0.13209933042526245, + "learning_rate": 4.579170350463946e-05, + "loss": 0.1102, + "num_input_tokens_seen": 20930656, + "step": 36075 + }, + { + "epoch": 5.3738456955615135, + "grad_norm": 21.563705444335938, + "learning_rate": 4.5789899013017315e-05, + "loss": 0.1646, + "num_input_tokens_seen": 20933312, + "step": 36080 + }, + { + "epoch": 5.374590408102472, + "grad_norm": 0.17168624699115753, + "learning_rate": 4.578809417017095e-05, + "loss": 0.1032, + "num_input_tokens_seen": 20936608, + "step": 36085 + }, + { + "epoch": 5.375335120643432, + "grad_norm": 1.5510320663452148, + "learning_rate": 4.578628897613087e-05, + "loss": 0.0724, + "num_input_tokens_seen": 20939616, + "step": 36090 + }, + { + "epoch": 5.376079833184391, + "grad_norm": 44.57244110107422, + "learning_rate": 4.578448343092756e-05, + "loss": 0.0799, + "num_input_tokens_seen": 20942560, + "step": 36095 + }, + { + "epoch": 5.37682454572535, + "grad_norm": 1.4098882675170898, + "learning_rate": 4.5782677534591524e-05, + "loss": 0.2865, + "num_input_tokens_seen": 20945504, + "step": 36100 + }, + { + "epoch": 5.377569258266309, + "grad_norm": 20.612529754638672, + "learning_rate": 4.578087128715328e-05, + "loss": 0.5816, + "num_input_tokens_seen": 20948640, + "step": 36105 + }, + { + "epoch": 5.378313970807269, + "grad_norm": 7.309929370880127, + "learning_rate": 4.577906468864333e-05, + "loss": 0.1323, + "num_input_tokens_seen": 20951552, + "step": 36110 + }, + { + "epoch": 5.3790586833482275, + "grad_norm": 2.97758412361145, + "learning_rate": 4.577725773909221e-05, + "loss": 0.3136, + "num_input_tokens_seen": 20954112, + "step": 36115 + }, + { + "epoch": 5.379803395889187, + "grad_norm": 36.433067321777344, + "learning_rate": 4.577545043853042e-05, + "loss": 0.5828, + "num_input_tokens_seen": 20956864, + "step": 36120 + }, + { + "epoch": 5.380548108430146, + "grad_norm": 14.071752548217773, + "learning_rate": 4.577364278698852e-05, + "loss": 0.4441, + "num_input_tokens_seen": 20959840, + "step": 36125 + }, + { + "epoch": 5.3812928209711055, + "grad_norm": 3.2767088413238525, + "learning_rate": 4.577183478449705e-05, + "loss": 0.222, + "num_input_tokens_seen": 20962560, + "step": 36130 + }, + { + "epoch": 5.382037533512064, + "grad_norm": 40.09414291381836, + "learning_rate": 4.5770026431086524e-05, + "loss": 0.6917, + "num_input_tokens_seen": 20965760, + "step": 36135 + }, + { + "epoch": 5.382782246053024, + "grad_norm": 0.3543069660663605, + "learning_rate": 4.576821772678752e-05, + "loss": 0.1199, + "num_input_tokens_seen": 20968768, + "step": 36140 + }, + { + "epoch": 5.383526958593983, + "grad_norm": 70.0114517211914, + "learning_rate": 4.576640867163059e-05, + "loss": 0.2714, + "num_input_tokens_seen": 20971872, + "step": 36145 + }, + { + "epoch": 5.384271671134942, + "grad_norm": 3.8193700313568115, + "learning_rate": 4.5764599265646286e-05, + "loss": 0.1609, + "num_input_tokens_seen": 20974560, + "step": 36150 + }, + { + "epoch": 5.385016383675901, + "grad_norm": 0.04984971508383751, + "learning_rate": 4.576278950886518e-05, + "loss": 0.1441, + "num_input_tokens_seen": 20977536, + "step": 36155 + }, + { + "epoch": 5.385761096216861, + "grad_norm": 44.944862365722656, + "learning_rate": 4.576097940131785e-05, + "loss": 0.5397, + "num_input_tokens_seen": 20980160, + "step": 36160 + }, + { + "epoch": 5.3865058087578195, + "grad_norm": 42.567752838134766, + "learning_rate": 4.5759168943034875e-05, + "loss": 0.254, + "num_input_tokens_seen": 20983200, + "step": 36165 + }, + { + "epoch": 5.387250521298778, + "grad_norm": 26.523761749267578, + "learning_rate": 4.5757358134046835e-05, + "loss": 0.0521, + "num_input_tokens_seen": 20986016, + "step": 36170 + }, + { + "epoch": 5.387995233839738, + "grad_norm": 1.0655676126480103, + "learning_rate": 4.5755546974384336e-05, + "loss": 0.3202, + "num_input_tokens_seen": 20988768, + "step": 36175 + }, + { + "epoch": 5.388739946380697, + "grad_norm": 19.90929412841797, + "learning_rate": 4.575373546407795e-05, + "loss": 0.3304, + "num_input_tokens_seen": 20991744, + "step": 36180 + }, + { + "epoch": 5.389484658921656, + "grad_norm": 4.8735761642456055, + "learning_rate": 4.5751923603158305e-05, + "loss": 0.1595, + "num_input_tokens_seen": 20995168, + "step": 36185 + }, + { + "epoch": 5.390229371462615, + "grad_norm": 3.308455467224121, + "learning_rate": 4.5750111391656005e-05, + "loss": 0.4023, + "num_input_tokens_seen": 20998080, + "step": 36190 + }, + { + "epoch": 5.390974084003575, + "grad_norm": 9.751859664916992, + "learning_rate": 4.574829882960166e-05, + "loss": 0.235, + "num_input_tokens_seen": 21000672, + "step": 36195 + }, + { + "epoch": 5.3917187965445335, + "grad_norm": 6.1415581703186035, + "learning_rate": 4.5746485917025894e-05, + "loss": 0.1418, + "num_input_tokens_seen": 21003424, + "step": 36200 + }, + { + "epoch": 5.392463509085493, + "grad_norm": 0.12982578575611115, + "learning_rate": 4.574467265395933e-05, + "loss": 0.3027, + "num_input_tokens_seen": 21006240, + "step": 36205 + }, + { + "epoch": 5.393208221626452, + "grad_norm": 19.58502960205078, + "learning_rate": 4.574285904043261e-05, + "loss": 0.1839, + "num_input_tokens_seen": 21009312, + "step": 36210 + }, + { + "epoch": 5.3939529341674115, + "grad_norm": 48.23215103149414, + "learning_rate": 4.574104507647637e-05, + "loss": 0.5461, + "num_input_tokens_seen": 21012256, + "step": 36215 + }, + { + "epoch": 5.39469764670837, + "grad_norm": 21.977413177490234, + "learning_rate": 4.5739230762121255e-05, + "loss": 0.2436, + "num_input_tokens_seen": 21015232, + "step": 36220 + }, + { + "epoch": 5.39544235924933, + "grad_norm": 0.07743711769580841, + "learning_rate": 4.573741609739791e-05, + "loss": 0.1122, + "num_input_tokens_seen": 21017984, + "step": 36225 + }, + { + "epoch": 5.396187071790289, + "grad_norm": 0.03026198409497738, + "learning_rate": 4.5735601082336995e-05, + "loss": 0.2658, + "num_input_tokens_seen": 21020768, + "step": 36230 + }, + { + "epoch": 5.396931784331248, + "grad_norm": 3.4176392555236816, + "learning_rate": 4.573378571696918e-05, + "loss": 0.1043, + "num_input_tokens_seen": 21023840, + "step": 36235 + }, + { + "epoch": 5.397676496872207, + "grad_norm": 1.8279935121536255, + "learning_rate": 4.573197000132512e-05, + "loss": 0.0755, + "num_input_tokens_seen": 21026528, + "step": 36240 + }, + { + "epoch": 5.398421209413167, + "grad_norm": 9.467896461486816, + "learning_rate": 4.57301539354355e-05, + "loss": 0.0255, + "num_input_tokens_seen": 21029280, + "step": 36245 + }, + { + "epoch": 5.3991659219541255, + "grad_norm": 1.4553579092025757, + "learning_rate": 4.572833751933101e-05, + "loss": 0.2119, + "num_input_tokens_seen": 21032128, + "step": 36250 + }, + { + "epoch": 5.399910634495085, + "grad_norm": 39.90798568725586, + "learning_rate": 4.5726520753042314e-05, + "loss": 0.2544, + "num_input_tokens_seen": 21035200, + "step": 36255 + }, + { + "epoch": 5.400655347036044, + "grad_norm": 45.230648040771484, + "learning_rate": 4.572470363660012e-05, + "loss": 0.3772, + "num_input_tokens_seen": 21038208, + "step": 36260 + }, + { + "epoch": 5.4014000595770035, + "grad_norm": 1.9724006652832031, + "learning_rate": 4.572288617003512e-05, + "loss": 0.308, + "num_input_tokens_seen": 21041056, + "step": 36265 + }, + { + "epoch": 5.402144772117962, + "grad_norm": 11.414563179016113, + "learning_rate": 4.5721068353378016e-05, + "loss": 0.4824, + "num_input_tokens_seen": 21044160, + "step": 36270 + }, + { + "epoch": 5.402889484658922, + "grad_norm": 47.97809982299805, + "learning_rate": 4.571925018665953e-05, + "loss": 0.5847, + "num_input_tokens_seen": 21047072, + "step": 36275 + }, + { + "epoch": 5.403634197199881, + "grad_norm": 6.145528793334961, + "learning_rate": 4.5717431669910364e-05, + "loss": 0.3212, + "num_input_tokens_seen": 21049888, + "step": 36280 + }, + { + "epoch": 5.40437890974084, + "grad_norm": 15.148674011230469, + "learning_rate": 4.571561280316125e-05, + "loss": 0.3237, + "num_input_tokens_seen": 21052448, + "step": 36285 + }, + { + "epoch": 5.405123622281799, + "grad_norm": 0.30070555210113525, + "learning_rate": 4.571379358644291e-05, + "loss": 0.0214, + "num_input_tokens_seen": 21055264, + "step": 36290 + }, + { + "epoch": 5.405868334822759, + "grad_norm": 4.477605819702148, + "learning_rate": 4.571197401978608e-05, + "loss": 0.1686, + "num_input_tokens_seen": 21058048, + "step": 36295 + }, + { + "epoch": 5.4066130473637175, + "grad_norm": 0.48543718457221985, + "learning_rate": 4.5710154103221504e-05, + "loss": 0.4387, + "num_input_tokens_seen": 21061120, + "step": 36300 + }, + { + "epoch": 5.407357759904677, + "grad_norm": 0.40404990315437317, + "learning_rate": 4.570833383677991e-05, + "loss": 0.015, + "num_input_tokens_seen": 21064128, + "step": 36305 + }, + { + "epoch": 5.408102472445636, + "grad_norm": 12.559414863586426, + "learning_rate": 4.570651322049208e-05, + "loss": 0.2159, + "num_input_tokens_seen": 21066752, + "step": 36310 + }, + { + "epoch": 5.408847184986596, + "grad_norm": 16.56005859375, + "learning_rate": 4.570469225438875e-05, + "loss": 0.3546, + "num_input_tokens_seen": 21069728, + "step": 36315 + }, + { + "epoch": 5.409591897527554, + "grad_norm": 15.439422607421875, + "learning_rate": 4.570287093850068e-05, + "loss": 0.1826, + "num_input_tokens_seen": 21072384, + "step": 36320 + }, + { + "epoch": 5.410336610068514, + "grad_norm": 19.97586441040039, + "learning_rate": 4.570104927285865e-05, + "loss": 0.2452, + "num_input_tokens_seen": 21075456, + "step": 36325 + }, + { + "epoch": 5.411081322609473, + "grad_norm": 36.6965446472168, + "learning_rate": 4.5699227257493434e-05, + "loss": 0.1976, + "num_input_tokens_seen": 21078208, + "step": 36330 + }, + { + "epoch": 5.4118260351504315, + "grad_norm": 47.02595138549805, + "learning_rate": 4.5697404892435816e-05, + "loss": 0.0811, + "num_input_tokens_seen": 21081280, + "step": 36335 + }, + { + "epoch": 5.412570747691391, + "grad_norm": 7.846512317657471, + "learning_rate": 4.5695582177716566e-05, + "loss": 0.3298, + "num_input_tokens_seen": 21084192, + "step": 36340 + }, + { + "epoch": 5.413315460232351, + "grad_norm": 6.051339626312256, + "learning_rate": 4.56937591133665e-05, + "loss": 0.1505, + "num_input_tokens_seen": 21087264, + "step": 36345 + }, + { + "epoch": 5.4140601727733095, + "grad_norm": 30.72859001159668, + "learning_rate": 4.56919356994164e-05, + "loss": 0.4348, + "num_input_tokens_seen": 21090176, + "step": 36350 + }, + { + "epoch": 5.414804885314268, + "grad_norm": 0.0877128466963768, + "learning_rate": 4.569011193589707e-05, + "loss": 0.1336, + "num_input_tokens_seen": 21092960, + "step": 36355 + }, + { + "epoch": 5.415549597855228, + "grad_norm": 9.870335578918457, + "learning_rate": 4.568828782283934e-05, + "loss": 0.2463, + "num_input_tokens_seen": 21095712, + "step": 36360 + }, + { + "epoch": 5.416294310396187, + "grad_norm": 22.237712860107422, + "learning_rate": 4.5686463360274015e-05, + "loss": 0.133, + "num_input_tokens_seen": 21098656, + "step": 36365 + }, + { + "epoch": 5.417039022937146, + "grad_norm": 0.8790649175643921, + "learning_rate": 4.568463854823191e-05, + "loss": 0.1861, + "num_input_tokens_seen": 21101344, + "step": 36370 + }, + { + "epoch": 5.417783735478105, + "grad_norm": 0.04930805414915085, + "learning_rate": 4.5682813386743864e-05, + "loss": 0.1269, + "num_input_tokens_seen": 21104384, + "step": 36375 + }, + { + "epoch": 5.418528448019065, + "grad_norm": 12.1635160446167, + "learning_rate": 4.56809878758407e-05, + "loss": 0.3673, + "num_input_tokens_seen": 21107264, + "step": 36380 + }, + { + "epoch": 5.4192731605600235, + "grad_norm": 0.052673522382974625, + "learning_rate": 4.567916201555327e-05, + "loss": 0.2532, + "num_input_tokens_seen": 21110208, + "step": 36385 + }, + { + "epoch": 5.420017873100983, + "grad_norm": 9.159185409545898, + "learning_rate": 4.567733580591241e-05, + "loss": 0.4775, + "num_input_tokens_seen": 21113184, + "step": 36390 + }, + { + "epoch": 5.420762585641942, + "grad_norm": 47.877288818359375, + "learning_rate": 4.567550924694898e-05, + "loss": 0.2535, + "num_input_tokens_seen": 21116160, + "step": 36395 + }, + { + "epoch": 5.421507298182902, + "grad_norm": 3.761822462081909, + "learning_rate": 4.5673682338693836e-05, + "loss": 0.0968, + "num_input_tokens_seen": 21119040, + "step": 36400 + }, + { + "epoch": 5.42225201072386, + "grad_norm": 42.27016067504883, + "learning_rate": 4.567185508117784e-05, + "loss": 0.3329, + "num_input_tokens_seen": 21122400, + "step": 36405 + }, + { + "epoch": 5.42299672326482, + "grad_norm": 0.038627803325653076, + "learning_rate": 4.567002747443186e-05, + "loss": 0.2854, + "num_input_tokens_seen": 21125216, + "step": 36410 + }, + { + "epoch": 5.423741435805779, + "grad_norm": 1.7480530738830566, + "learning_rate": 4.5668199518486785e-05, + "loss": 0.3203, + "num_input_tokens_seen": 21127808, + "step": 36415 + }, + { + "epoch": 5.424486148346738, + "grad_norm": 0.13905887305736542, + "learning_rate": 4.566637121337347e-05, + "loss": 0.0977, + "num_input_tokens_seen": 21130784, + "step": 36420 + }, + { + "epoch": 5.425230860887697, + "grad_norm": 37.96747970581055, + "learning_rate": 4.566454255912283e-05, + "loss": 0.3104, + "num_input_tokens_seen": 21133728, + "step": 36425 + }, + { + "epoch": 5.425975573428657, + "grad_norm": 117.46800994873047, + "learning_rate": 4.5662713555765735e-05, + "loss": 0.6896, + "num_input_tokens_seen": 21136736, + "step": 36430 + }, + { + "epoch": 5.4267202859696155, + "grad_norm": 11.920714378356934, + "learning_rate": 4.56608842033331e-05, + "loss": 0.2764, + "num_input_tokens_seen": 21139392, + "step": 36435 + }, + { + "epoch": 5.427464998510575, + "grad_norm": 85.09602355957031, + "learning_rate": 4.565905450185583e-05, + "loss": 0.3748, + "num_input_tokens_seen": 21142432, + "step": 36440 + }, + { + "epoch": 5.428209711051534, + "grad_norm": 1.2502821683883667, + "learning_rate": 4.565722445136483e-05, + "loss": 0.2036, + "num_input_tokens_seen": 21145280, + "step": 36445 + }, + { + "epoch": 5.428954423592494, + "grad_norm": 2.82133150100708, + "learning_rate": 4.565539405189101e-05, + "loss": 0.139, + "num_input_tokens_seen": 21148384, + "step": 36450 + }, + { + "epoch": 5.429699136133452, + "grad_norm": 5.902145862579346, + "learning_rate": 4.5653563303465306e-05, + "loss": 0.3452, + "num_input_tokens_seen": 21151456, + "step": 36455 + }, + { + "epoch": 5.430443848674412, + "grad_norm": 13.865748405456543, + "learning_rate": 4.565173220611864e-05, + "loss": 0.2047, + "num_input_tokens_seen": 21154304, + "step": 36460 + }, + { + "epoch": 5.431188561215371, + "grad_norm": 20.468294143676758, + "learning_rate": 4.5649900759881956e-05, + "loss": 0.5072, + "num_input_tokens_seen": 21157408, + "step": 36465 + }, + { + "epoch": 5.43193327375633, + "grad_norm": 45.92643737792969, + "learning_rate": 4.564806896478617e-05, + "loss": 0.3234, + "num_input_tokens_seen": 21160448, + "step": 36470 + }, + { + "epoch": 5.432677986297289, + "grad_norm": 6.923809051513672, + "learning_rate": 4.564623682086226e-05, + "loss": 0.2573, + "num_input_tokens_seen": 21163392, + "step": 36475 + }, + { + "epoch": 5.433422698838249, + "grad_norm": 7.097025394439697, + "learning_rate": 4.564440432814116e-05, + "loss": 0.1518, + "num_input_tokens_seen": 21166336, + "step": 36480 + }, + { + "epoch": 5.434167411379208, + "grad_norm": 16.95315933227539, + "learning_rate": 4.5642571486653825e-05, + "loss": 0.1681, + "num_input_tokens_seen": 21169056, + "step": 36485 + }, + { + "epoch": 5.434912123920167, + "grad_norm": 21.70730209350586, + "learning_rate": 4.5640738296431224e-05, + "loss": 0.193, + "num_input_tokens_seen": 21171936, + "step": 36490 + }, + { + "epoch": 5.435656836461126, + "grad_norm": 9.460700035095215, + "learning_rate": 4.563890475750433e-05, + "loss": 0.3674, + "num_input_tokens_seen": 21174976, + "step": 36495 + }, + { + "epoch": 5.436401549002086, + "grad_norm": 18.907194137573242, + "learning_rate": 4.563707086990412e-05, + "loss": 0.0746, + "num_input_tokens_seen": 21177888, + "step": 36500 + }, + { + "epoch": 5.437146261543044, + "grad_norm": 6.062693119049072, + "learning_rate": 4.563523663366157e-05, + "loss": 0.1943, + "num_input_tokens_seen": 21180768, + "step": 36505 + }, + { + "epoch": 5.437890974084004, + "grad_norm": 12.847289085388184, + "learning_rate": 4.563340204880767e-05, + "loss": 0.114, + "num_input_tokens_seen": 21183872, + "step": 36510 + }, + { + "epoch": 5.438635686624963, + "grad_norm": 7.20272970199585, + "learning_rate": 4.563156711537341e-05, + "loss": 0.2209, + "num_input_tokens_seen": 21186816, + "step": 36515 + }, + { + "epoch": 5.4393803991659215, + "grad_norm": 40.62839889526367, + "learning_rate": 4.56297318333898e-05, + "loss": 0.3992, + "num_input_tokens_seen": 21189472, + "step": 36520 + }, + { + "epoch": 5.440125111706881, + "grad_norm": 0.054560501128435135, + "learning_rate": 4.562789620288783e-05, + "loss": 0.3466, + "num_input_tokens_seen": 21192224, + "step": 36525 + }, + { + "epoch": 5.44086982424784, + "grad_norm": 0.6250419020652771, + "learning_rate": 4.562606022389853e-05, + "loss": 0.0084, + "num_input_tokens_seen": 21194912, + "step": 36530 + }, + { + "epoch": 5.4416145367888, + "grad_norm": 0.2964535355567932, + "learning_rate": 4.5624223896452894e-05, + "loss": 0.1969, + "num_input_tokens_seen": 21198080, + "step": 36535 + }, + { + "epoch": 5.442359249329758, + "grad_norm": 2.6889820098876953, + "learning_rate": 4.5622387220581965e-05, + "loss": 0.078, + "num_input_tokens_seen": 21200896, + "step": 36540 + }, + { + "epoch": 5.443103961870718, + "grad_norm": 0.9939619898796082, + "learning_rate": 4.5620550196316757e-05, + "loss": 0.1661, + "num_input_tokens_seen": 21203776, + "step": 36545 + }, + { + "epoch": 5.443848674411677, + "grad_norm": 0.9648276567459106, + "learning_rate": 4.5618712823688316e-05, + "loss": 0.1994, + "num_input_tokens_seen": 21206496, + "step": 36550 + }, + { + "epoch": 5.444593386952636, + "grad_norm": 19.844161987304688, + "learning_rate": 4.561687510272767e-05, + "loss": 0.3093, + "num_input_tokens_seen": 21209440, + "step": 36555 + }, + { + "epoch": 5.445338099493595, + "grad_norm": 48.0582160949707, + "learning_rate": 4.5615037033465876e-05, + "loss": 0.534, + "num_input_tokens_seen": 21212480, + "step": 36560 + }, + { + "epoch": 5.446082812034555, + "grad_norm": 41.76436996459961, + "learning_rate": 4.5613198615933994e-05, + "loss": 0.5102, + "num_input_tokens_seen": 21215328, + "step": 36565 + }, + { + "epoch": 5.446827524575514, + "grad_norm": 25.983718872070312, + "learning_rate": 4.561135985016306e-05, + "loss": 0.2802, + "num_input_tokens_seen": 21218240, + "step": 36570 + }, + { + "epoch": 5.447572237116473, + "grad_norm": 11.518856048583984, + "learning_rate": 4.560952073618415e-05, + "loss": 0.0706, + "num_input_tokens_seen": 21221408, + "step": 36575 + }, + { + "epoch": 5.448316949657432, + "grad_norm": 30.166135787963867, + "learning_rate": 4.560768127402834e-05, + "loss": 0.2391, + "num_input_tokens_seen": 21224352, + "step": 36580 + }, + { + "epoch": 5.449061662198392, + "grad_norm": 8.580533981323242, + "learning_rate": 4.5605841463726695e-05, + "loss": 0.1659, + "num_input_tokens_seen": 21227360, + "step": 36585 + }, + { + "epoch": 5.44980637473935, + "grad_norm": 10.401205062866211, + "learning_rate": 4.5604001305310304e-05, + "loss": 0.1528, + "num_input_tokens_seen": 21229952, + "step": 36590 + }, + { + "epoch": 5.45055108728031, + "grad_norm": 24.175880432128906, + "learning_rate": 4.5602160798810256e-05, + "loss": 0.4032, + "num_input_tokens_seen": 21232800, + "step": 36595 + }, + { + "epoch": 5.451295799821269, + "grad_norm": 14.798871994018555, + "learning_rate": 4.5600319944257635e-05, + "loss": 0.4176, + "num_input_tokens_seen": 21235584, + "step": 36600 + }, + { + "epoch": 5.452040512362228, + "grad_norm": 24.99126434326172, + "learning_rate": 4.559847874168355e-05, + "loss": 0.063, + "num_input_tokens_seen": 21238656, + "step": 36605 + }, + { + "epoch": 5.452785224903187, + "grad_norm": 0.042208481580019, + "learning_rate": 4.55966371911191e-05, + "loss": 0.1104, + "num_input_tokens_seen": 21241408, + "step": 36610 + }, + { + "epoch": 5.453529937444147, + "grad_norm": 34.41680908203125, + "learning_rate": 4.5594795292595394e-05, + "loss": 0.4028, + "num_input_tokens_seen": 21244416, + "step": 36615 + }, + { + "epoch": 5.454274649985106, + "grad_norm": 26.090194702148438, + "learning_rate": 4.559295304614355e-05, + "loss": 0.4256, + "num_input_tokens_seen": 21247296, + "step": 36620 + }, + { + "epoch": 5.455019362526065, + "grad_norm": 0.3540121614933014, + "learning_rate": 4.559111045179471e-05, + "loss": 0.0935, + "num_input_tokens_seen": 21250432, + "step": 36625 + }, + { + "epoch": 5.455764075067024, + "grad_norm": 0.14746449887752533, + "learning_rate": 4.558926750957997e-05, + "loss": 0.295, + "num_input_tokens_seen": 21253216, + "step": 36630 + }, + { + "epoch": 5.456508787607984, + "grad_norm": 59.434303283691406, + "learning_rate": 4.558742421953049e-05, + "loss": 0.2041, + "num_input_tokens_seen": 21256096, + "step": 36635 + }, + { + "epoch": 5.457253500148942, + "grad_norm": 53.213287353515625, + "learning_rate": 4.55855805816774e-05, + "loss": 0.1934, + "num_input_tokens_seen": 21258720, + "step": 36640 + }, + { + "epoch": 5.457998212689902, + "grad_norm": 0.08495452255010605, + "learning_rate": 4.558373659605185e-05, + "loss": 0.1777, + "num_input_tokens_seen": 21261536, + "step": 36645 + }, + { + "epoch": 5.458742925230861, + "grad_norm": 109.99876403808594, + "learning_rate": 4.5581892262684984e-05, + "loss": 0.2552, + "num_input_tokens_seen": 21264864, + "step": 36650 + }, + { + "epoch": 5.4594876377718204, + "grad_norm": 14.05744743347168, + "learning_rate": 4.558004758160798e-05, + "loss": 0.416, + "num_input_tokens_seen": 21267712, + "step": 36655 + }, + { + "epoch": 5.460232350312779, + "grad_norm": 10.659344673156738, + "learning_rate": 4.5578202552851976e-05, + "loss": 0.3271, + "num_input_tokens_seen": 21270688, + "step": 36660 + }, + { + "epoch": 5.460977062853739, + "grad_norm": 32.592002868652344, + "learning_rate": 4.557635717644816e-05, + "loss": 0.1427, + "num_input_tokens_seen": 21273376, + "step": 36665 + }, + { + "epoch": 5.461721775394698, + "grad_norm": 0.20028731226921082, + "learning_rate": 4.557451145242769e-05, + "loss": 0.0166, + "num_input_tokens_seen": 21276128, + "step": 36670 + }, + { + "epoch": 5.462466487935657, + "grad_norm": 10.857451438903809, + "learning_rate": 4.557266538082178e-05, + "loss": 0.234, + "num_input_tokens_seen": 21278848, + "step": 36675 + }, + { + "epoch": 5.463211200476616, + "grad_norm": 0.03028184175491333, + "learning_rate": 4.557081896166159e-05, + "loss": 0.0725, + "num_input_tokens_seen": 21281728, + "step": 36680 + }, + { + "epoch": 5.463955913017575, + "grad_norm": 5.609703540802002, + "learning_rate": 4.556897219497832e-05, + "loss": 0.0377, + "num_input_tokens_seen": 21284480, + "step": 36685 + }, + { + "epoch": 5.464700625558534, + "grad_norm": 0.42859458923339844, + "learning_rate": 4.556712508080316e-05, + "loss": 0.2415, + "num_input_tokens_seen": 21287584, + "step": 36690 + }, + { + "epoch": 5.465445338099494, + "grad_norm": 0.08318416774272919, + "learning_rate": 4.556527761916735e-05, + "loss": 0.3207, + "num_input_tokens_seen": 21290560, + "step": 36695 + }, + { + "epoch": 5.466190050640453, + "grad_norm": 46.09033966064453, + "learning_rate": 4.556342981010205e-05, + "loss": 0.4666, + "num_input_tokens_seen": 21293824, + "step": 36700 + }, + { + "epoch": 5.466934763181412, + "grad_norm": 27.61327362060547, + "learning_rate": 4.5561581653638516e-05, + "loss": 0.2644, + "num_input_tokens_seen": 21296768, + "step": 36705 + }, + { + "epoch": 5.467679475722371, + "grad_norm": 14.482025146484375, + "learning_rate": 4.555973314980796e-05, + "loss": 0.0749, + "num_input_tokens_seen": 21300064, + "step": 36710 + }, + { + "epoch": 5.46842418826333, + "grad_norm": 26.956876754760742, + "learning_rate": 4.555788429864161e-05, + "loss": 0.0994, + "num_input_tokens_seen": 21303104, + "step": 36715 + }, + { + "epoch": 5.46916890080429, + "grad_norm": 9.098175048828125, + "learning_rate": 4.5556035100170683e-05, + "loss": 0.2264, + "num_input_tokens_seen": 21305920, + "step": 36720 + }, + { + "epoch": 5.469913613345248, + "grad_norm": 15.837931632995605, + "learning_rate": 4.555418555442645e-05, + "loss": 0.5596, + "num_input_tokens_seen": 21308640, + "step": 36725 + }, + { + "epoch": 5.470658325886208, + "grad_norm": 8.552943229675293, + "learning_rate": 4.555233566144014e-05, + "loss": 0.0605, + "num_input_tokens_seen": 21311552, + "step": 36730 + }, + { + "epoch": 5.471403038427167, + "grad_norm": 2.8767478466033936, + "learning_rate": 4.5550485421243006e-05, + "loss": 0.0806, + "num_input_tokens_seen": 21314464, + "step": 36735 + }, + { + "epoch": 5.4721477509681264, + "grad_norm": 21.180282592773438, + "learning_rate": 4.554863483386631e-05, + "loss": 0.5599, + "num_input_tokens_seen": 21317280, + "step": 36740 + }, + { + "epoch": 5.472892463509085, + "grad_norm": 0.0066930074244737625, + "learning_rate": 4.554678389934131e-05, + "loss": 0.0428, + "num_input_tokens_seen": 21320608, + "step": 36745 + }, + { + "epoch": 5.473637176050045, + "grad_norm": 17.889877319335938, + "learning_rate": 4.554493261769928e-05, + "loss": 0.3132, + "num_input_tokens_seen": 21323488, + "step": 36750 + }, + { + "epoch": 5.474381888591004, + "grad_norm": 11.744718551635742, + "learning_rate": 4.5543080988971484e-05, + "loss": 0.4425, + "num_input_tokens_seen": 21326176, + "step": 36755 + }, + { + "epoch": 5.475126601131963, + "grad_norm": 0.02540375106036663, + "learning_rate": 4.554122901318922e-05, + "loss": 0.0092, + "num_input_tokens_seen": 21328928, + "step": 36760 + }, + { + "epoch": 5.475871313672922, + "grad_norm": 6.646803379058838, + "learning_rate": 4.553937669038378e-05, + "loss": 0.3105, + "num_input_tokens_seen": 21331744, + "step": 36765 + }, + { + "epoch": 5.476616026213882, + "grad_norm": 42.18887710571289, + "learning_rate": 4.553752402058644e-05, + "loss": 0.4127, + "num_input_tokens_seen": 21334496, + "step": 36770 + }, + { + "epoch": 5.47736073875484, + "grad_norm": 25.321077346801758, + "learning_rate": 4.55356710038285e-05, + "loss": 0.3782, + "num_input_tokens_seen": 21337440, + "step": 36775 + }, + { + "epoch": 5.4781054512958, + "grad_norm": 11.116103172302246, + "learning_rate": 4.5533817640141275e-05, + "loss": 0.5457, + "num_input_tokens_seen": 21340320, + "step": 36780 + }, + { + "epoch": 5.478850163836759, + "grad_norm": 39.97165298461914, + "learning_rate": 4.553196392955606e-05, + "loss": 0.3345, + "num_input_tokens_seen": 21342944, + "step": 36785 + }, + { + "epoch": 5.4795948763777185, + "grad_norm": 5.921382427215576, + "learning_rate": 4.55301098721042e-05, + "loss": 0.4634, + "num_input_tokens_seen": 21345856, + "step": 36790 + }, + { + "epoch": 5.480339588918677, + "grad_norm": 40.42082977294922, + "learning_rate": 4.5528255467816994e-05, + "loss": 0.2197, + "num_input_tokens_seen": 21348992, + "step": 36795 + }, + { + "epoch": 5.481084301459637, + "grad_norm": 0.13714301586151123, + "learning_rate": 4.552640071672577e-05, + "loss": 0.2785, + "num_input_tokens_seen": 21351648, + "step": 36800 + }, + { + "epoch": 5.481829014000596, + "grad_norm": 0.1499028503894806, + "learning_rate": 4.552454561886187e-05, + "loss": 0.2039, + "num_input_tokens_seen": 21354272, + "step": 36805 + }, + { + "epoch": 5.482573726541555, + "grad_norm": 21.700788497924805, + "learning_rate": 4.5522690174256635e-05, + "loss": 0.1046, + "num_input_tokens_seen": 21356960, + "step": 36810 + }, + { + "epoch": 5.483318439082514, + "grad_norm": 21.427928924560547, + "learning_rate": 4.55208343829414e-05, + "loss": 0.1178, + "num_input_tokens_seen": 21360096, + "step": 36815 + }, + { + "epoch": 5.484063151623474, + "grad_norm": 6.477859973907471, + "learning_rate": 4.551897824494753e-05, + "loss": 0.0945, + "num_input_tokens_seen": 21363584, + "step": 36820 + }, + { + "epoch": 5.4848078641644324, + "grad_norm": 1.0307793617248535, + "learning_rate": 4.551712176030638e-05, + "loss": 0.3295, + "num_input_tokens_seen": 21366592, + "step": 36825 + }, + { + "epoch": 5.485552576705392, + "grad_norm": 37.229496002197266, + "learning_rate": 4.551526492904931e-05, + "loss": 0.2593, + "num_input_tokens_seen": 21369440, + "step": 36830 + }, + { + "epoch": 5.486297289246351, + "grad_norm": 59.293212890625, + "learning_rate": 4.551340775120768e-05, + "loss": 0.1495, + "num_input_tokens_seen": 21372256, + "step": 36835 + }, + { + "epoch": 5.4870420017873105, + "grad_norm": 26.247955322265625, + "learning_rate": 4.551155022681288e-05, + "loss": 0.5642, + "num_input_tokens_seen": 21374848, + "step": 36840 + }, + { + "epoch": 5.487786714328269, + "grad_norm": 0.10773292183876038, + "learning_rate": 4.5509692355896296e-05, + "loss": 0.0512, + "num_input_tokens_seen": 21377728, + "step": 36845 + }, + { + "epoch": 5.488531426869228, + "grad_norm": 20.217967987060547, + "learning_rate": 4.550783413848929e-05, + "loss": 0.2902, + "num_input_tokens_seen": 21380384, + "step": 36850 + }, + { + "epoch": 5.489276139410188, + "grad_norm": 16.604076385498047, + "learning_rate": 4.550597557462328e-05, + "loss": 0.2455, + "num_input_tokens_seen": 21383136, + "step": 36855 + }, + { + "epoch": 5.490020851951147, + "grad_norm": 15.189139366149902, + "learning_rate": 4.5504116664329656e-05, + "loss": 0.4732, + "num_input_tokens_seen": 21385920, + "step": 36860 + }, + { + "epoch": 5.490765564492106, + "grad_norm": 24.09340476989746, + "learning_rate": 4.550225740763981e-05, + "loss": 0.4122, + "num_input_tokens_seen": 21388576, + "step": 36865 + }, + { + "epoch": 5.491510277033065, + "grad_norm": 19.488798141479492, + "learning_rate": 4.5500397804585166e-05, + "loss": 0.1994, + "num_input_tokens_seen": 21391456, + "step": 36870 + }, + { + "epoch": 5.4922549895740245, + "grad_norm": 18.61355209350586, + "learning_rate": 4.5498537855197145e-05, + "loss": 0.0703, + "num_input_tokens_seen": 21394304, + "step": 36875 + }, + { + "epoch": 5.492999702114983, + "grad_norm": 27.72475242614746, + "learning_rate": 4.549667755950715e-05, + "loss": 0.376, + "num_input_tokens_seen": 21397024, + "step": 36880 + }, + { + "epoch": 5.493744414655943, + "grad_norm": 24.329845428466797, + "learning_rate": 4.5494816917546625e-05, + "loss": 0.2834, + "num_input_tokens_seen": 21399872, + "step": 36885 + }, + { + "epoch": 5.494489127196902, + "grad_norm": 2.025702714920044, + "learning_rate": 4.549295592934699e-05, + "loss": 0.5566, + "num_input_tokens_seen": 21402784, + "step": 36890 + }, + { + "epoch": 5.495233839737861, + "grad_norm": 14.589418411254883, + "learning_rate": 4.5491094594939705e-05, + "loss": 0.1502, + "num_input_tokens_seen": 21405984, + "step": 36895 + }, + { + "epoch": 5.49597855227882, + "grad_norm": 9.248529434204102, + "learning_rate": 4.5489232914356196e-05, + "loss": 0.391, + "num_input_tokens_seen": 21409120, + "step": 36900 + }, + { + "epoch": 5.49672326481978, + "grad_norm": 1.1007328033447266, + "learning_rate": 4.548737088762792e-05, + "loss": 0.1512, + "num_input_tokens_seen": 21411680, + "step": 36905 + }, + { + "epoch": 5.4974679773607384, + "grad_norm": 5.754781723022461, + "learning_rate": 4.548550851478634e-05, + "loss": 0.2947, + "num_input_tokens_seen": 21414752, + "step": 36910 + }, + { + "epoch": 5.498212689901698, + "grad_norm": 32.02296447753906, + "learning_rate": 4.548364579586291e-05, + "loss": 0.0426, + "num_input_tokens_seen": 21417504, + "step": 36915 + }, + { + "epoch": 5.498957402442657, + "grad_norm": 103.2111587524414, + "learning_rate": 4.548178273088911e-05, + "loss": 0.0612, + "num_input_tokens_seen": 21420736, + "step": 36920 + }, + { + "epoch": 5.4997021149836165, + "grad_norm": 0.24386021494865417, + "learning_rate": 4.54799193198964e-05, + "loss": 0.0037, + "num_input_tokens_seen": 21423680, + "step": 36925 + }, + { + "epoch": 5.500446827524575, + "grad_norm": 110.59529113769531, + "learning_rate": 4.547805556291627e-05, + "loss": 0.1235, + "num_input_tokens_seen": 21426464, + "step": 36930 + }, + { + "epoch": 5.501191540065535, + "grad_norm": 0.12223777920007706, + "learning_rate": 4.54761914599802e-05, + "loss": 0.4761, + "num_input_tokens_seen": 21429632, + "step": 36935 + }, + { + "epoch": 5.501936252606494, + "grad_norm": 0.06783416122198105, + "learning_rate": 4.54743270111197e-05, + "loss": 0.2266, + "num_input_tokens_seen": 21432928, + "step": 36940 + }, + { + "epoch": 5.502680965147453, + "grad_norm": 0.022939002141356468, + "learning_rate": 4.547246221636624e-05, + "loss": 0.2349, + "num_input_tokens_seen": 21436032, + "step": 36945 + }, + { + "epoch": 5.503425677688412, + "grad_norm": 26.61278533935547, + "learning_rate": 4.5470597075751345e-05, + "loss": 0.3442, + "num_input_tokens_seen": 21438976, + "step": 36950 + }, + { + "epoch": 5.504170390229372, + "grad_norm": 0.040660541504621506, + "learning_rate": 4.5468731589306516e-05, + "loss": 0.212, + "num_input_tokens_seen": 21442016, + "step": 36955 + }, + { + "epoch": 5.5049151027703305, + "grad_norm": 0.3666742146015167, + "learning_rate": 4.546686575706327e-05, + "loss": 0.0229, + "num_input_tokens_seen": 21444768, + "step": 36960 + }, + { + "epoch": 5.50565981531129, + "grad_norm": 1.132315754890442, + "learning_rate": 4.546499957905313e-05, + "loss": 0.2865, + "num_input_tokens_seen": 21447456, + "step": 36965 + }, + { + "epoch": 5.506404527852249, + "grad_norm": 31.740583419799805, + "learning_rate": 4.546313305530762e-05, + "loss": 0.1351, + "num_input_tokens_seen": 21450656, + "step": 36970 + }, + { + "epoch": 5.5071492403932085, + "grad_norm": 1.1989184617996216, + "learning_rate": 4.546126618585828e-05, + "loss": 0.3636, + "num_input_tokens_seen": 21453600, + "step": 36975 + }, + { + "epoch": 5.507893952934167, + "grad_norm": 27.525827407836914, + "learning_rate": 4.5459398970736636e-05, + "loss": 0.0525, + "num_input_tokens_seen": 21456640, + "step": 36980 + }, + { + "epoch": 5.508638665475127, + "grad_norm": 21.035499572753906, + "learning_rate": 4.545753140997424e-05, + "loss": 0.4387, + "num_input_tokens_seen": 21459392, + "step": 36985 + }, + { + "epoch": 5.509383378016086, + "grad_norm": 16.127946853637695, + "learning_rate": 4.545566350360265e-05, + "loss": 0.1731, + "num_input_tokens_seen": 21462656, + "step": 36990 + }, + { + "epoch": 5.510128090557045, + "grad_norm": 0.06620416045188904, + "learning_rate": 4.5453795251653416e-05, + "loss": 0.3756, + "num_input_tokens_seen": 21465376, + "step": 36995 + }, + { + "epoch": 5.510872803098004, + "grad_norm": 0.0816991999745369, + "learning_rate": 4.545192665415809e-05, + "loss": 0.3827, + "num_input_tokens_seen": 21468224, + "step": 37000 + }, + { + "epoch": 5.511617515638964, + "grad_norm": 18.18951988220215, + "learning_rate": 4.545005771114826e-05, + "loss": 0.3297, + "num_input_tokens_seen": 21470848, + "step": 37005 + }, + { + "epoch": 5.5123622281799225, + "grad_norm": 18.32428550720215, + "learning_rate": 4.544818842265548e-05, + "loss": 0.2665, + "num_input_tokens_seen": 21473504, + "step": 37010 + }, + { + "epoch": 5.513106940720881, + "grad_norm": 81.45669555664062, + "learning_rate": 4.544631878871135e-05, + "loss": 0.452, + "num_input_tokens_seen": 21476384, + "step": 37015 + }, + { + "epoch": 5.513851653261841, + "grad_norm": 42.818424224853516, + "learning_rate": 4.544444880934744e-05, + "loss": 0.1834, + "num_input_tokens_seen": 21478912, + "step": 37020 + }, + { + "epoch": 5.5145963658028005, + "grad_norm": 66.94744873046875, + "learning_rate": 4.5442578484595346e-05, + "loss": 0.3807, + "num_input_tokens_seen": 21481664, + "step": 37025 + }, + { + "epoch": 5.515341078343759, + "grad_norm": 0.44388115406036377, + "learning_rate": 4.544070781448666e-05, + "loss": 0.1431, + "num_input_tokens_seen": 21484576, + "step": 37030 + }, + { + "epoch": 5.516085790884718, + "grad_norm": 0.7571584582328796, + "learning_rate": 4.5438836799053e-05, + "loss": 0.268, + "num_input_tokens_seen": 21487552, + "step": 37035 + }, + { + "epoch": 5.516830503425678, + "grad_norm": 0.10239581018686295, + "learning_rate": 4.5436965438325953e-05, + "loss": 0.3169, + "num_input_tokens_seen": 21490432, + "step": 37040 + }, + { + "epoch": 5.517575215966637, + "grad_norm": 8.128478050231934, + "learning_rate": 4.543509373233715e-05, + "loss": 0.1783, + "num_input_tokens_seen": 21493088, + "step": 37045 + }, + { + "epoch": 5.518319928507596, + "grad_norm": 0.24791721999645233, + "learning_rate": 4.5433221681118215e-05, + "loss": 0.2337, + "num_input_tokens_seen": 21496064, + "step": 37050 + }, + { + "epoch": 5.519064641048555, + "grad_norm": 0.05014318227767944, + "learning_rate": 4.5431349284700764e-05, + "loss": 0.1007, + "num_input_tokens_seen": 21498784, + "step": 37055 + }, + { + "epoch": 5.5198093535895145, + "grad_norm": 24.266828536987305, + "learning_rate": 4.542947654311643e-05, + "loss": 0.1653, + "num_input_tokens_seen": 21501824, + "step": 37060 + }, + { + "epoch": 5.520554066130473, + "grad_norm": 25.934343338012695, + "learning_rate": 4.542760345639686e-05, + "loss": 0.2661, + "num_input_tokens_seen": 21505056, + "step": 37065 + }, + { + "epoch": 5.521298778671433, + "grad_norm": 23.6604061126709, + "learning_rate": 4.542573002457368e-05, + "loss": 0.1861, + "num_input_tokens_seen": 21507680, + "step": 37070 + }, + { + "epoch": 5.522043491212392, + "grad_norm": 9.998400688171387, + "learning_rate": 4.5423856247678556e-05, + "loss": 0.0947, + "num_input_tokens_seen": 21510400, + "step": 37075 + }, + { + "epoch": 5.522788203753351, + "grad_norm": 6.103919982910156, + "learning_rate": 4.542198212574314e-05, + "loss": 0.2647, + "num_input_tokens_seen": 21513152, + "step": 37080 + }, + { + "epoch": 5.52353291629431, + "grad_norm": 3.1907029151916504, + "learning_rate": 4.5420107658799094e-05, + "loss": 0.0844, + "num_input_tokens_seen": 21516160, + "step": 37085 + }, + { + "epoch": 5.52427762883527, + "grad_norm": 5.371120452880859, + "learning_rate": 4.541823284687808e-05, + "loss": 0.0856, + "num_input_tokens_seen": 21519008, + "step": 37090 + }, + { + "epoch": 5.5250223413762285, + "grad_norm": 35.12039566040039, + "learning_rate": 4.541635769001178e-05, + "loss": 0.4227, + "num_input_tokens_seen": 21521824, + "step": 37095 + }, + { + "epoch": 5.525767053917188, + "grad_norm": 0.245331272482872, + "learning_rate": 4.5414482188231864e-05, + "loss": 0.1336, + "num_input_tokens_seen": 21524672, + "step": 37100 + }, + { + "epoch": 5.526511766458147, + "grad_norm": 29.56928062438965, + "learning_rate": 4.5412606341570016e-05, + "loss": 0.3156, + "num_input_tokens_seen": 21527488, + "step": 37105 + }, + { + "epoch": 5.5272564789991065, + "grad_norm": 29.558944702148438, + "learning_rate": 4.5410730150057935e-05, + "loss": 0.2982, + "num_input_tokens_seen": 21530528, + "step": 37110 + }, + { + "epoch": 5.528001191540065, + "grad_norm": 12.529571533203125, + "learning_rate": 4.5408853613727307e-05, + "loss": 0.2005, + "num_input_tokens_seen": 21533280, + "step": 37115 + }, + { + "epoch": 5.528745904081025, + "grad_norm": 79.12821960449219, + "learning_rate": 4.540697673260984e-05, + "loss": 0.1297, + "num_input_tokens_seen": 21535936, + "step": 37120 + }, + { + "epoch": 5.529490616621984, + "grad_norm": 0.028314122930169106, + "learning_rate": 4.5405099506737244e-05, + "loss": 0.3523, + "num_input_tokens_seen": 21538720, + "step": 37125 + }, + { + "epoch": 5.530235329162943, + "grad_norm": 25.650468826293945, + "learning_rate": 4.540322193614123e-05, + "loss": 0.3649, + "num_input_tokens_seen": 21541728, + "step": 37130 + }, + { + "epoch": 5.530980041703902, + "grad_norm": 12.764616012573242, + "learning_rate": 4.540134402085352e-05, + "loss": 0.322, + "num_input_tokens_seen": 21544768, + "step": 37135 + }, + { + "epoch": 5.531724754244862, + "grad_norm": 14.593335151672363, + "learning_rate": 4.539946576090584e-05, + "loss": 0.3596, + "num_input_tokens_seen": 21547840, + "step": 37140 + }, + { + "epoch": 5.5324694667858205, + "grad_norm": 25.194896697998047, + "learning_rate": 4.539758715632992e-05, + "loss": 0.7919, + "num_input_tokens_seen": 21551872, + "step": 37145 + }, + { + "epoch": 5.53321417932678, + "grad_norm": 24.74688720703125, + "learning_rate": 4.539570820715749e-05, + "loss": 0.3824, + "num_input_tokens_seen": 21554880, + "step": 37150 + }, + { + "epoch": 5.533958891867739, + "grad_norm": 1.5986155271530151, + "learning_rate": 4.53938289134203e-05, + "loss": 0.0794, + "num_input_tokens_seen": 21557632, + "step": 37155 + }, + { + "epoch": 5.534703604408699, + "grad_norm": 28.696063995361328, + "learning_rate": 4.5391949275150104e-05, + "loss": 0.3833, + "num_input_tokens_seen": 21560352, + "step": 37160 + }, + { + "epoch": 5.535448316949657, + "grad_norm": 5.431529521942139, + "learning_rate": 4.539006929237864e-05, + "loss": 0.0744, + "num_input_tokens_seen": 21563104, + "step": 37165 + }, + { + "epoch": 5.536193029490617, + "grad_norm": 0.3443866968154907, + "learning_rate": 4.538818896513769e-05, + "loss": 0.3536, + "num_input_tokens_seen": 21565952, + "step": 37170 + }, + { + "epoch": 5.536937742031576, + "grad_norm": 30.79537582397461, + "learning_rate": 4.5386308293459e-05, + "loss": 0.4002, + "num_input_tokens_seen": 21568704, + "step": 37175 + }, + { + "epoch": 5.537682454572535, + "grad_norm": 25.697799682617188, + "learning_rate": 4.5384427277374355e-05, + "loss": 0.4967, + "num_input_tokens_seen": 21571584, + "step": 37180 + }, + { + "epoch": 5.538427167113494, + "grad_norm": 0.2698490023612976, + "learning_rate": 4.538254591691553e-05, + "loss": 0.3192, + "num_input_tokens_seen": 21574560, + "step": 37185 + }, + { + "epoch": 5.539171879654454, + "grad_norm": 7.031741619110107, + "learning_rate": 4.538066421211431e-05, + "loss": 0.2068, + "num_input_tokens_seen": 21577504, + "step": 37190 + }, + { + "epoch": 5.5399165921954125, + "grad_norm": 39.75946044921875, + "learning_rate": 4.5378782163002476e-05, + "loss": 0.435, + "num_input_tokens_seen": 21580288, + "step": 37195 + }, + { + "epoch": 5.540661304736371, + "grad_norm": 25.847604751586914, + "learning_rate": 4.537689976961184e-05, + "loss": 0.2822, + "num_input_tokens_seen": 21583264, + "step": 37200 + }, + { + "epoch": 5.541406017277331, + "grad_norm": 4.941544055938721, + "learning_rate": 4.537501703197418e-05, + "loss": 0.1106, + "num_input_tokens_seen": 21586080, + "step": 37205 + }, + { + "epoch": 5.542150729818291, + "grad_norm": 10.066471099853516, + "learning_rate": 4.5373133950121314e-05, + "loss": 0.259, + "num_input_tokens_seen": 21588768, + "step": 37210 + }, + { + "epoch": 5.542895442359249, + "grad_norm": 24.394338607788086, + "learning_rate": 4.537125052408506e-05, + "loss": 0.2498, + "num_input_tokens_seen": 21591616, + "step": 37215 + }, + { + "epoch": 5.543640154900208, + "grad_norm": 29.29457664489746, + "learning_rate": 4.536936675389724e-05, + "loss": 0.4605, + "num_input_tokens_seen": 21594720, + "step": 37220 + }, + { + "epoch": 5.544384867441168, + "grad_norm": 34.6280517578125, + "learning_rate": 4.5367482639589665e-05, + "loss": 0.4707, + "num_input_tokens_seen": 21597408, + "step": 37225 + }, + { + "epoch": 5.5451295799821265, + "grad_norm": 12.54964542388916, + "learning_rate": 4.536559818119418e-05, + "loss": 0.1439, + "num_input_tokens_seen": 21600096, + "step": 37230 + }, + { + "epoch": 5.545874292523086, + "grad_norm": 12.754754066467285, + "learning_rate": 4.53637133787426e-05, + "loss": 0.4188, + "num_input_tokens_seen": 21603072, + "step": 37235 + }, + { + "epoch": 5.546619005064045, + "grad_norm": 22.28994369506836, + "learning_rate": 4.536182823226678e-05, + "loss": 0.285, + "num_input_tokens_seen": 21606112, + "step": 37240 + }, + { + "epoch": 5.547363717605005, + "grad_norm": 30.342029571533203, + "learning_rate": 4.535994274179858e-05, + "loss": 0.443, + "num_input_tokens_seen": 21609600, + "step": 37245 + }, + { + "epoch": 5.548108430145963, + "grad_norm": 10.258237838745117, + "learning_rate": 4.535805690736983e-05, + "loss": 0.1114, + "num_input_tokens_seen": 21612576, + "step": 37250 + }, + { + "epoch": 5.548853142686923, + "grad_norm": 35.1461067199707, + "learning_rate": 4.535617072901239e-05, + "loss": 0.1433, + "num_input_tokens_seen": 21615264, + "step": 37255 + }, + { + "epoch": 5.549597855227882, + "grad_norm": 10.02383041381836, + "learning_rate": 4.535428420675816e-05, + "loss": 0.1159, + "num_input_tokens_seen": 21617984, + "step": 37260 + }, + { + "epoch": 5.550342567768841, + "grad_norm": 10.213496208190918, + "learning_rate": 4.535239734063896e-05, + "loss": 0.4698, + "num_input_tokens_seen": 21620896, + "step": 37265 + }, + { + "epoch": 5.5510872803098, + "grad_norm": 27.636634826660156, + "learning_rate": 4.535051013068671e-05, + "loss": 0.1428, + "num_input_tokens_seen": 21623872, + "step": 37270 + }, + { + "epoch": 5.55183199285076, + "grad_norm": 10.951092720031738, + "learning_rate": 4.5348622576933265e-05, + "loss": 0.3075, + "num_input_tokens_seen": 21627232, + "step": 37275 + }, + { + "epoch": 5.5525767053917185, + "grad_norm": 1.6371543407440186, + "learning_rate": 4.534673467941053e-05, + "loss": 0.4473, + "num_input_tokens_seen": 21630176, + "step": 37280 + }, + { + "epoch": 5.553321417932678, + "grad_norm": 77.49311065673828, + "learning_rate": 4.534484643815038e-05, + "loss": 0.4488, + "num_input_tokens_seen": 21632832, + "step": 37285 + }, + { + "epoch": 5.554066130473637, + "grad_norm": 48.238426208496094, + "learning_rate": 4.534295785318474e-05, + "loss": 0.2385, + "num_input_tokens_seen": 21635744, + "step": 37290 + }, + { + "epoch": 5.554810843014597, + "grad_norm": 179.77056884765625, + "learning_rate": 4.53410689245455e-05, + "loss": 0.0871, + "num_input_tokens_seen": 21638464, + "step": 37295 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 10.505155563354492, + "learning_rate": 4.5339179652264576e-05, + "loss": 0.6628, + "num_input_tokens_seen": 21641344, + "step": 37300 + }, + { + "epoch": 5.556300268096515, + "grad_norm": 0.12250462174415588, + "learning_rate": 4.5337290036373875e-05, + "loss": 0.4023, + "num_input_tokens_seen": 21644224, + "step": 37305 + }, + { + "epoch": 5.557044980637474, + "grad_norm": 12.71044635772705, + "learning_rate": 4.533540007690533e-05, + "loss": 0.4977, + "num_input_tokens_seen": 21647232, + "step": 37310 + }, + { + "epoch": 5.557789693178433, + "grad_norm": 26.097780227661133, + "learning_rate": 4.533350977389087e-05, + "loss": 0.6021, + "num_input_tokens_seen": 21650336, + "step": 37315 + }, + { + "epoch": 5.558534405719392, + "grad_norm": 28.01920509338379, + "learning_rate": 4.533161912736243e-05, + "loss": 0.3651, + "num_input_tokens_seen": 21653024, + "step": 37320 + }, + { + "epoch": 5.559279118260352, + "grad_norm": 82.60037994384766, + "learning_rate": 4.532972813735196e-05, + "loss": 0.2385, + "num_input_tokens_seen": 21656000, + "step": 37325 + }, + { + "epoch": 5.560023830801311, + "grad_norm": 1.0372874736785889, + "learning_rate": 4.532783680389138e-05, + "loss": 0.1521, + "num_input_tokens_seen": 21658848, + "step": 37330 + }, + { + "epoch": 5.56076854334227, + "grad_norm": 41.717288970947266, + "learning_rate": 4.532594512701266e-05, + "loss": 0.2273, + "num_input_tokens_seen": 21661952, + "step": 37335 + }, + { + "epoch": 5.561513255883229, + "grad_norm": 0.2503383159637451, + "learning_rate": 4.532405310674776e-05, + "loss": 0.1607, + "num_input_tokens_seen": 21665248, + "step": 37340 + }, + { + "epoch": 5.562257968424189, + "grad_norm": 2.046886444091797, + "learning_rate": 4.532216074312864e-05, + "loss": 0.2328, + "num_input_tokens_seen": 21668416, + "step": 37345 + }, + { + "epoch": 5.563002680965147, + "grad_norm": 15.375107765197754, + "learning_rate": 4.5320268036187266e-05, + "loss": 0.2869, + "num_input_tokens_seen": 21671104, + "step": 37350 + }, + { + "epoch": 5.563747393506107, + "grad_norm": 7.589817047119141, + "learning_rate": 4.531837498595561e-05, + "loss": 0.1661, + "num_input_tokens_seen": 21674208, + "step": 37355 + }, + { + "epoch": 5.564492106047066, + "grad_norm": 0.03710595890879631, + "learning_rate": 4.531648159246567e-05, + "loss": 0.2643, + "num_input_tokens_seen": 21677440, + "step": 37360 + }, + { + "epoch": 5.5652368185880245, + "grad_norm": 9.095502853393555, + "learning_rate": 4.531458785574941e-05, + "loss": 0.2062, + "num_input_tokens_seen": 21680576, + "step": 37365 + }, + { + "epoch": 5.565981531128984, + "grad_norm": 54.335716247558594, + "learning_rate": 4.531269377583885e-05, + "loss": 0.4314, + "num_input_tokens_seen": 21683392, + "step": 37370 + }, + { + "epoch": 5.566726243669944, + "grad_norm": 1.0432623624801636, + "learning_rate": 4.5310799352765964e-05, + "loss": 0.0808, + "num_input_tokens_seen": 21686208, + "step": 37375 + }, + { + "epoch": 5.567470956210903, + "grad_norm": 0.009246463887393475, + "learning_rate": 4.5308904586562774e-05, + "loss": 0.2637, + "num_input_tokens_seen": 21689184, + "step": 37380 + }, + { + "epoch": 5.568215668751861, + "grad_norm": 14.312500953674316, + "learning_rate": 4.530700947726127e-05, + "loss": 0.4455, + "num_input_tokens_seen": 21691904, + "step": 37385 + }, + { + "epoch": 5.568960381292821, + "grad_norm": 38.510345458984375, + "learning_rate": 4.530511402489349e-05, + "loss": 0.1438, + "num_input_tokens_seen": 21694688, + "step": 37390 + }, + { + "epoch": 5.569705093833781, + "grad_norm": 23.752891540527344, + "learning_rate": 4.530321822949144e-05, + "loss": 0.3152, + "num_input_tokens_seen": 21697216, + "step": 37395 + }, + { + "epoch": 5.570449806374739, + "grad_norm": 26.21273422241211, + "learning_rate": 4.530132209108715e-05, + "loss": 0.063, + "num_input_tokens_seen": 21700000, + "step": 37400 + }, + { + "epoch": 5.571194518915698, + "grad_norm": 2.6917214393615723, + "learning_rate": 4.529942560971266e-05, + "loss": 0.4748, + "num_input_tokens_seen": 21702656, + "step": 37405 + }, + { + "epoch": 5.571939231456658, + "grad_norm": 77.26396179199219, + "learning_rate": 4.529752878540001e-05, + "loss": 0.3356, + "num_input_tokens_seen": 21705664, + "step": 37410 + }, + { + "epoch": 5.572683943997617, + "grad_norm": 72.14231872558594, + "learning_rate": 4.529563161818124e-05, + "loss": 0.242, + "num_input_tokens_seen": 21708384, + "step": 37415 + }, + { + "epoch": 5.573428656538576, + "grad_norm": 19.17220115661621, + "learning_rate": 4.529373410808841e-05, + "loss": 0.2581, + "num_input_tokens_seen": 21711424, + "step": 37420 + }, + { + "epoch": 5.574173369079535, + "grad_norm": 33.02372741699219, + "learning_rate": 4.5291836255153555e-05, + "loss": 0.2249, + "num_input_tokens_seen": 21714336, + "step": 37425 + }, + { + "epoch": 5.574918081620495, + "grad_norm": 26.6669921875, + "learning_rate": 4.528993805940874e-05, + "loss": 0.0644, + "num_input_tokens_seen": 21717184, + "step": 37430 + }, + { + "epoch": 5.575662794161453, + "grad_norm": 4.120089530944824, + "learning_rate": 4.528803952088606e-05, + "loss": 0.2397, + "num_input_tokens_seen": 21720224, + "step": 37435 + }, + { + "epoch": 5.576407506702413, + "grad_norm": 37.70814514160156, + "learning_rate": 4.5286140639617566e-05, + "loss": 0.4351, + "num_input_tokens_seen": 21722912, + "step": 37440 + }, + { + "epoch": 5.577152219243372, + "grad_norm": 2.2565526962280273, + "learning_rate": 4.528424141563535e-05, + "loss": 0.2967, + "num_input_tokens_seen": 21725760, + "step": 37445 + }, + { + "epoch": 5.577896931784331, + "grad_norm": 0.16190201044082642, + "learning_rate": 4.528234184897149e-05, + "loss": 0.301, + "num_input_tokens_seen": 21728704, + "step": 37450 + }, + { + "epoch": 5.57864164432529, + "grad_norm": 62.60167694091797, + "learning_rate": 4.528044193965807e-05, + "loss": 0.3475, + "num_input_tokens_seen": 21731616, + "step": 37455 + }, + { + "epoch": 5.57938635686625, + "grad_norm": 2.0448403358459473, + "learning_rate": 4.527854168772721e-05, + "loss": 0.0521, + "num_input_tokens_seen": 21734560, + "step": 37460 + }, + { + "epoch": 5.580131069407209, + "grad_norm": 16.38983726501465, + "learning_rate": 4.527664109321098e-05, + "loss": 0.2135, + "num_input_tokens_seen": 21737280, + "step": 37465 + }, + { + "epoch": 5.580875781948168, + "grad_norm": 0.4472859799861908, + "learning_rate": 4.5274740156141516e-05, + "loss": 0.3021, + "num_input_tokens_seen": 21740032, + "step": 37470 + }, + { + "epoch": 5.581620494489127, + "grad_norm": 14.566608428955078, + "learning_rate": 4.527283887655093e-05, + "loss": 0.2908, + "num_input_tokens_seen": 21742848, + "step": 37475 + }, + { + "epoch": 5.582365207030087, + "grad_norm": 29.09869384765625, + "learning_rate": 4.5270937254471325e-05, + "loss": 0.2072, + "num_input_tokens_seen": 21745600, + "step": 37480 + }, + { + "epoch": 5.583109919571045, + "grad_norm": 0.1529073715209961, + "learning_rate": 4.526903528993484e-05, + "loss": 0.1844, + "num_input_tokens_seen": 21748416, + "step": 37485 + }, + { + "epoch": 5.583854632112005, + "grad_norm": 16.532203674316406, + "learning_rate": 4.526713298297361e-05, + "loss": 0.8274, + "num_input_tokens_seen": 21751328, + "step": 37490 + }, + { + "epoch": 5.584599344652964, + "grad_norm": 0.03253888711333275, + "learning_rate": 4.526523033361976e-05, + "loss": 0.0809, + "num_input_tokens_seen": 21754144, + "step": 37495 + }, + { + "epoch": 5.5853440571939235, + "grad_norm": 17.857471466064453, + "learning_rate": 4.5263327341905443e-05, + "loss": 0.1222, + "num_input_tokens_seen": 21756704, + "step": 37500 + }, + { + "epoch": 5.586088769734882, + "grad_norm": 36.03583526611328, + "learning_rate": 4.526142400786281e-05, + "loss": 0.2239, + "num_input_tokens_seen": 21759872, + "step": 37505 + }, + { + "epoch": 5.586833482275842, + "grad_norm": 0.75238037109375, + "learning_rate": 4.5259520331524004e-05, + "loss": 0.1526, + "num_input_tokens_seen": 21762656, + "step": 37510 + }, + { + "epoch": 5.587578194816801, + "grad_norm": 121.43663787841797, + "learning_rate": 4.525761631292119e-05, + "loss": 0.3103, + "num_input_tokens_seen": 21765408, + "step": 37515 + }, + { + "epoch": 5.58832290735776, + "grad_norm": 35.8033561706543, + "learning_rate": 4.5255711952086545e-05, + "loss": 0.1206, + "num_input_tokens_seen": 21768064, + "step": 37520 + }, + { + "epoch": 5.589067619898719, + "grad_norm": 0.018506865948438644, + "learning_rate": 4.525380724905224e-05, + "loss": 0.3834, + "num_input_tokens_seen": 21771136, + "step": 37525 + }, + { + "epoch": 5.589812332439678, + "grad_norm": 22.45601463317871, + "learning_rate": 4.525190220385043e-05, + "loss": 0.5415, + "num_input_tokens_seen": 21773920, + "step": 37530 + }, + { + "epoch": 5.590557044980637, + "grad_norm": 0.35952597856521606, + "learning_rate": 4.5249996816513325e-05, + "loss": 0.347, + "num_input_tokens_seen": 21776800, + "step": 37535 + }, + { + "epoch": 5.591301757521597, + "grad_norm": 1.2854293584823608, + "learning_rate": 4.52480910870731e-05, + "loss": 0.1996, + "num_input_tokens_seen": 21780000, + "step": 37540 + }, + { + "epoch": 5.592046470062556, + "grad_norm": 38.09521484375, + "learning_rate": 4.524618501556196e-05, + "loss": 0.2764, + "num_input_tokens_seen": 21782944, + "step": 37545 + }, + { + "epoch": 5.592791182603515, + "grad_norm": 32.31460189819336, + "learning_rate": 4.52442786020121e-05, + "loss": 0.7586, + "num_input_tokens_seen": 21785824, + "step": 37550 + }, + { + "epoch": 5.593535895144474, + "grad_norm": 50.89505386352539, + "learning_rate": 4.524237184645573e-05, + "loss": 0.3061, + "num_input_tokens_seen": 21788672, + "step": 37555 + }, + { + "epoch": 5.594280607685434, + "grad_norm": 4.341821193695068, + "learning_rate": 4.524046474892506e-05, + "loss": 0.317, + "num_input_tokens_seen": 21791552, + "step": 37560 + }, + { + "epoch": 5.595025320226393, + "grad_norm": 8.22696590423584, + "learning_rate": 4.523855730945231e-05, + "loss": 0.341, + "num_input_tokens_seen": 21794624, + "step": 37565 + }, + { + "epoch": 5.595770032767351, + "grad_norm": 0.14759285748004913, + "learning_rate": 4.52366495280697e-05, + "loss": 0.2966, + "num_input_tokens_seen": 21797664, + "step": 37570 + }, + { + "epoch": 5.596514745308311, + "grad_norm": 18.88773536682129, + "learning_rate": 4.523474140480947e-05, + "loss": 0.0863, + "num_input_tokens_seen": 21800576, + "step": 37575 + }, + { + "epoch": 5.59725945784927, + "grad_norm": 16.1450252532959, + "learning_rate": 4.5232832939703846e-05, + "loss": 0.5035, + "num_input_tokens_seen": 21803648, + "step": 37580 + }, + { + "epoch": 5.5980041703902295, + "grad_norm": 9.130575180053711, + "learning_rate": 4.5230924132785066e-05, + "loss": 0.4791, + "num_input_tokens_seen": 21806624, + "step": 37585 + }, + { + "epoch": 5.598748882931188, + "grad_norm": 13.137919425964355, + "learning_rate": 4.52290149840854e-05, + "loss": 0.2009, + "num_input_tokens_seen": 21809728, + "step": 37590 + }, + { + "epoch": 5.599493595472148, + "grad_norm": 8.751379013061523, + "learning_rate": 4.522710549363708e-05, + "loss": 0.2585, + "num_input_tokens_seen": 21812576, + "step": 37595 + }, + { + "epoch": 5.600238308013107, + "grad_norm": 15.90333080291748, + "learning_rate": 4.5225195661472364e-05, + "loss": 0.1462, + "num_input_tokens_seen": 21815552, + "step": 37600 + }, + { + "epoch": 5.600983020554066, + "grad_norm": 15.653210639953613, + "learning_rate": 4.522328548762353e-05, + "loss": 0.2539, + "num_input_tokens_seen": 21818592, + "step": 37605 + }, + { + "epoch": 5.601727733095025, + "grad_norm": 1.5019253492355347, + "learning_rate": 4.5221374972122837e-05, + "loss": 0.0291, + "num_input_tokens_seen": 21821376, + "step": 37610 + }, + { + "epoch": 5.602472445635985, + "grad_norm": 33.52887725830078, + "learning_rate": 4.521946411500257e-05, + "loss": 0.1495, + "num_input_tokens_seen": 21824160, + "step": 37615 + }, + { + "epoch": 5.603217158176943, + "grad_norm": 22.62188148498535, + "learning_rate": 4.5217552916295e-05, + "loss": 0.0575, + "num_input_tokens_seen": 21827072, + "step": 37620 + }, + { + "epoch": 5.603961870717903, + "grad_norm": 0.066196009516716, + "learning_rate": 4.521564137603244e-05, + "loss": 0.0185, + "num_input_tokens_seen": 21829728, + "step": 37625 + }, + { + "epoch": 5.604706583258862, + "grad_norm": 0.08770501613616943, + "learning_rate": 4.521372949424715e-05, + "loss": 0.2927, + "num_input_tokens_seen": 21832416, + "step": 37630 + }, + { + "epoch": 5.6054512957998215, + "grad_norm": 0.05590103194117546, + "learning_rate": 4.521181727097144e-05, + "loss": 0.0233, + "num_input_tokens_seen": 21835072, + "step": 37635 + }, + { + "epoch": 5.60619600834078, + "grad_norm": 17.464387893676758, + "learning_rate": 4.5209904706237626e-05, + "loss": 0.3889, + "num_input_tokens_seen": 21837952, + "step": 37640 + }, + { + "epoch": 5.60694072088174, + "grad_norm": 0.5010012984275818, + "learning_rate": 4.5207991800078015e-05, + "loss": 0.3512, + "num_input_tokens_seen": 21841056, + "step": 37645 + }, + { + "epoch": 5.607685433422699, + "grad_norm": 20.50478744506836, + "learning_rate": 4.520607855252492e-05, + "loss": 0.3756, + "num_input_tokens_seen": 21843936, + "step": 37650 + }, + { + "epoch": 5.608430145963658, + "grad_norm": 2.5104660987854004, + "learning_rate": 4.520416496361066e-05, + "loss": 0.1701, + "num_input_tokens_seen": 21846624, + "step": 37655 + }, + { + "epoch": 5.609174858504617, + "grad_norm": 0.1759195774793625, + "learning_rate": 4.5202251033367574e-05, + "loss": 0.0993, + "num_input_tokens_seen": 21849600, + "step": 37660 + }, + { + "epoch": 5.609919571045577, + "grad_norm": 13.272542953491211, + "learning_rate": 4.5200336761827985e-05, + "loss": 0.2505, + "num_input_tokens_seen": 21852544, + "step": 37665 + }, + { + "epoch": 5.6106642835865355, + "grad_norm": 0.05719417706131935, + "learning_rate": 4.519842214902423e-05, + "loss": 0.288, + "num_input_tokens_seen": 21855456, + "step": 37670 + }, + { + "epoch": 5.611408996127495, + "grad_norm": 44.602230072021484, + "learning_rate": 4.519650719498868e-05, + "loss": 0.1548, + "num_input_tokens_seen": 21858464, + "step": 37675 + }, + { + "epoch": 5.612153708668454, + "grad_norm": 4.5428924560546875, + "learning_rate": 4.519459189975365e-05, + "loss": 0.2606, + "num_input_tokens_seen": 21861536, + "step": 37680 + }, + { + "epoch": 5.6128984212094135, + "grad_norm": 27.360729217529297, + "learning_rate": 4.519267626335153e-05, + "loss": 0.3362, + "num_input_tokens_seen": 21864384, + "step": 37685 + }, + { + "epoch": 5.613643133750372, + "grad_norm": 24.833518981933594, + "learning_rate": 4.519076028581466e-05, + "loss": 0.0941, + "num_input_tokens_seen": 21867232, + "step": 37690 + }, + { + "epoch": 5.614387846291332, + "grad_norm": 33.5071907043457, + "learning_rate": 4.518884396717541e-05, + "loss": 0.2798, + "num_input_tokens_seen": 21870240, + "step": 37695 + }, + { + "epoch": 5.615132558832291, + "grad_norm": 0.02087111584842205, + "learning_rate": 4.518692730746616e-05, + "loss": 0.2691, + "num_input_tokens_seen": 21873184, + "step": 37700 + }, + { + "epoch": 5.61587727137325, + "grad_norm": 0.12416615337133408, + "learning_rate": 4.51850103067193e-05, + "loss": 0.1613, + "num_input_tokens_seen": 21876000, + "step": 37705 + }, + { + "epoch": 5.616621983914209, + "grad_norm": 13.414334297180176, + "learning_rate": 4.5183092964967204e-05, + "loss": 0.5897, + "num_input_tokens_seen": 21879168, + "step": 37710 + }, + { + "epoch": 5.617366696455168, + "grad_norm": 0.08164598047733307, + "learning_rate": 4.518117528224226e-05, + "loss": 0.3496, + "num_input_tokens_seen": 21882080, + "step": 37715 + }, + { + "epoch": 5.6181114089961275, + "grad_norm": 0.7980658411979675, + "learning_rate": 4.517925725857688e-05, + "loss": 0.0972, + "num_input_tokens_seen": 21884800, + "step": 37720 + }, + { + "epoch": 5.618856121537087, + "grad_norm": 0.4165288209915161, + "learning_rate": 4.5177338894003454e-05, + "loss": 0.4169, + "num_input_tokens_seen": 21887904, + "step": 37725 + }, + { + "epoch": 5.619600834078046, + "grad_norm": 24.059965133666992, + "learning_rate": 4.517542018855439e-05, + "loss": 0.251, + "num_input_tokens_seen": 21890752, + "step": 37730 + }, + { + "epoch": 5.620345546619005, + "grad_norm": 0.4921865463256836, + "learning_rate": 4.517350114226211e-05, + "loss": 0.1458, + "num_input_tokens_seen": 21893504, + "step": 37735 + }, + { + "epoch": 5.621090259159964, + "grad_norm": 17.83749008178711, + "learning_rate": 4.517158175515903e-05, + "loss": 0.1582, + "num_input_tokens_seen": 21896416, + "step": 37740 + }, + { + "epoch": 5.621834971700923, + "grad_norm": 0.017372118309140205, + "learning_rate": 4.516966202727758e-05, + "loss": 0.2865, + "num_input_tokens_seen": 21899360, + "step": 37745 + }, + { + "epoch": 5.622579684241883, + "grad_norm": 17.115367889404297, + "learning_rate": 4.516774195865019e-05, + "loss": 0.1685, + "num_input_tokens_seen": 21902304, + "step": 37750 + }, + { + "epoch": 5.6233243967828415, + "grad_norm": 13.08259391784668, + "learning_rate": 4.5165821549309294e-05, + "loss": 0.4661, + "num_input_tokens_seen": 21905216, + "step": 37755 + }, + { + "epoch": 5.624069109323801, + "grad_norm": 1.2549731731414795, + "learning_rate": 4.516390079928734e-05, + "loss": 0.297, + "num_input_tokens_seen": 21908160, + "step": 37760 + }, + { + "epoch": 5.62481382186476, + "grad_norm": 0.09963800013065338, + "learning_rate": 4.516197970861679e-05, + "loss": 0.4293, + "num_input_tokens_seen": 21910912, + "step": 37765 + }, + { + "epoch": 5.6255585344057195, + "grad_norm": 0.591255247592926, + "learning_rate": 4.5160058277330066e-05, + "loss": 0.1287, + "num_input_tokens_seen": 21913696, + "step": 37770 + }, + { + "epoch": 5.626303246946678, + "grad_norm": 27.301206588745117, + "learning_rate": 4.515813650545965e-05, + "loss": 0.2965, + "num_input_tokens_seen": 21916448, + "step": 37775 + }, + { + "epoch": 5.627047959487638, + "grad_norm": 35.87211990356445, + "learning_rate": 4.515621439303801e-05, + "loss": 0.4894, + "num_input_tokens_seen": 21919424, + "step": 37780 + }, + { + "epoch": 5.627792672028597, + "grad_norm": 0.04499051719903946, + "learning_rate": 4.515429194009761e-05, + "loss": 0.2985, + "num_input_tokens_seen": 21922272, + "step": 37785 + }, + { + "epoch": 5.628537384569556, + "grad_norm": 19.45340919494629, + "learning_rate": 4.515236914667094e-05, + "loss": 0.3017, + "num_input_tokens_seen": 21925280, + "step": 37790 + }, + { + "epoch": 5.629282097110515, + "grad_norm": 15.39750862121582, + "learning_rate": 4.515044601279046e-05, + "loss": 0.1587, + "num_input_tokens_seen": 21928192, + "step": 37795 + }, + { + "epoch": 5.630026809651475, + "grad_norm": 0.45566001534461975, + "learning_rate": 4.514852253848868e-05, + "loss": 0.2739, + "num_input_tokens_seen": 21931168, + "step": 37800 + }, + { + "epoch": 5.6307715221924335, + "grad_norm": 30.20734214782715, + "learning_rate": 4.51465987237981e-05, + "loss": 0.3796, + "num_input_tokens_seen": 21933824, + "step": 37805 + }, + { + "epoch": 5.631516234733393, + "grad_norm": 27.956777572631836, + "learning_rate": 4.51446745687512e-05, + "loss": 0.3077, + "num_input_tokens_seen": 21936672, + "step": 37810 + }, + { + "epoch": 5.632260947274352, + "grad_norm": 0.45801565051078796, + "learning_rate": 4.5142750073380505e-05, + "loss": 0.0714, + "num_input_tokens_seen": 21939584, + "step": 37815 + }, + { + "epoch": 5.6330056598153115, + "grad_norm": 8.152937889099121, + "learning_rate": 4.514082523771851e-05, + "loss": 0.1771, + "num_input_tokens_seen": 21942592, + "step": 37820 + }, + { + "epoch": 5.63375037235627, + "grad_norm": 21.522130966186523, + "learning_rate": 4.513890006179775e-05, + "loss": 0.2006, + "num_input_tokens_seen": 21945600, + "step": 37825 + }, + { + "epoch": 5.63449508489723, + "grad_norm": 0.05084017291665077, + "learning_rate": 4.513697454565074e-05, + "loss": 0.1019, + "num_input_tokens_seen": 21948352, + "step": 37830 + }, + { + "epoch": 5.635239797438189, + "grad_norm": 33.933834075927734, + "learning_rate": 4.513504868931001e-05, + "loss": 0.2408, + "num_input_tokens_seen": 21950976, + "step": 37835 + }, + { + "epoch": 5.635984509979148, + "grad_norm": 12.889602661132812, + "learning_rate": 4.51331224928081e-05, + "loss": 0.2984, + "num_input_tokens_seen": 21953728, + "step": 37840 + }, + { + "epoch": 5.636729222520107, + "grad_norm": 43.49184036254883, + "learning_rate": 4.5131195956177546e-05, + "loss": 0.2448, + "num_input_tokens_seen": 21956704, + "step": 37845 + }, + { + "epoch": 5.637473935061067, + "grad_norm": 18.55109214782715, + "learning_rate": 4.5129269079450894e-05, + "loss": 0.4231, + "num_input_tokens_seen": 21959712, + "step": 37850 + }, + { + "epoch": 5.6382186476020255, + "grad_norm": 32.31814193725586, + "learning_rate": 4.512734186266071e-05, + "loss": 0.4558, + "num_input_tokens_seen": 21962688, + "step": 37855 + }, + { + "epoch": 5.638963360142985, + "grad_norm": 8.973493576049805, + "learning_rate": 4.512541430583953e-05, + "loss": 0.381, + "num_input_tokens_seen": 21965632, + "step": 37860 + }, + { + "epoch": 5.639708072683944, + "grad_norm": 26.665483474731445, + "learning_rate": 4.5123486409019936e-05, + "loss": 0.2425, + "num_input_tokens_seen": 21968448, + "step": 37865 + }, + { + "epoch": 5.640452785224904, + "grad_norm": 13.986724853515625, + "learning_rate": 4.5121558172234484e-05, + "loss": 0.232, + "num_input_tokens_seen": 21971392, + "step": 37870 + }, + { + "epoch": 5.641197497765862, + "grad_norm": 10.678166389465332, + "learning_rate": 4.511962959551576e-05, + "loss": 0.1677, + "num_input_tokens_seen": 21974304, + "step": 37875 + }, + { + "epoch": 5.641942210306821, + "grad_norm": 5.325019836425781, + "learning_rate": 4.511770067889635e-05, + "loss": 0.2805, + "num_input_tokens_seen": 21977344, + "step": 37880 + }, + { + "epoch": 5.642686922847781, + "grad_norm": 50.03676986694336, + "learning_rate": 4.5115771422408826e-05, + "loss": 0.1263, + "num_input_tokens_seen": 21979776, + "step": 37885 + }, + { + "epoch": 5.64343163538874, + "grad_norm": 38.069271087646484, + "learning_rate": 4.5113841826085796e-05, + "loss": 0.5027, + "num_input_tokens_seen": 21982432, + "step": 37890 + }, + { + "epoch": 5.644176347929699, + "grad_norm": 53.97829055786133, + "learning_rate": 4.5111911889959846e-05, + "loss": 0.2634, + "num_input_tokens_seen": 21985088, + "step": 37895 + }, + { + "epoch": 5.644921060470658, + "grad_norm": 0.40788158774375916, + "learning_rate": 4.5109981614063584e-05, + "loss": 0.1689, + "num_input_tokens_seen": 21988256, + "step": 37900 + }, + { + "epoch": 5.6456657730116175, + "grad_norm": 37.45314025878906, + "learning_rate": 4.510805099842963e-05, + "loss": 0.2863, + "num_input_tokens_seen": 21991104, + "step": 37905 + }, + { + "epoch": 5.646410485552577, + "grad_norm": 25.662546157836914, + "learning_rate": 4.5106120043090585e-05, + "loss": 0.3953, + "num_input_tokens_seen": 21993696, + "step": 37910 + }, + { + "epoch": 5.647155198093536, + "grad_norm": 7.168451309204102, + "learning_rate": 4.510418874807907e-05, + "loss": 0.1492, + "num_input_tokens_seen": 21996640, + "step": 37915 + }, + { + "epoch": 5.647899910634495, + "grad_norm": 16.412567138671875, + "learning_rate": 4.5102257113427726e-05, + "loss": 0.0716, + "num_input_tokens_seen": 21999360, + "step": 37920 + }, + { + "epoch": 5.648644623175454, + "grad_norm": 0.5743466019630432, + "learning_rate": 4.510032513916919e-05, + "loss": 0.2471, + "num_input_tokens_seen": 22002176, + "step": 37925 + }, + { + "epoch": 5.649389335716413, + "grad_norm": 7.514819145202637, + "learning_rate": 4.509839282533607e-05, + "loss": 0.4034, + "num_input_tokens_seen": 22004800, + "step": 37930 + }, + { + "epoch": 5.650134048257373, + "grad_norm": 49.866634368896484, + "learning_rate": 4.509646017196104e-05, + "loss": 0.6585, + "num_input_tokens_seen": 22007808, + "step": 37935 + }, + { + "epoch": 5.6508787607983315, + "grad_norm": 11.202840805053711, + "learning_rate": 4.509452717907674e-05, + "loss": 0.1971, + "num_input_tokens_seen": 22010784, + "step": 37940 + }, + { + "epoch": 5.651623473339291, + "grad_norm": 77.53392791748047, + "learning_rate": 4.509259384671582e-05, + "loss": 0.572, + "num_input_tokens_seen": 22013696, + "step": 37945 + }, + { + "epoch": 5.65236818588025, + "grad_norm": 9.361599922180176, + "learning_rate": 4.509066017491096e-05, + "loss": 0.1333, + "num_input_tokens_seen": 22016480, + "step": 37950 + }, + { + "epoch": 5.65311289842121, + "grad_norm": 8.670888900756836, + "learning_rate": 4.508872616369481e-05, + "loss": 0.1377, + "num_input_tokens_seen": 22019616, + "step": 37955 + }, + { + "epoch": 5.653857610962168, + "grad_norm": 9.692098617553711, + "learning_rate": 4.508679181310005e-05, + "loss": 0.0886, + "num_input_tokens_seen": 22022592, + "step": 37960 + }, + { + "epoch": 5.654602323503128, + "grad_norm": 5.556670188903809, + "learning_rate": 4.508485712315935e-05, + "loss": 0.1854, + "num_input_tokens_seen": 22025440, + "step": 37965 + }, + { + "epoch": 5.655347036044087, + "grad_norm": 12.16904067993164, + "learning_rate": 4.508292209390541e-05, + "loss": 0.1398, + "num_input_tokens_seen": 22028128, + "step": 37970 + }, + { + "epoch": 5.656091748585046, + "grad_norm": 29.901695251464844, + "learning_rate": 4.5080986725370914e-05, + "loss": 0.2628, + "num_input_tokens_seen": 22031040, + "step": 37975 + }, + { + "epoch": 5.656836461126005, + "grad_norm": 19.236591339111328, + "learning_rate": 4.507905101758855e-05, + "loss": 0.3399, + "num_input_tokens_seen": 22034016, + "step": 37980 + }, + { + "epoch": 5.657581173666965, + "grad_norm": 24.098690032958984, + "learning_rate": 4.507711497059104e-05, + "loss": 0.229, + "num_input_tokens_seen": 22037152, + "step": 37985 + }, + { + "epoch": 5.6583258862079235, + "grad_norm": 14.810079574584961, + "learning_rate": 4.5075178584411064e-05, + "loss": 0.4816, + "num_input_tokens_seen": 22039936, + "step": 37990 + }, + { + "epoch": 5.659070598748883, + "grad_norm": 0.1329181045293808, + "learning_rate": 4.507324185908135e-05, + "loss": 0.1307, + "num_input_tokens_seen": 22043168, + "step": 37995 + }, + { + "epoch": 5.659815311289842, + "grad_norm": 0.10635372996330261, + "learning_rate": 4.507130479463462e-05, + "loss": 0.0844, + "num_input_tokens_seen": 22046080, + "step": 38000 + }, + { + "epoch": 5.660560023830802, + "grad_norm": 7.427611351013184, + "learning_rate": 4.50693673911036e-05, + "loss": 0.2265, + "num_input_tokens_seen": 22048928, + "step": 38005 + }, + { + "epoch": 5.66130473637176, + "grad_norm": 24.359182357788086, + "learning_rate": 4.506742964852101e-05, + "loss": 0.0722, + "num_input_tokens_seen": 22051808, + "step": 38010 + }, + { + "epoch": 5.66204944891272, + "grad_norm": 0.25935566425323486, + "learning_rate": 4.506549156691959e-05, + "loss": 0.1734, + "num_input_tokens_seen": 22054912, + "step": 38015 + }, + { + "epoch": 5.662794161453679, + "grad_norm": 69.66724395751953, + "learning_rate": 4.506355314633209e-05, + "loss": 0.2748, + "num_input_tokens_seen": 22057920, + "step": 38020 + }, + { + "epoch": 5.663538873994638, + "grad_norm": 7.381412982940674, + "learning_rate": 4.506161438679125e-05, + "loss": 0.2674, + "num_input_tokens_seen": 22060896, + "step": 38025 + }, + { + "epoch": 5.664283586535597, + "grad_norm": 1.172131061553955, + "learning_rate": 4.5059675288329815e-05, + "loss": 0.2545, + "num_input_tokens_seen": 22063744, + "step": 38030 + }, + { + "epoch": 5.665028299076557, + "grad_norm": 79.70297241210938, + "learning_rate": 4.5057735850980564e-05, + "loss": 0.2556, + "num_input_tokens_seen": 22066624, + "step": 38035 + }, + { + "epoch": 5.665773011617516, + "grad_norm": 0.5110921263694763, + "learning_rate": 4.5055796074776244e-05, + "loss": 0.2293, + "num_input_tokens_seen": 22069504, + "step": 38040 + }, + { + "epoch": 5.666517724158475, + "grad_norm": 39.514015197753906, + "learning_rate": 4.505385595974964e-05, + "loss": 0.6675, + "num_input_tokens_seen": 22072384, + "step": 38045 + }, + { + "epoch": 5.667262436699434, + "grad_norm": 12.714909553527832, + "learning_rate": 4.505191550593352e-05, + "loss": 0.5685, + "num_input_tokens_seen": 22075264, + "step": 38050 + }, + { + "epoch": 5.668007149240394, + "grad_norm": 11.89258098602295, + "learning_rate": 4.5049974713360665e-05, + "loss": 0.1053, + "num_input_tokens_seen": 22078176, + "step": 38055 + }, + { + "epoch": 5.668751861781352, + "grad_norm": 13.13503646850586, + "learning_rate": 4.504803358206387e-05, + "loss": 0.0978, + "num_input_tokens_seen": 22080800, + "step": 38060 + }, + { + "epoch": 5.669496574322311, + "grad_norm": 0.02680911123752594, + "learning_rate": 4.504609211207591e-05, + "loss": 0.1667, + "num_input_tokens_seen": 22083648, + "step": 38065 + }, + { + "epoch": 5.670241286863271, + "grad_norm": 39.98690414428711, + "learning_rate": 4.504415030342961e-05, + "loss": 0.3749, + "num_input_tokens_seen": 22086592, + "step": 38070 + }, + { + "epoch": 5.67098599940423, + "grad_norm": 11.781185150146484, + "learning_rate": 4.504220815615776e-05, + "loss": 0.1042, + "num_input_tokens_seen": 22089472, + "step": 38075 + }, + { + "epoch": 5.671730711945189, + "grad_norm": 45.0886116027832, + "learning_rate": 4.5040265670293174e-05, + "loss": 0.2343, + "num_input_tokens_seen": 22092576, + "step": 38080 + }, + { + "epoch": 5.672475424486148, + "grad_norm": 23.18766212463379, + "learning_rate": 4.503832284586867e-05, + "loss": 0.3249, + "num_input_tokens_seen": 22095584, + "step": 38085 + }, + { + "epoch": 5.673220137027108, + "grad_norm": 37.88328170776367, + "learning_rate": 4.5036379682917065e-05, + "loss": 0.2106, + "num_input_tokens_seen": 22098400, + "step": 38090 + }, + { + "epoch": 5.673964849568066, + "grad_norm": 58.92962646484375, + "learning_rate": 4.503443618147119e-05, + "loss": 0.1414, + "num_input_tokens_seen": 22100960, + "step": 38095 + }, + { + "epoch": 5.674709562109026, + "grad_norm": 18.950557708740234, + "learning_rate": 4.503249234156387e-05, + "loss": 0.4413, + "num_input_tokens_seen": 22103808, + "step": 38100 + }, + { + "epoch": 5.675454274649985, + "grad_norm": 25.49944305419922, + "learning_rate": 4.503054816322796e-05, + "loss": 0.3544, + "num_input_tokens_seen": 22106624, + "step": 38105 + }, + { + "epoch": 5.676198987190944, + "grad_norm": 44.87239074707031, + "learning_rate": 4.50286036464963e-05, + "loss": 0.5437, + "num_input_tokens_seen": 22109376, + "step": 38110 + }, + { + "epoch": 5.676943699731903, + "grad_norm": 1.0994223356246948, + "learning_rate": 4.502665879140173e-05, + "loss": 0.4255, + "num_input_tokens_seen": 22112128, + "step": 38115 + }, + { + "epoch": 5.677688412272863, + "grad_norm": 15.415022850036621, + "learning_rate": 4.502471359797712e-05, + "loss": 0.0703, + "num_input_tokens_seen": 22115136, + "step": 38120 + }, + { + "epoch": 5.678433124813822, + "grad_norm": 7.238314151763916, + "learning_rate": 4.5022768066255315e-05, + "loss": 0.4465, + "num_input_tokens_seen": 22118144, + "step": 38125 + }, + { + "epoch": 5.679177837354781, + "grad_norm": 5.863860130310059, + "learning_rate": 4.502082219626921e-05, + "loss": 0.1958, + "num_input_tokens_seen": 22121408, + "step": 38130 + }, + { + "epoch": 5.67992254989574, + "grad_norm": 1.0683180093765259, + "learning_rate": 4.501887598805165e-05, + "loss": 0.5005, + "num_input_tokens_seen": 22124512, + "step": 38135 + }, + { + "epoch": 5.6806672624367, + "grad_norm": 0.46206510066986084, + "learning_rate": 4.501692944163553e-05, + "loss": 0.3317, + "num_input_tokens_seen": 22127296, + "step": 38140 + }, + { + "epoch": 5.681411974977658, + "grad_norm": 27.46802520751953, + "learning_rate": 4.501498255705373e-05, + "loss": 0.2936, + "num_input_tokens_seen": 22130432, + "step": 38145 + }, + { + "epoch": 5.682156687518618, + "grad_norm": 13.62325382232666, + "learning_rate": 4.501303533433915e-05, + "loss": 0.1147, + "num_input_tokens_seen": 22133568, + "step": 38150 + }, + { + "epoch": 5.682901400059577, + "grad_norm": 0.5597384572029114, + "learning_rate": 4.501108777352467e-05, + "loss": 0.1006, + "num_input_tokens_seen": 22136672, + "step": 38155 + }, + { + "epoch": 5.683646112600536, + "grad_norm": 5.427772045135498, + "learning_rate": 4.50091398746432e-05, + "loss": 0.1337, + "num_input_tokens_seen": 22139968, + "step": 38160 + }, + { + "epoch": 5.684390825141495, + "grad_norm": 13.357183456420898, + "learning_rate": 4.500719163772765e-05, + "loss": 0.1981, + "num_input_tokens_seen": 22142432, + "step": 38165 + }, + { + "epoch": 5.685135537682455, + "grad_norm": 36.391700744628906, + "learning_rate": 4.5005243062810934e-05, + "loss": 0.0731, + "num_input_tokens_seen": 22145216, + "step": 38170 + }, + { + "epoch": 5.685880250223414, + "grad_norm": 1.1674749851226807, + "learning_rate": 4.500329414992597e-05, + "loss": 0.1239, + "num_input_tokens_seen": 22148320, + "step": 38175 + }, + { + "epoch": 5.686624962764373, + "grad_norm": 50.89235305786133, + "learning_rate": 4.500134489910567e-05, + "loss": 0.172, + "num_input_tokens_seen": 22151008, + "step": 38180 + }, + { + "epoch": 5.687369675305332, + "grad_norm": 0.07287242263555527, + "learning_rate": 4.4999395310382994e-05, + "loss": 0.3456, + "num_input_tokens_seen": 22153632, + "step": 38185 + }, + { + "epoch": 5.688114387846292, + "grad_norm": 4.344675064086914, + "learning_rate": 4.4997445383790846e-05, + "loss": 0.0254, + "num_input_tokens_seen": 22156608, + "step": 38190 + }, + { + "epoch": 5.68885910038725, + "grad_norm": 0.29229578375816345, + "learning_rate": 4.499549511936219e-05, + "loss": 0.2753, + "num_input_tokens_seen": 22159584, + "step": 38195 + }, + { + "epoch": 5.68960381292821, + "grad_norm": 42.74072265625, + "learning_rate": 4.499354451712997e-05, + "loss": 0.8982, + "num_input_tokens_seen": 22162496, + "step": 38200 + }, + { + "epoch": 5.690348525469169, + "grad_norm": 41.72611999511719, + "learning_rate": 4.499159357712713e-05, + "loss": 0.1376, + "num_input_tokens_seen": 22165376, + "step": 38205 + }, + { + "epoch": 5.6910932380101285, + "grad_norm": 0.3028152287006378, + "learning_rate": 4.4989642299386636e-05, + "loss": 0.5744, + "num_input_tokens_seen": 22168160, + "step": 38210 + }, + { + "epoch": 5.691837950551087, + "grad_norm": 2.717952251434326, + "learning_rate": 4.498769068394145e-05, + "loss": 0.4203, + "num_input_tokens_seen": 22171360, + "step": 38215 + }, + { + "epoch": 5.692582663092047, + "grad_norm": 13.471366882324219, + "learning_rate": 4.498573873082454e-05, + "loss": 0.2477, + "num_input_tokens_seen": 22174368, + "step": 38220 + }, + { + "epoch": 5.693327375633006, + "grad_norm": 6.736506938934326, + "learning_rate": 4.4983786440068896e-05, + "loss": 0.0839, + "num_input_tokens_seen": 22177440, + "step": 38225 + }, + { + "epoch": 5.694072088173964, + "grad_norm": 71.4083480834961, + "learning_rate": 4.498183381170749e-05, + "loss": 0.2871, + "num_input_tokens_seen": 22180192, + "step": 38230 + }, + { + "epoch": 5.694816800714924, + "grad_norm": 13.876862525939941, + "learning_rate": 4.497988084577331e-05, + "loss": 0.4407, + "num_input_tokens_seen": 22183264, + "step": 38235 + }, + { + "epoch": 5.695561513255884, + "grad_norm": 17.175918579101562, + "learning_rate": 4.497792754229935e-05, + "loss": 0.2625, + "num_input_tokens_seen": 22186336, + "step": 38240 + }, + { + "epoch": 5.696306225796842, + "grad_norm": 1.3745518922805786, + "learning_rate": 4.49759739013186e-05, + "loss": 0.4201, + "num_input_tokens_seen": 22189184, + "step": 38245 + }, + { + "epoch": 5.697050938337801, + "grad_norm": 71.89653015136719, + "learning_rate": 4.4974019922864086e-05, + "loss": 0.3638, + "num_input_tokens_seen": 22192064, + "step": 38250 + }, + { + "epoch": 5.697795650878761, + "grad_norm": 2.184786796569824, + "learning_rate": 4.497206560696881e-05, + "loss": 0.2372, + "num_input_tokens_seen": 22194880, + "step": 38255 + }, + { + "epoch": 5.6985403634197205, + "grad_norm": 15.396688461303711, + "learning_rate": 4.497011095366577e-05, + "loss": 0.0531, + "num_input_tokens_seen": 22197824, + "step": 38260 + }, + { + "epoch": 5.699285075960679, + "grad_norm": 58.639808654785156, + "learning_rate": 4.4968155962988e-05, + "loss": 0.3016, + "num_input_tokens_seen": 22200640, + "step": 38265 + }, + { + "epoch": 5.700029788501638, + "grad_norm": 58.059791564941406, + "learning_rate": 4.496620063496854e-05, + "loss": 0.2931, + "num_input_tokens_seen": 22203520, + "step": 38270 + }, + { + "epoch": 5.700774501042598, + "grad_norm": 1.7878444194793701, + "learning_rate": 4.496424496964041e-05, + "loss": 0.3628, + "num_input_tokens_seen": 22206368, + "step": 38275 + }, + { + "epoch": 5.701519213583556, + "grad_norm": 0.7663055062294006, + "learning_rate": 4.496228896703665e-05, + "loss": 0.1614, + "num_input_tokens_seen": 22209088, + "step": 38280 + }, + { + "epoch": 5.702263926124516, + "grad_norm": 7.016773700714111, + "learning_rate": 4.496033262719031e-05, + "loss": 0.2233, + "num_input_tokens_seen": 22211776, + "step": 38285 + }, + { + "epoch": 5.703008638665475, + "grad_norm": 11.527607917785645, + "learning_rate": 4.495837595013443e-05, + "loss": 0.1376, + "num_input_tokens_seen": 22214752, + "step": 38290 + }, + { + "epoch": 5.7037533512064345, + "grad_norm": 13.889142036437988, + "learning_rate": 4.495641893590209e-05, + "loss": 0.273, + "num_input_tokens_seen": 22217728, + "step": 38295 + }, + { + "epoch": 5.704498063747393, + "grad_norm": 0.6253208518028259, + "learning_rate": 4.495446158452632e-05, + "loss": 0.0421, + "num_input_tokens_seen": 22220384, + "step": 38300 + }, + { + "epoch": 5.705242776288353, + "grad_norm": 30.714107513427734, + "learning_rate": 4.49525038960402e-05, + "loss": 0.4648, + "num_input_tokens_seen": 22223488, + "step": 38305 + }, + { + "epoch": 5.705987488829312, + "grad_norm": 14.568570137023926, + "learning_rate": 4.495054587047682e-05, + "loss": 0.2844, + "num_input_tokens_seen": 22226240, + "step": 38310 + }, + { + "epoch": 5.706732201370271, + "grad_norm": 19.3922061920166, + "learning_rate": 4.4948587507869235e-05, + "loss": 0.1725, + "num_input_tokens_seen": 22228736, + "step": 38315 + }, + { + "epoch": 5.70747691391123, + "grad_norm": 17.061811447143555, + "learning_rate": 4.494662880825053e-05, + "loss": 0.5704, + "num_input_tokens_seen": 22231680, + "step": 38320 + }, + { + "epoch": 5.70822162645219, + "grad_norm": 7.151399612426758, + "learning_rate": 4.494466977165382e-05, + "loss": 0.1178, + "num_input_tokens_seen": 22234656, + "step": 38325 + }, + { + "epoch": 5.708966338993148, + "grad_norm": 5.402536869049072, + "learning_rate": 4.494271039811217e-05, + "loss": 0.5158, + "num_input_tokens_seen": 22237280, + "step": 38330 + }, + { + "epoch": 5.709711051534108, + "grad_norm": 9.636798858642578, + "learning_rate": 4.4940750687658716e-05, + "loss": 0.397, + "num_input_tokens_seen": 22240480, + "step": 38335 + }, + { + "epoch": 5.710455764075067, + "grad_norm": 4.662420272827148, + "learning_rate": 4.4938790640326534e-05, + "loss": 0.078, + "num_input_tokens_seen": 22243488, + "step": 38340 + }, + { + "epoch": 5.7112004766160265, + "grad_norm": 39.23722457885742, + "learning_rate": 4.4936830256148755e-05, + "loss": 0.3995, + "num_input_tokens_seen": 22246464, + "step": 38345 + }, + { + "epoch": 5.711945189156985, + "grad_norm": 12.890584945678711, + "learning_rate": 4.493486953515848e-05, + "loss": 0.2277, + "num_input_tokens_seen": 22249408, + "step": 38350 + }, + { + "epoch": 5.712689901697945, + "grad_norm": 10.957197189331055, + "learning_rate": 4.493290847738886e-05, + "loss": 0.272, + "num_input_tokens_seen": 22252320, + "step": 38355 + }, + { + "epoch": 5.713434614238904, + "grad_norm": 13.192054748535156, + "learning_rate": 4.4930947082873e-05, + "loss": 0.4393, + "num_input_tokens_seen": 22255264, + "step": 38360 + }, + { + "epoch": 5.714179326779863, + "grad_norm": 6.844858169555664, + "learning_rate": 4.492898535164405e-05, + "loss": 0.1445, + "num_input_tokens_seen": 22258304, + "step": 38365 + }, + { + "epoch": 5.714924039320822, + "grad_norm": 38.633819580078125, + "learning_rate": 4.492702328373515e-05, + "loss": 0.1299, + "num_input_tokens_seen": 22261120, + "step": 38370 + }, + { + "epoch": 5.715668751861782, + "grad_norm": 7.422626495361328, + "learning_rate": 4.492506087917944e-05, + "loss": 0.2024, + "num_input_tokens_seen": 22263936, + "step": 38375 + }, + { + "epoch": 5.7164134644027405, + "grad_norm": 14.002684593200684, + "learning_rate": 4.4923098138010064e-05, + "loss": 0.168, + "num_input_tokens_seen": 22266528, + "step": 38380 + }, + { + "epoch": 5.7171581769437, + "grad_norm": 0.018098972737789154, + "learning_rate": 4.492113506026021e-05, + "loss": 0.3615, + "num_input_tokens_seen": 22269120, + "step": 38385 + }, + { + "epoch": 5.717902889484659, + "grad_norm": 20.815519332885742, + "learning_rate": 4.491917164596303e-05, + "loss": 0.3908, + "num_input_tokens_seen": 22271840, + "step": 38390 + }, + { + "epoch": 5.718647602025618, + "grad_norm": 1.990041732788086, + "learning_rate": 4.491720789515168e-05, + "loss": 0.0638, + "num_input_tokens_seen": 22274656, + "step": 38395 + }, + { + "epoch": 5.719392314566577, + "grad_norm": 11.581429481506348, + "learning_rate": 4.491524380785935e-05, + "loss": 0.4895, + "num_input_tokens_seen": 22277344, + "step": 38400 + }, + { + "epoch": 5.720137027107537, + "grad_norm": 4.374129772186279, + "learning_rate": 4.4913279384119214e-05, + "loss": 0.3632, + "num_input_tokens_seen": 22280160, + "step": 38405 + }, + { + "epoch": 5.720881739648496, + "grad_norm": 14.173905372619629, + "learning_rate": 4.4911314623964466e-05, + "loss": 0.2342, + "num_input_tokens_seen": 22283392, + "step": 38410 + }, + { + "epoch": 5.721626452189454, + "grad_norm": 0.485921174287796, + "learning_rate": 4.490934952742829e-05, + "loss": 0.0387, + "num_input_tokens_seen": 22286080, + "step": 38415 + }, + { + "epoch": 5.722371164730414, + "grad_norm": 11.592700958251953, + "learning_rate": 4.490738409454389e-05, + "loss": 0.1649, + "num_input_tokens_seen": 22288896, + "step": 38420 + }, + { + "epoch": 5.723115877271374, + "grad_norm": 10.772408485412598, + "learning_rate": 4.4905418325344475e-05, + "loss": 0.0845, + "num_input_tokens_seen": 22291424, + "step": 38425 + }, + { + "epoch": 5.7238605898123325, + "grad_norm": 20.979206085205078, + "learning_rate": 4.490345221986324e-05, + "loss": 0.1501, + "num_input_tokens_seen": 22294336, + "step": 38430 + }, + { + "epoch": 5.724605302353291, + "grad_norm": 1.5187758207321167, + "learning_rate": 4.490148577813341e-05, + "loss": 0.6277, + "num_input_tokens_seen": 22297376, + "step": 38435 + }, + { + "epoch": 5.725350014894251, + "grad_norm": 18.87796974182129, + "learning_rate": 4.489951900018821e-05, + "loss": 0.3662, + "num_input_tokens_seen": 22300096, + "step": 38440 + }, + { + "epoch": 5.72609472743521, + "grad_norm": 12.2527437210083, + "learning_rate": 4.4897551886060866e-05, + "loss": 0.1449, + "num_input_tokens_seen": 22302976, + "step": 38445 + }, + { + "epoch": 5.726839439976169, + "grad_norm": 14.023847579956055, + "learning_rate": 4.489558443578459e-05, + "loss": 0.5305, + "num_input_tokens_seen": 22306080, + "step": 38450 + }, + { + "epoch": 5.727584152517128, + "grad_norm": 35.2096061706543, + "learning_rate": 4.4893616649392646e-05, + "loss": 0.3154, + "num_input_tokens_seen": 22308832, + "step": 38455 + }, + { + "epoch": 5.728328865058088, + "grad_norm": 13.354122161865234, + "learning_rate": 4.4891648526918265e-05, + "loss": 0.2976, + "num_input_tokens_seen": 22311936, + "step": 38460 + }, + { + "epoch": 5.7290735775990465, + "grad_norm": 8.155318260192871, + "learning_rate": 4.48896800683947e-05, + "loss": 0.0606, + "num_input_tokens_seen": 22314560, + "step": 38465 + }, + { + "epoch": 5.729818290140006, + "grad_norm": 13.538520812988281, + "learning_rate": 4.48877112738552e-05, + "loss": 0.0811, + "num_input_tokens_seen": 22317536, + "step": 38470 + }, + { + "epoch": 5.730563002680965, + "grad_norm": 33.69210433959961, + "learning_rate": 4.488574214333304e-05, + "loss": 0.2986, + "num_input_tokens_seen": 22320608, + "step": 38475 + }, + { + "epoch": 5.7313077152219245, + "grad_norm": 0.9211509823799133, + "learning_rate": 4.488377267686147e-05, + "loss": 0.0985, + "num_input_tokens_seen": 22323456, + "step": 38480 + }, + { + "epoch": 5.732052427762883, + "grad_norm": 31.186992645263672, + "learning_rate": 4.488180287447378e-05, + "loss": 0.3966, + "num_input_tokens_seen": 22326304, + "step": 38485 + }, + { + "epoch": 5.732797140303843, + "grad_norm": 20.896495819091797, + "learning_rate": 4.4879832736203224e-05, + "loss": 0.1029, + "num_input_tokens_seen": 22329216, + "step": 38490 + }, + { + "epoch": 5.733541852844802, + "grad_norm": 21.322492599487305, + "learning_rate": 4.48778622620831e-05, + "loss": 0.4652, + "num_input_tokens_seen": 22331936, + "step": 38495 + }, + { + "epoch": 5.734286565385761, + "grad_norm": 14.600010871887207, + "learning_rate": 4.487589145214671e-05, + "loss": 0.2381, + "num_input_tokens_seen": 22334656, + "step": 38500 + }, + { + "epoch": 5.73503127792672, + "grad_norm": 32.375091552734375, + "learning_rate": 4.487392030642733e-05, + "loss": 0.2576, + "num_input_tokens_seen": 22337504, + "step": 38505 + }, + { + "epoch": 5.73577599046768, + "grad_norm": 42.10364532470703, + "learning_rate": 4.487194882495826e-05, + "loss": 0.4152, + "num_input_tokens_seen": 22340224, + "step": 38510 + }, + { + "epoch": 5.7365207030086385, + "grad_norm": 24.469552993774414, + "learning_rate": 4.486997700777281e-05, + "loss": 0.1489, + "num_input_tokens_seen": 22343360, + "step": 38515 + }, + { + "epoch": 5.737265415549598, + "grad_norm": 22.774560928344727, + "learning_rate": 4.486800485490429e-05, + "loss": 0.1328, + "num_input_tokens_seen": 22346016, + "step": 38520 + }, + { + "epoch": 5.738010128090557, + "grad_norm": 0.5941494703292847, + "learning_rate": 4.4866032366386034e-05, + "loss": 0.1676, + "num_input_tokens_seen": 22348928, + "step": 38525 + }, + { + "epoch": 5.7387548406315165, + "grad_norm": 50.90377426147461, + "learning_rate": 4.486405954225135e-05, + "loss": 0.484, + "num_input_tokens_seen": 22351712, + "step": 38530 + }, + { + "epoch": 5.739499553172475, + "grad_norm": 44.91062545776367, + "learning_rate": 4.486208638253356e-05, + "loss": 0.2635, + "num_input_tokens_seen": 22354528, + "step": 38535 + }, + { + "epoch": 5.740244265713435, + "grad_norm": 14.927227973937988, + "learning_rate": 4.486011288726601e-05, + "loss": 0.3473, + "num_input_tokens_seen": 22357376, + "step": 38540 + }, + { + "epoch": 5.740988978254394, + "grad_norm": 26.064809799194336, + "learning_rate": 4.485813905648204e-05, + "loss": 0.1479, + "num_input_tokens_seen": 22360192, + "step": 38545 + }, + { + "epoch": 5.741733690795353, + "grad_norm": 11.261032104492188, + "learning_rate": 4.485616489021499e-05, + "loss": 0.139, + "num_input_tokens_seen": 22363296, + "step": 38550 + }, + { + "epoch": 5.742478403336312, + "grad_norm": 11.770825386047363, + "learning_rate": 4.485419038849822e-05, + "loss": 0.0349, + "num_input_tokens_seen": 22366176, + "step": 38555 + }, + { + "epoch": 5.743223115877272, + "grad_norm": 23.02213478088379, + "learning_rate": 4.485221555136508e-05, + "loss": 0.3174, + "num_input_tokens_seen": 22369088, + "step": 38560 + }, + { + "epoch": 5.7439678284182305, + "grad_norm": 2.1775476932525635, + "learning_rate": 4.485024037884894e-05, + "loss": 0.1444, + "num_input_tokens_seen": 22371840, + "step": 38565 + }, + { + "epoch": 5.74471254095919, + "grad_norm": 3.0196361541748047, + "learning_rate": 4.484826487098316e-05, + "loss": 0.0653, + "num_input_tokens_seen": 22374656, + "step": 38570 + }, + { + "epoch": 5.745457253500149, + "grad_norm": 2.288262367248535, + "learning_rate": 4.484628902780111e-05, + "loss": 0.0131, + "num_input_tokens_seen": 22377504, + "step": 38575 + }, + { + "epoch": 5.746201966041108, + "grad_norm": 32.690181732177734, + "learning_rate": 4.484431284933619e-05, + "loss": 0.3417, + "num_input_tokens_seen": 22380736, + "step": 38580 + }, + { + "epoch": 5.746946678582067, + "grad_norm": 19.939701080322266, + "learning_rate": 4.484233633562176e-05, + "loss": 0.3181, + "num_input_tokens_seen": 22383584, + "step": 38585 + }, + { + "epoch": 5.747691391123027, + "grad_norm": 0.017066258937120438, + "learning_rate": 4.484035948669124e-05, + "loss": 0.6005, + "num_input_tokens_seen": 22386688, + "step": 38590 + }, + { + "epoch": 5.748436103663986, + "grad_norm": 40.321895599365234, + "learning_rate": 4.4838382302577995e-05, + "loss": 0.5923, + "num_input_tokens_seen": 22389728, + "step": 38595 + }, + { + "epoch": 5.7491808162049445, + "grad_norm": 35.04484939575195, + "learning_rate": 4.483640478331546e-05, + "loss": 0.7273, + "num_input_tokens_seen": 22393312, + "step": 38600 + }, + { + "epoch": 5.749925528745904, + "grad_norm": 16.157882690429688, + "learning_rate": 4.483442692893702e-05, + "loss": 0.487, + "num_input_tokens_seen": 22396224, + "step": 38605 + }, + { + "epoch": 5.750670241286863, + "grad_norm": 1.2478773593902588, + "learning_rate": 4.483244873947609e-05, + "loss": 0.3765, + "num_input_tokens_seen": 22399232, + "step": 38610 + }, + { + "epoch": 5.7514149538278225, + "grad_norm": 15.005425453186035, + "learning_rate": 4.48304702149661e-05, + "loss": 0.2047, + "num_input_tokens_seen": 22402144, + "step": 38615 + }, + { + "epoch": 5.752159666368781, + "grad_norm": 0.23880793154239655, + "learning_rate": 4.482849135544048e-05, + "loss": 0.4107, + "num_input_tokens_seen": 22405120, + "step": 38620 + }, + { + "epoch": 5.752904378909741, + "grad_norm": 5.288626194000244, + "learning_rate": 4.4826512160932636e-05, + "loss": 0.2848, + "num_input_tokens_seen": 22408000, + "step": 38625 + }, + { + "epoch": 5.7536490914507, + "grad_norm": 0.13744644820690155, + "learning_rate": 4.482453263147603e-05, + "loss": 0.2088, + "num_input_tokens_seen": 22410976, + "step": 38630 + }, + { + "epoch": 5.754393803991659, + "grad_norm": 6.100863933563232, + "learning_rate": 4.4822552767104095e-05, + "loss": 0.1035, + "num_input_tokens_seen": 22413856, + "step": 38635 + }, + { + "epoch": 5.755138516532618, + "grad_norm": 35.3502311706543, + "learning_rate": 4.482057256785027e-05, + "loss": 0.1411, + "num_input_tokens_seen": 22416704, + "step": 38640 + }, + { + "epoch": 5.755883229073578, + "grad_norm": 51.222373962402344, + "learning_rate": 4.481859203374802e-05, + "loss": 0.2116, + "num_input_tokens_seen": 22419552, + "step": 38645 + }, + { + "epoch": 5.7566279416145365, + "grad_norm": 52.50886917114258, + "learning_rate": 4.48166111648308e-05, + "loss": 0.4807, + "num_input_tokens_seen": 22422272, + "step": 38650 + }, + { + "epoch": 5.757372654155496, + "grad_norm": 15.169358253479004, + "learning_rate": 4.481462996113207e-05, + "loss": 0.3243, + "num_input_tokens_seen": 22425120, + "step": 38655 + }, + { + "epoch": 5.758117366696455, + "grad_norm": 0.07218876481056213, + "learning_rate": 4.481264842268531e-05, + "loss": 0.0679, + "num_input_tokens_seen": 22427712, + "step": 38660 + }, + { + "epoch": 5.7588620792374146, + "grad_norm": 44.20112228393555, + "learning_rate": 4.4810666549523997e-05, + "loss": 0.2884, + "num_input_tokens_seen": 22430592, + "step": 38665 + }, + { + "epoch": 5.759606791778373, + "grad_norm": 200.26858520507812, + "learning_rate": 4.48086843416816e-05, + "loss": 0.4466, + "num_input_tokens_seen": 22433568, + "step": 38670 + }, + { + "epoch": 5.760351504319333, + "grad_norm": 16.708799362182617, + "learning_rate": 4.480670179919162e-05, + "loss": 0.2463, + "num_input_tokens_seen": 22436704, + "step": 38675 + }, + { + "epoch": 5.761096216860292, + "grad_norm": 17.992361068725586, + "learning_rate": 4.480471892208754e-05, + "loss": 0.2543, + "num_input_tokens_seen": 22439776, + "step": 38680 + }, + { + "epoch": 5.761840929401251, + "grad_norm": 19.928333282470703, + "learning_rate": 4.480273571040285e-05, + "loss": 0.1728, + "num_input_tokens_seen": 22442848, + "step": 38685 + }, + { + "epoch": 5.76258564194221, + "grad_norm": 62.31614685058594, + "learning_rate": 4.480075216417109e-05, + "loss": 0.2351, + "num_input_tokens_seen": 22445440, + "step": 38690 + }, + { + "epoch": 5.76333035448317, + "grad_norm": 8.86588191986084, + "learning_rate": 4.479876828342573e-05, + "loss": 0.1748, + "num_input_tokens_seen": 22448032, + "step": 38695 + }, + { + "epoch": 5.7640750670241285, + "grad_norm": 0.06703397631645203, + "learning_rate": 4.479678406820031e-05, + "loss": 0.1518, + "num_input_tokens_seen": 22451040, + "step": 38700 + }, + { + "epoch": 5.764819779565088, + "grad_norm": 0.9595566987991333, + "learning_rate": 4.479479951852834e-05, + "loss": 0.4007, + "num_input_tokens_seen": 22454304, + "step": 38705 + }, + { + "epoch": 5.765564492106047, + "grad_norm": 0.3482508063316345, + "learning_rate": 4.479281463444335e-05, + "loss": 0.5034, + "num_input_tokens_seen": 22457408, + "step": 38710 + }, + { + "epoch": 5.766309204647007, + "grad_norm": 58.652252197265625, + "learning_rate": 4.479082941597888e-05, + "loss": 0.6759, + "num_input_tokens_seen": 22460192, + "step": 38715 + }, + { + "epoch": 5.767053917187965, + "grad_norm": 1.2685741186141968, + "learning_rate": 4.4788843863168455e-05, + "loss": 0.4411, + "num_input_tokens_seen": 22463456, + "step": 38720 + }, + { + "epoch": 5.767798629728925, + "grad_norm": 21.734025955200195, + "learning_rate": 4.4786857976045625e-05, + "loss": 0.2802, + "num_input_tokens_seen": 22466400, + "step": 38725 + }, + { + "epoch": 5.768543342269884, + "grad_norm": 16.175884246826172, + "learning_rate": 4.4784871754643946e-05, + "loss": 0.3472, + "num_input_tokens_seen": 22469120, + "step": 38730 + }, + { + "epoch": 5.769288054810843, + "grad_norm": 0.6951125264167786, + "learning_rate": 4.478288519899697e-05, + "loss": 0.1694, + "num_input_tokens_seen": 22472064, + "step": 38735 + }, + { + "epoch": 5.770032767351802, + "grad_norm": 0.6499648690223694, + "learning_rate": 4.4780898309138245e-05, + "loss": 0.2933, + "num_input_tokens_seen": 22474976, + "step": 38740 + }, + { + "epoch": 5.770777479892761, + "grad_norm": 15.600723266601562, + "learning_rate": 4.477891108510135e-05, + "loss": 0.2927, + "num_input_tokens_seen": 22478016, + "step": 38745 + }, + { + "epoch": 5.7715221924337206, + "grad_norm": 29.415027618408203, + "learning_rate": 4.4776923526919855e-05, + "loss": 0.4846, + "num_input_tokens_seen": 22480832, + "step": 38750 + }, + { + "epoch": 5.77226690497468, + "grad_norm": 6.30090856552124, + "learning_rate": 4.477493563462733e-05, + "loss": 0.3932, + "num_input_tokens_seen": 22483552, + "step": 38755 + }, + { + "epoch": 5.773011617515639, + "grad_norm": 2.6977550983428955, + "learning_rate": 4.477294740825738e-05, + "loss": 0.0732, + "num_input_tokens_seen": 22486176, + "step": 38760 + }, + { + "epoch": 5.773756330056598, + "grad_norm": 11.246842384338379, + "learning_rate": 4.477095884784358e-05, + "loss": 0.5867, + "num_input_tokens_seen": 22489216, + "step": 38765 + }, + { + "epoch": 5.774501042597557, + "grad_norm": 44.280921936035156, + "learning_rate": 4.476896995341951e-05, + "loss": 0.3338, + "num_input_tokens_seen": 22492064, + "step": 38770 + }, + { + "epoch": 5.775245755138517, + "grad_norm": 19.01093101501465, + "learning_rate": 4.47669807250188e-05, + "loss": 0.3856, + "num_input_tokens_seen": 22494976, + "step": 38775 + }, + { + "epoch": 5.775990467679476, + "grad_norm": 6.528285980224609, + "learning_rate": 4.476499116267503e-05, + "loss": 0.3257, + "num_input_tokens_seen": 22498144, + "step": 38780 + }, + { + "epoch": 5.7767351802204345, + "grad_norm": 1.5592976808547974, + "learning_rate": 4.476300126642183e-05, + "loss": 0.6188, + "num_input_tokens_seen": 22501088, + "step": 38785 + }, + { + "epoch": 5.777479892761394, + "grad_norm": 0.4068637788295746, + "learning_rate": 4.4761011036292804e-05, + "loss": 0.3342, + "num_input_tokens_seen": 22504000, + "step": 38790 + }, + { + "epoch": 5.778224605302353, + "grad_norm": 9.194497108459473, + "learning_rate": 4.475902047232159e-05, + "loss": 0.3035, + "num_input_tokens_seen": 22507008, + "step": 38795 + }, + { + "epoch": 5.778969317843313, + "grad_norm": 0.44299378991127014, + "learning_rate": 4.4757029574541795e-05, + "loss": 0.2768, + "num_input_tokens_seen": 22509984, + "step": 38800 + }, + { + "epoch": 5.779714030384271, + "grad_norm": 24.781652450561523, + "learning_rate": 4.475503834298707e-05, + "loss": 0.4028, + "num_input_tokens_seen": 22512832, + "step": 38805 + }, + { + "epoch": 5.780458742925231, + "grad_norm": 54.32681655883789, + "learning_rate": 4.475304677769105e-05, + "loss": 0.1987, + "num_input_tokens_seen": 22515648, + "step": 38810 + }, + { + "epoch": 5.78120345546619, + "grad_norm": 31.48829460144043, + "learning_rate": 4.475105487868739e-05, + "loss": 0.3063, + "num_input_tokens_seen": 22518528, + "step": 38815 + }, + { + "epoch": 5.781948168007149, + "grad_norm": 0.10307319462299347, + "learning_rate": 4.474906264600972e-05, + "loss": 0.0146, + "num_input_tokens_seen": 22521408, + "step": 38820 + }, + { + "epoch": 5.782692880548108, + "grad_norm": 0.02489597722887993, + "learning_rate": 4.474707007969171e-05, + "loss": 0.1712, + "num_input_tokens_seen": 22524192, + "step": 38825 + }, + { + "epoch": 5.783437593089068, + "grad_norm": 26.7788028717041, + "learning_rate": 4.4745077179767026e-05, + "loss": 0.0553, + "num_input_tokens_seen": 22527072, + "step": 38830 + }, + { + "epoch": 5.7841823056300266, + "grad_norm": 5.6220622062683105, + "learning_rate": 4.4743083946269324e-05, + "loss": 0.1417, + "num_input_tokens_seen": 22530368, + "step": 38835 + }, + { + "epoch": 5.784927018170986, + "grad_norm": 0.07918135821819305, + "learning_rate": 4.47410903792323e-05, + "loss": 0.0642, + "num_input_tokens_seen": 22533408, + "step": 38840 + }, + { + "epoch": 5.785671730711945, + "grad_norm": 22.26625633239746, + "learning_rate": 4.47390964786896e-05, + "loss": 0.2958, + "num_input_tokens_seen": 22536544, + "step": 38845 + }, + { + "epoch": 5.786416443252905, + "grad_norm": 22.758668899536133, + "learning_rate": 4.4737102244674934e-05, + "loss": 0.2539, + "num_input_tokens_seen": 22539520, + "step": 38850 + }, + { + "epoch": 5.787161155793863, + "grad_norm": 39.45740509033203, + "learning_rate": 4.473510767722199e-05, + "loss": 0.2918, + "num_input_tokens_seen": 22542528, + "step": 38855 + }, + { + "epoch": 5.787905868334823, + "grad_norm": 29.3955135345459, + "learning_rate": 4.473311277636445e-05, + "loss": 0.3322, + "num_input_tokens_seen": 22545504, + "step": 38860 + }, + { + "epoch": 5.788650580875782, + "grad_norm": 85.01094818115234, + "learning_rate": 4.4731117542136034e-05, + "loss": 0.4997, + "num_input_tokens_seen": 22548288, + "step": 38865 + }, + { + "epoch": 5.789395293416741, + "grad_norm": 2.308046579360962, + "learning_rate": 4.472912197457044e-05, + "loss": 0.1708, + "num_input_tokens_seen": 22551296, + "step": 38870 + }, + { + "epoch": 5.7901400059577, + "grad_norm": 12.75163459777832, + "learning_rate": 4.472712607370137e-05, + "loss": 0.1115, + "num_input_tokens_seen": 22554336, + "step": 38875 + }, + { + "epoch": 5.79088471849866, + "grad_norm": 10.312495231628418, + "learning_rate": 4.472512983956257e-05, + "loss": 0.2149, + "num_input_tokens_seen": 22557120, + "step": 38880 + }, + { + "epoch": 5.791629431039619, + "grad_norm": 0.0960061177611351, + "learning_rate": 4.4723133272187745e-05, + "loss": 0.2315, + "num_input_tokens_seen": 22560192, + "step": 38885 + }, + { + "epoch": 5.792374143580578, + "grad_norm": 0.1815936267375946, + "learning_rate": 4.4721136371610626e-05, + "loss": 0.4432, + "num_input_tokens_seen": 22563168, + "step": 38890 + }, + { + "epoch": 5.793118856121537, + "grad_norm": 4.110618591308594, + "learning_rate": 4.4719139137864956e-05, + "loss": 0.2263, + "num_input_tokens_seen": 22565888, + "step": 38895 + }, + { + "epoch": 5.793863568662497, + "grad_norm": 0.08209057152271271, + "learning_rate": 4.4717141570984474e-05, + "loss": 0.0661, + "num_input_tokens_seen": 22568864, + "step": 38900 + }, + { + "epoch": 5.794608281203455, + "grad_norm": 36.864749908447266, + "learning_rate": 4.471514367100292e-05, + "loss": 0.0345, + "num_input_tokens_seen": 22571808, + "step": 38905 + }, + { + "epoch": 5.795352993744415, + "grad_norm": 25.497589111328125, + "learning_rate": 4.471314543795405e-05, + "loss": 0.3107, + "num_input_tokens_seen": 22575008, + "step": 38910 + }, + { + "epoch": 5.796097706285374, + "grad_norm": 45.97441482543945, + "learning_rate": 4.4711146871871625e-05, + "loss": 0.3196, + "num_input_tokens_seen": 22577984, + "step": 38915 + }, + { + "epoch": 5.796842418826333, + "grad_norm": 30.966222763061523, + "learning_rate": 4.4709147972789405e-05, + "loss": 0.2561, + "num_input_tokens_seen": 22580992, + "step": 38920 + }, + { + "epoch": 5.797587131367292, + "grad_norm": 44.678489685058594, + "learning_rate": 4.470714874074117e-05, + "loss": 0.2944, + "num_input_tokens_seen": 22584000, + "step": 38925 + }, + { + "epoch": 5.798331843908251, + "grad_norm": 0.8719643950462341, + "learning_rate": 4.470514917576067e-05, + "loss": 0.1418, + "num_input_tokens_seen": 22586816, + "step": 38930 + }, + { + "epoch": 5.799076556449211, + "grad_norm": 40.14447021484375, + "learning_rate": 4.470314927788172e-05, + "loss": 0.2301, + "num_input_tokens_seen": 22589440, + "step": 38935 + }, + { + "epoch": 5.79982126899017, + "grad_norm": 0.63299560546875, + "learning_rate": 4.470114904713808e-05, + "loss": 0.5442, + "num_input_tokens_seen": 22592352, + "step": 38940 + }, + { + "epoch": 5.800565981531129, + "grad_norm": 12.839729309082031, + "learning_rate": 4.4699148483563546e-05, + "loss": 0.3961, + "num_input_tokens_seen": 22595456, + "step": 38945 + }, + { + "epoch": 5.801310694072088, + "grad_norm": 3.459721565246582, + "learning_rate": 4.469714758719192e-05, + "loss": 0.0325, + "num_input_tokens_seen": 22598560, + "step": 38950 + }, + { + "epoch": 5.802055406613047, + "grad_norm": 22.682273864746094, + "learning_rate": 4.469514635805702e-05, + "loss": 0.4656, + "num_input_tokens_seen": 22601632, + "step": 38955 + }, + { + "epoch": 5.802800119154006, + "grad_norm": 2.2408368587493896, + "learning_rate": 4.469314479619262e-05, + "loss": 0.1512, + "num_input_tokens_seen": 22604448, + "step": 38960 + }, + { + "epoch": 5.803544831694966, + "grad_norm": 14.726214408874512, + "learning_rate": 4.469114290163257e-05, + "loss": 0.2772, + "num_input_tokens_seen": 22607520, + "step": 38965 + }, + { + "epoch": 5.804289544235925, + "grad_norm": 0.46030035614967346, + "learning_rate": 4.468914067441066e-05, + "loss": 0.01, + "num_input_tokens_seen": 22610528, + "step": 38970 + }, + { + "epoch": 5.805034256776884, + "grad_norm": 0.2430112659931183, + "learning_rate": 4.468713811456074e-05, + "loss": 0.1712, + "num_input_tokens_seen": 22613344, + "step": 38975 + }, + { + "epoch": 5.805778969317843, + "grad_norm": 31.634910583496094, + "learning_rate": 4.468513522211662e-05, + "loss": 0.3964, + "num_input_tokens_seen": 22616192, + "step": 38980 + }, + { + "epoch": 5.806523681858803, + "grad_norm": 0.5827246904373169, + "learning_rate": 4.468313199711216e-05, + "loss": 0.2167, + "num_input_tokens_seen": 22619168, + "step": 38985 + }, + { + "epoch": 5.807268394399761, + "grad_norm": 0.793727695941925, + "learning_rate": 4.468112843958118e-05, + "loss": 0.3113, + "num_input_tokens_seen": 22622048, + "step": 38990 + }, + { + "epoch": 5.808013106940721, + "grad_norm": 62.155269622802734, + "learning_rate": 4.467912454955755e-05, + "loss": 0.343, + "num_input_tokens_seen": 22625056, + "step": 38995 + }, + { + "epoch": 5.80875781948168, + "grad_norm": 12.52209758758545, + "learning_rate": 4.46771203270751e-05, + "loss": 0.3065, + "num_input_tokens_seen": 22627936, + "step": 39000 + }, + { + "epoch": 5.809502532022639, + "grad_norm": 2.9974205493927, + "learning_rate": 4.4675115772167706e-05, + "loss": 0.2017, + "num_input_tokens_seen": 22630976, + "step": 39005 + }, + { + "epoch": 5.810247244563598, + "grad_norm": 0.5689027309417725, + "learning_rate": 4.467311088486922e-05, + "loss": 0.3855, + "num_input_tokens_seen": 22633920, + "step": 39010 + }, + { + "epoch": 5.810991957104558, + "grad_norm": 13.009112358093262, + "learning_rate": 4.467110566521353e-05, + "loss": 0.2762, + "num_input_tokens_seen": 22636704, + "step": 39015 + }, + { + "epoch": 5.811736669645517, + "grad_norm": 10.902670860290527, + "learning_rate": 4.4669100113234504e-05, + "loss": 0.2696, + "num_input_tokens_seen": 22639872, + "step": 39020 + }, + { + "epoch": 5.812481382186476, + "grad_norm": 20.673974990844727, + "learning_rate": 4.466709422896601e-05, + "loss": 0.0281, + "num_input_tokens_seen": 22642688, + "step": 39025 + }, + { + "epoch": 5.813226094727435, + "grad_norm": 19.18532943725586, + "learning_rate": 4.466508801244196e-05, + "loss": 0.4426, + "num_input_tokens_seen": 22645440, + "step": 39030 + }, + { + "epoch": 5.813970807268395, + "grad_norm": 2.8963112831115723, + "learning_rate": 4.466308146369623e-05, + "loss": 0.161, + "num_input_tokens_seen": 22648192, + "step": 39035 + }, + { + "epoch": 5.814715519809353, + "grad_norm": 23.129262924194336, + "learning_rate": 4.466107458276273e-05, + "loss": 0.2571, + "num_input_tokens_seen": 22651232, + "step": 39040 + }, + { + "epoch": 5.815460232350313, + "grad_norm": 0.2249928116798401, + "learning_rate": 4.465906736967534e-05, + "loss": 0.1093, + "num_input_tokens_seen": 22654048, + "step": 39045 + }, + { + "epoch": 5.816204944891272, + "grad_norm": 61.99470901489258, + "learning_rate": 4.465705982446801e-05, + "loss": 0.4673, + "num_input_tokens_seen": 22656800, + "step": 39050 + }, + { + "epoch": 5.8169496574322315, + "grad_norm": 3.242365837097168, + "learning_rate": 4.465505194717462e-05, + "loss": 0.3419, + "num_input_tokens_seen": 22659968, + "step": 39055 + }, + { + "epoch": 5.81769436997319, + "grad_norm": 4.217240810394287, + "learning_rate": 4.46530437378291e-05, + "loss": 0.0769, + "num_input_tokens_seen": 22662720, + "step": 39060 + }, + { + "epoch": 5.81843908251415, + "grad_norm": 14.754368782043457, + "learning_rate": 4.465103519646539e-05, + "loss": 0.4909, + "num_input_tokens_seen": 22665312, + "step": 39065 + }, + { + "epoch": 5.819183795055109, + "grad_norm": 6.587102890014648, + "learning_rate": 4.4649026323117404e-05, + "loss": 0.1793, + "num_input_tokens_seen": 22668320, + "step": 39070 + }, + { + "epoch": 5.819928507596068, + "grad_norm": 35.68766784667969, + "learning_rate": 4.464701711781909e-05, + "loss": 0.3861, + "num_input_tokens_seen": 22671264, + "step": 39075 + }, + { + "epoch": 5.820673220137027, + "grad_norm": 0.11300981789827347, + "learning_rate": 4.46450075806044e-05, + "loss": 0.2964, + "num_input_tokens_seen": 22674240, + "step": 39080 + }, + { + "epoch": 5.821417932677987, + "grad_norm": 48.07511520385742, + "learning_rate": 4.464299771150727e-05, + "loss": 0.3116, + "num_input_tokens_seen": 22677504, + "step": 39085 + }, + { + "epoch": 5.822162645218945, + "grad_norm": 1.730249285697937, + "learning_rate": 4.464098751056165e-05, + "loss": 0.1178, + "num_input_tokens_seen": 22680192, + "step": 39090 + }, + { + "epoch": 5.822907357759904, + "grad_norm": 11.662278175354004, + "learning_rate": 4.463897697780152e-05, + "loss": 0.296, + "num_input_tokens_seen": 22682976, + "step": 39095 + }, + { + "epoch": 5.823652070300864, + "grad_norm": 14.74022388458252, + "learning_rate": 4.463696611326082e-05, + "loss": 0.01, + "num_input_tokens_seen": 22685984, + "step": 39100 + }, + { + "epoch": 5.8243967828418235, + "grad_norm": 3.3819336891174316, + "learning_rate": 4.4634954916973545e-05, + "loss": 0.6089, + "num_input_tokens_seen": 22689024, + "step": 39105 + }, + { + "epoch": 5.825141495382782, + "grad_norm": 0.040957026183605194, + "learning_rate": 4.463294338897366e-05, + "loss": 0.1854, + "num_input_tokens_seen": 22691840, + "step": 39110 + }, + { + "epoch": 5.825886207923741, + "grad_norm": 59.878273010253906, + "learning_rate": 4.463093152929515e-05, + "loss": 0.0907, + "num_input_tokens_seen": 22694656, + "step": 39115 + }, + { + "epoch": 5.826630920464701, + "grad_norm": 0.6432802081108093, + "learning_rate": 4.4628919337972e-05, + "loss": 0.2263, + "num_input_tokens_seen": 22697408, + "step": 39120 + }, + { + "epoch": 5.82737563300566, + "grad_norm": 13.296523094177246, + "learning_rate": 4.462690681503822e-05, + "loss": 0.5875, + "num_input_tokens_seen": 22700064, + "step": 39125 + }, + { + "epoch": 5.828120345546619, + "grad_norm": 96.60971069335938, + "learning_rate": 4.462489396052779e-05, + "loss": 0.1617, + "num_input_tokens_seen": 22702688, + "step": 39130 + }, + { + "epoch": 5.828865058087578, + "grad_norm": 0.05258859694004059, + "learning_rate": 4.462288077447472e-05, + "loss": 0.4542, + "num_input_tokens_seen": 22705664, + "step": 39135 + }, + { + "epoch": 5.8296097706285375, + "grad_norm": 13.7376127243042, + "learning_rate": 4.462086725691302e-05, + "loss": 0.205, + "num_input_tokens_seen": 22708352, + "step": 39140 + }, + { + "epoch": 5.830354483169496, + "grad_norm": 26.7884464263916, + "learning_rate": 4.4618853407876714e-05, + "loss": 0.2431, + "num_input_tokens_seen": 22711104, + "step": 39145 + }, + { + "epoch": 5.831099195710456, + "grad_norm": 14.594905853271484, + "learning_rate": 4.461683922739982e-05, + "loss": 0.0761, + "num_input_tokens_seen": 22713696, + "step": 39150 + }, + { + "epoch": 5.831843908251415, + "grad_norm": 5.329504489898682, + "learning_rate": 4.461482471551637e-05, + "loss": 0.4214, + "num_input_tokens_seen": 22716448, + "step": 39155 + }, + { + "epoch": 5.832588620792374, + "grad_norm": 85.4053955078125, + "learning_rate": 4.4612809872260386e-05, + "loss": 0.2064, + "num_input_tokens_seen": 22719168, + "step": 39160 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.08413850516080856, + "learning_rate": 4.461079469766592e-05, + "loss": 0.314, + "num_input_tokens_seen": 22722240, + "step": 39165 + }, + { + "epoch": 5.834078045874293, + "grad_norm": 32.812015533447266, + "learning_rate": 4.4608779191766994e-05, + "loss": 0.266, + "num_input_tokens_seen": 22725120, + "step": 39170 + }, + { + "epoch": 5.834822758415251, + "grad_norm": 1.1280126571655273, + "learning_rate": 4.460676335459768e-05, + "loss": 0.3928, + "num_input_tokens_seen": 22727968, + "step": 39175 + }, + { + "epoch": 5.835567470956211, + "grad_norm": 16.989707946777344, + "learning_rate": 4.460474718619203e-05, + "loss": 0.4372, + "num_input_tokens_seen": 22730688, + "step": 39180 + }, + { + "epoch": 5.83631218349717, + "grad_norm": 29.555070877075195, + "learning_rate": 4.4602730686584105e-05, + "loss": 0.24, + "num_input_tokens_seen": 22733568, + "step": 39185 + }, + { + "epoch": 5.8370568960381295, + "grad_norm": 0.06361132860183716, + "learning_rate": 4.460071385580796e-05, + "loss": 0.2469, + "num_input_tokens_seen": 22736288, + "step": 39190 + }, + { + "epoch": 5.837801608579088, + "grad_norm": 0.017259858548641205, + "learning_rate": 4.459869669389768e-05, + "loss": 0.0842, + "num_input_tokens_seen": 22739104, + "step": 39195 + }, + { + "epoch": 5.838546321120048, + "grad_norm": 17.3258056640625, + "learning_rate": 4.459667920088734e-05, + "loss": 0.4888, + "num_input_tokens_seen": 22741920, + "step": 39200 + }, + { + "epoch": 5.839291033661007, + "grad_norm": 13.775016784667969, + "learning_rate": 4.459466137681102e-05, + "loss": 0.4265, + "num_input_tokens_seen": 22744896, + "step": 39205 + }, + { + "epoch": 5.840035746201966, + "grad_norm": 0.05577198415994644, + "learning_rate": 4.4592643221702805e-05, + "loss": 0.1263, + "num_input_tokens_seen": 22747808, + "step": 39210 + }, + { + "epoch": 5.840780458742925, + "grad_norm": 32.2905387878418, + "learning_rate": 4.459062473559681e-05, + "loss": 0.466, + "num_input_tokens_seen": 22750432, + "step": 39215 + }, + { + "epoch": 5.841525171283885, + "grad_norm": 29.138389587402344, + "learning_rate": 4.4588605918527104e-05, + "loss": 0.4234, + "num_input_tokens_seen": 22753504, + "step": 39220 + }, + { + "epoch": 5.8422698838248435, + "grad_norm": 2.637021541595459, + "learning_rate": 4.458658677052782e-05, + "loss": 0.2198, + "num_input_tokens_seen": 22756096, + "step": 39225 + }, + { + "epoch": 5.843014596365803, + "grad_norm": 45.20562744140625, + "learning_rate": 4.458456729163306e-05, + "loss": 0.1542, + "num_input_tokens_seen": 22758624, + "step": 39230 + }, + { + "epoch": 5.843759308906762, + "grad_norm": 0.07805617153644562, + "learning_rate": 4.458254748187693e-05, + "loss": 0.0575, + "num_input_tokens_seen": 22761600, + "step": 39235 + }, + { + "epoch": 5.8445040214477215, + "grad_norm": 56.19801330566406, + "learning_rate": 4.458052734129358e-05, + "loss": 0.1865, + "num_input_tokens_seen": 22764480, + "step": 39240 + }, + { + "epoch": 5.84524873398868, + "grad_norm": 0.7766117453575134, + "learning_rate": 4.457850686991711e-05, + "loss": 0.1004, + "num_input_tokens_seen": 22767520, + "step": 39245 + }, + { + "epoch": 5.84599344652964, + "grad_norm": 12.766376495361328, + "learning_rate": 4.4576486067781675e-05, + "loss": 0.8292, + "num_input_tokens_seen": 22770688, + "step": 39250 + }, + { + "epoch": 5.846738159070599, + "grad_norm": 62.38862991333008, + "learning_rate": 4.45744649349214e-05, + "loss": 0.5148, + "num_input_tokens_seen": 22773312, + "step": 39255 + }, + { + "epoch": 5.847482871611557, + "grad_norm": 67.55670166015625, + "learning_rate": 4.457244347137043e-05, + "loss": 0.2397, + "num_input_tokens_seen": 22776160, + "step": 39260 + }, + { + "epoch": 5.848227584152517, + "grad_norm": 1.3273751735687256, + "learning_rate": 4.457042167716292e-05, + "loss": 0.2702, + "num_input_tokens_seen": 22779136, + "step": 39265 + }, + { + "epoch": 5.848972296693477, + "grad_norm": 29.026212692260742, + "learning_rate": 4.456839955233303e-05, + "loss": 0.5343, + "num_input_tokens_seen": 22782080, + "step": 39270 + }, + { + "epoch": 5.8497170092344355, + "grad_norm": 59.81914138793945, + "learning_rate": 4.456637709691491e-05, + "loss": 0.3437, + "num_input_tokens_seen": 22784704, + "step": 39275 + }, + { + "epoch": 5.850461721775394, + "grad_norm": 0.22581535577774048, + "learning_rate": 4.456435431094275e-05, + "loss": 0.1291, + "num_input_tokens_seen": 22787552, + "step": 39280 + }, + { + "epoch": 5.851206434316354, + "grad_norm": 44.17995071411133, + "learning_rate": 4.45623311944507e-05, + "loss": 0.353, + "num_input_tokens_seen": 22790208, + "step": 39285 + }, + { + "epoch": 5.8519511468573135, + "grad_norm": 21.774587631225586, + "learning_rate": 4.4560307747472945e-05, + "loss": 0.2729, + "num_input_tokens_seen": 22792864, + "step": 39290 + }, + { + "epoch": 5.852695859398272, + "grad_norm": 22.714418411254883, + "learning_rate": 4.4558283970043676e-05, + "loss": 0.2649, + "num_input_tokens_seen": 22796000, + "step": 39295 + }, + { + "epoch": 5.853440571939231, + "grad_norm": 5.6105122566223145, + "learning_rate": 4.4556259862197067e-05, + "loss": 0.1927, + "num_input_tokens_seen": 22798976, + "step": 39300 + }, + { + "epoch": 5.854185284480191, + "grad_norm": 6.625308036804199, + "learning_rate": 4.4554235423967336e-05, + "loss": 0.0648, + "num_input_tokens_seen": 22801952, + "step": 39305 + }, + { + "epoch": 5.8549299970211495, + "grad_norm": 60.444854736328125, + "learning_rate": 4.4552210655388664e-05, + "loss": 0.2706, + "num_input_tokens_seen": 22804960, + "step": 39310 + }, + { + "epoch": 5.855674709562109, + "grad_norm": 26.0520076751709, + "learning_rate": 4.455018555649527e-05, + "loss": 0.289, + "num_input_tokens_seen": 22807776, + "step": 39315 + }, + { + "epoch": 5.856419422103068, + "grad_norm": 19.704212188720703, + "learning_rate": 4.4548160127321356e-05, + "loss": 0.1413, + "num_input_tokens_seen": 22810848, + "step": 39320 + }, + { + "epoch": 5.8571641346440275, + "grad_norm": 19.40716552734375, + "learning_rate": 4.454613436790115e-05, + "loss": 0.4091, + "num_input_tokens_seen": 22813664, + "step": 39325 + }, + { + "epoch": 5.857908847184986, + "grad_norm": 5.781551361083984, + "learning_rate": 4.454410827826887e-05, + "loss": 0.4307, + "num_input_tokens_seen": 22816544, + "step": 39330 + }, + { + "epoch": 5.858653559725946, + "grad_norm": 5.083150863647461, + "learning_rate": 4.454208185845874e-05, + "loss": 0.4779, + "num_input_tokens_seen": 22819552, + "step": 39335 + }, + { + "epoch": 5.859398272266905, + "grad_norm": 0.1566845327615738, + "learning_rate": 4.4540055108504996e-05, + "loss": 0.2396, + "num_input_tokens_seen": 22822688, + "step": 39340 + }, + { + "epoch": 5.860142984807864, + "grad_norm": 0.24510520696640015, + "learning_rate": 4.4538028028441885e-05, + "loss": 0.2908, + "num_input_tokens_seen": 22825344, + "step": 39345 + }, + { + "epoch": 5.860887697348823, + "grad_norm": 43.571937561035156, + "learning_rate": 4.453600061830365e-05, + "loss": 0.2791, + "num_input_tokens_seen": 22828256, + "step": 39350 + }, + { + "epoch": 5.861632409889783, + "grad_norm": 14.346365928649902, + "learning_rate": 4.453397287812453e-05, + "loss": 0.3038, + "num_input_tokens_seen": 22831040, + "step": 39355 + }, + { + "epoch": 5.8623771224307415, + "grad_norm": 14.679788589477539, + "learning_rate": 4.4531944807938806e-05, + "loss": 0.2343, + "num_input_tokens_seen": 22833888, + "step": 39360 + }, + { + "epoch": 5.863121834971701, + "grad_norm": 21.45025634765625, + "learning_rate": 4.4529916407780715e-05, + "loss": 0.237, + "num_input_tokens_seen": 22836736, + "step": 39365 + }, + { + "epoch": 5.86386654751266, + "grad_norm": 18.040531158447266, + "learning_rate": 4.452788767768454e-05, + "loss": 0.4368, + "num_input_tokens_seen": 22839616, + "step": 39370 + }, + { + "epoch": 5.8646112600536195, + "grad_norm": 1.809781551361084, + "learning_rate": 4.4525858617684545e-05, + "loss": 0.2455, + "num_input_tokens_seen": 22842752, + "step": 39375 + }, + { + "epoch": 5.865355972594578, + "grad_norm": 13.894521713256836, + "learning_rate": 4.452382922781503e-05, + "loss": 0.3366, + "num_input_tokens_seen": 22845760, + "step": 39380 + }, + { + "epoch": 5.866100685135538, + "grad_norm": 4.024547576904297, + "learning_rate": 4.4521799508110245e-05, + "loss": 0.5682, + "num_input_tokens_seen": 22848544, + "step": 39385 + }, + { + "epoch": 5.866845397676497, + "grad_norm": 37.76620101928711, + "learning_rate": 4.4519769458604504e-05, + "loss": 0.4428, + "num_input_tokens_seen": 22851392, + "step": 39390 + }, + { + "epoch": 5.867590110217456, + "grad_norm": 11.55854606628418, + "learning_rate": 4.45177390793321e-05, + "loss": 0.5278, + "num_input_tokens_seen": 22854656, + "step": 39395 + }, + { + "epoch": 5.868334822758415, + "grad_norm": 16.553688049316406, + "learning_rate": 4.451570837032733e-05, + "loss": 0.2603, + "num_input_tokens_seen": 22857568, + "step": 39400 + }, + { + "epoch": 5.869079535299375, + "grad_norm": 16.62275505065918, + "learning_rate": 4.45136773316245e-05, + "loss": 0.3983, + "num_input_tokens_seen": 22860864, + "step": 39405 + }, + { + "epoch": 5.8698242478403335, + "grad_norm": 4.742278575897217, + "learning_rate": 4.451164596325793e-05, + "loss": 0.1969, + "num_input_tokens_seen": 22863712, + "step": 39410 + }, + { + "epoch": 5.870568960381293, + "grad_norm": 12.359701156616211, + "learning_rate": 4.450961426526192e-05, + "loss": 0.3289, + "num_input_tokens_seen": 22866656, + "step": 39415 + }, + { + "epoch": 5.871313672922252, + "grad_norm": 18.054515838623047, + "learning_rate": 4.450758223767082e-05, + "loss": 0.1488, + "num_input_tokens_seen": 22869344, + "step": 39420 + }, + { + "epoch": 5.872058385463212, + "grad_norm": 16.24787712097168, + "learning_rate": 4.4505549880518935e-05, + "loss": 0.5578, + "num_input_tokens_seen": 22872192, + "step": 39425 + }, + { + "epoch": 5.87280309800417, + "grad_norm": 0.7318844199180603, + "learning_rate": 4.4503517193840615e-05, + "loss": 0.2718, + "num_input_tokens_seen": 22875136, + "step": 39430 + }, + { + "epoch": 5.87354781054513, + "grad_norm": 14.982832908630371, + "learning_rate": 4.4501484177670186e-05, + "loss": 0.394, + "num_input_tokens_seen": 22878176, + "step": 39435 + }, + { + "epoch": 5.874292523086089, + "grad_norm": 2.9448578357696533, + "learning_rate": 4.449945083204201e-05, + "loss": 0.1495, + "num_input_tokens_seen": 22881344, + "step": 39440 + }, + { + "epoch": 5.8750372356270475, + "grad_norm": 32.12130355834961, + "learning_rate": 4.4497417156990427e-05, + "loss": 0.2499, + "num_input_tokens_seen": 22884128, + "step": 39445 + }, + { + "epoch": 5.875781948168007, + "grad_norm": 19.683841705322266, + "learning_rate": 4.44953831525498e-05, + "loss": 0.3088, + "num_input_tokens_seen": 22886944, + "step": 39450 + }, + { + "epoch": 5.876526660708967, + "grad_norm": 16.359149932861328, + "learning_rate": 4.449334881875449e-05, + "loss": 0.2329, + "num_input_tokens_seen": 22889760, + "step": 39455 + }, + { + "epoch": 5.8772713732499255, + "grad_norm": 0.2322760820388794, + "learning_rate": 4.4491314155638865e-05, + "loss": 0.0652, + "num_input_tokens_seen": 22892800, + "step": 39460 + }, + { + "epoch": 5.878016085790884, + "grad_norm": 13.041929244995117, + "learning_rate": 4.448927916323729e-05, + "loss": 0.3831, + "num_input_tokens_seen": 22895680, + "step": 39465 + }, + { + "epoch": 5.878760798331844, + "grad_norm": 2.1988134384155273, + "learning_rate": 4.448724384158416e-05, + "loss": 0.2863, + "num_input_tokens_seen": 22898912, + "step": 39470 + }, + { + "epoch": 5.879505510872804, + "grad_norm": 16.645479202270508, + "learning_rate": 4.4485208190713846e-05, + "loss": 0.2709, + "num_input_tokens_seen": 22901408, + "step": 39475 + }, + { + "epoch": 5.880250223413762, + "grad_norm": 5.945645332336426, + "learning_rate": 4.448317221066074e-05, + "loss": 0.5559, + "num_input_tokens_seen": 22904256, + "step": 39480 + }, + { + "epoch": 5.880994935954721, + "grad_norm": 23.348752975463867, + "learning_rate": 4.4481135901459245e-05, + "loss": 0.2545, + "num_input_tokens_seen": 22907072, + "step": 39485 + }, + { + "epoch": 5.881739648495681, + "grad_norm": 21.400197982788086, + "learning_rate": 4.4479099263143765e-05, + "loss": 0.2094, + "num_input_tokens_seen": 22909792, + "step": 39490 + }, + { + "epoch": 5.8824843610366395, + "grad_norm": 6.685596942901611, + "learning_rate": 4.447706229574869e-05, + "loss": 0.2862, + "num_input_tokens_seen": 22912928, + "step": 39495 + }, + { + "epoch": 5.883229073577599, + "grad_norm": 2.32906174659729, + "learning_rate": 4.4475024999308454e-05, + "loss": 0.1961, + "num_input_tokens_seen": 22915648, + "step": 39500 + }, + { + "epoch": 5.883973786118558, + "grad_norm": 10.642359733581543, + "learning_rate": 4.4472987373857456e-05, + "loss": 0.2654, + "num_input_tokens_seen": 22918368, + "step": 39505 + }, + { + "epoch": 5.884718498659518, + "grad_norm": 0.17170320451259613, + "learning_rate": 4.447094941943013e-05, + "loss": 0.1505, + "num_input_tokens_seen": 22921568, + "step": 39510 + }, + { + "epoch": 5.885463211200476, + "grad_norm": 28.422143936157227, + "learning_rate": 4.44689111360609e-05, + "loss": 0.1574, + "num_input_tokens_seen": 22924320, + "step": 39515 + }, + { + "epoch": 5.886207923741436, + "grad_norm": 12.119024276733398, + "learning_rate": 4.446687252378421e-05, + "loss": 0.2364, + "num_input_tokens_seen": 22927456, + "step": 39520 + }, + { + "epoch": 5.886952636282395, + "grad_norm": 4.659893989562988, + "learning_rate": 4.446483358263449e-05, + "loss": 0.1633, + "num_input_tokens_seen": 22930208, + "step": 39525 + }, + { + "epoch": 5.887697348823354, + "grad_norm": 105.33281707763672, + "learning_rate": 4.44627943126462e-05, + "loss": 0.474, + "num_input_tokens_seen": 22932896, + "step": 39530 + }, + { + "epoch": 5.888442061364313, + "grad_norm": 29.561710357666016, + "learning_rate": 4.446075471385376e-05, + "loss": 0.2185, + "num_input_tokens_seen": 22935872, + "step": 39535 + }, + { + "epoch": 5.889186773905273, + "grad_norm": 32.222389221191406, + "learning_rate": 4.4458714786291666e-05, + "loss": 0.2695, + "num_input_tokens_seen": 22938848, + "step": 39540 + }, + { + "epoch": 5.8899314864462315, + "grad_norm": 7.868519306182861, + "learning_rate": 4.4456674529994356e-05, + "loss": 0.2067, + "num_input_tokens_seen": 22941664, + "step": 39545 + }, + { + "epoch": 5.890676198987191, + "grad_norm": 28.360572814941406, + "learning_rate": 4.44546339449963e-05, + "loss": 0.5056, + "num_input_tokens_seen": 22944480, + "step": 39550 + }, + { + "epoch": 5.89142091152815, + "grad_norm": 13.265695571899414, + "learning_rate": 4.445259303133198e-05, + "loss": 0.222, + "num_input_tokens_seen": 22947520, + "step": 39555 + }, + { + "epoch": 5.89216562406911, + "grad_norm": 11.888436317443848, + "learning_rate": 4.445055178903588e-05, + "loss": 0.4064, + "num_input_tokens_seen": 22950464, + "step": 39560 + }, + { + "epoch": 5.892910336610068, + "grad_norm": 18.00731658935547, + "learning_rate": 4.444851021814247e-05, + "loss": 0.1213, + "num_input_tokens_seen": 22953184, + "step": 39565 + }, + { + "epoch": 5.893655049151028, + "grad_norm": 38.48763656616211, + "learning_rate": 4.444646831868624e-05, + "loss": 0.398, + "num_input_tokens_seen": 22956032, + "step": 39570 + }, + { + "epoch": 5.894399761691987, + "grad_norm": 0.10557620227336884, + "learning_rate": 4.44444260907017e-05, + "loss": 0.0483, + "num_input_tokens_seen": 22958784, + "step": 39575 + }, + { + "epoch": 5.895144474232946, + "grad_norm": 0.7244097590446472, + "learning_rate": 4.444238353422334e-05, + "loss": 0.4002, + "num_input_tokens_seen": 22961568, + "step": 39580 + }, + { + "epoch": 5.895889186773905, + "grad_norm": 0.08660425245761871, + "learning_rate": 4.444034064928567e-05, + "loss": 0.2981, + "num_input_tokens_seen": 22964480, + "step": 39585 + }, + { + "epoch": 5.896633899314865, + "grad_norm": 4.39486837387085, + "learning_rate": 4.443829743592321e-05, + "loss": 0.1793, + "num_input_tokens_seen": 22967200, + "step": 39590 + }, + { + "epoch": 5.897378611855824, + "grad_norm": 0.01842552237212658, + "learning_rate": 4.4436253894170464e-05, + "loss": 0.0523, + "num_input_tokens_seen": 22969792, + "step": 39595 + }, + { + "epoch": 5.898123324396783, + "grad_norm": 0.18781283497810364, + "learning_rate": 4.4434210024061966e-05, + "loss": 0.2614, + "num_input_tokens_seen": 22972960, + "step": 39600 + }, + { + "epoch": 5.898868036937742, + "grad_norm": 7.418697834014893, + "learning_rate": 4.443216582563224e-05, + "loss": 0.3011, + "num_input_tokens_seen": 22975776, + "step": 39605 + }, + { + "epoch": 5.899612749478701, + "grad_norm": 30.949743270874023, + "learning_rate": 4.443012129891583e-05, + "loss": 0.204, + "num_input_tokens_seen": 22978336, + "step": 39610 + }, + { + "epoch": 5.90035746201966, + "grad_norm": 29.03008270263672, + "learning_rate": 4.442807644394725e-05, + "loss": 0.4323, + "num_input_tokens_seen": 22981248, + "step": 39615 + }, + { + "epoch": 5.90110217456062, + "grad_norm": 27.095823287963867, + "learning_rate": 4.442603126076108e-05, + "loss": 0.1177, + "num_input_tokens_seen": 22983968, + "step": 39620 + }, + { + "epoch": 5.901846887101579, + "grad_norm": 2.0329654216766357, + "learning_rate": 4.442398574939185e-05, + "loss": 0.2633, + "num_input_tokens_seen": 22986848, + "step": 39625 + }, + { + "epoch": 5.9025915996425375, + "grad_norm": 0.08049395680427551, + "learning_rate": 4.442193990987412e-05, + "loss": 0.1946, + "num_input_tokens_seen": 22989728, + "step": 39630 + }, + { + "epoch": 5.903336312183497, + "grad_norm": 33.72814178466797, + "learning_rate": 4.441989374224246e-05, + "loss": 0.2871, + "num_input_tokens_seen": 22992416, + "step": 39635 + }, + { + "epoch": 5.904081024724457, + "grad_norm": 22.970170974731445, + "learning_rate": 4.4417847246531435e-05, + "loss": 0.1732, + "num_input_tokens_seen": 22995040, + "step": 39640 + }, + { + "epoch": 5.904825737265416, + "grad_norm": 0.0356891043484211, + "learning_rate": 4.4415800422775614e-05, + "loss": 0.309, + "num_input_tokens_seen": 22997920, + "step": 39645 + }, + { + "epoch": 5.905570449806374, + "grad_norm": 10.03492546081543, + "learning_rate": 4.441375327100957e-05, + "loss": 0.1594, + "num_input_tokens_seen": 23000960, + "step": 39650 + }, + { + "epoch": 5.906315162347334, + "grad_norm": 0.37761494517326355, + "learning_rate": 4.4411705791267904e-05, + "loss": 0.1744, + "num_input_tokens_seen": 23003872, + "step": 39655 + }, + { + "epoch": 5.907059874888293, + "grad_norm": 3.859283924102783, + "learning_rate": 4.44096579835852e-05, + "loss": 0.2766, + "num_input_tokens_seen": 23006656, + "step": 39660 + }, + { + "epoch": 5.907804587429252, + "grad_norm": 6.399391174316406, + "learning_rate": 4.440760984799605e-05, + "loss": 0.2613, + "num_input_tokens_seen": 23009408, + "step": 39665 + }, + { + "epoch": 5.908549299970211, + "grad_norm": 3.373342275619507, + "learning_rate": 4.440556138453505e-05, + "loss": 0.1945, + "num_input_tokens_seen": 23012352, + "step": 39670 + }, + { + "epoch": 5.909294012511171, + "grad_norm": 5.5604424476623535, + "learning_rate": 4.440351259323682e-05, + "loss": 0.2052, + "num_input_tokens_seen": 23015040, + "step": 39675 + }, + { + "epoch": 5.91003872505213, + "grad_norm": 0.03376384451985359, + "learning_rate": 4.440146347413596e-05, + "loss": 0.1437, + "num_input_tokens_seen": 23017920, + "step": 39680 + }, + { + "epoch": 5.910783437593089, + "grad_norm": 35.57423400878906, + "learning_rate": 4.4399414027267094e-05, + "loss": 0.6006, + "num_input_tokens_seen": 23020672, + "step": 39685 + }, + { + "epoch": 5.911528150134048, + "grad_norm": 12.430676460266113, + "learning_rate": 4.439736425266485e-05, + "loss": 0.4048, + "num_input_tokens_seen": 23023552, + "step": 39690 + }, + { + "epoch": 5.912272862675008, + "grad_norm": 29.21845817565918, + "learning_rate": 4.4395314150363856e-05, + "loss": 0.2979, + "num_input_tokens_seen": 23026400, + "step": 39695 + }, + { + "epoch": 5.913017575215966, + "grad_norm": 0.36928316950798035, + "learning_rate": 4.439326372039872e-05, + "loss": 0.0668, + "num_input_tokens_seen": 23029504, + "step": 39700 + }, + { + "epoch": 5.913762287756926, + "grad_norm": 0.009695461951196194, + "learning_rate": 4.439121296280413e-05, + "loss": 0.2625, + "num_input_tokens_seen": 23032224, + "step": 39705 + }, + { + "epoch": 5.914507000297885, + "grad_norm": 35.37885665893555, + "learning_rate": 4.438916187761469e-05, + "loss": 0.4378, + "num_input_tokens_seen": 23034912, + "step": 39710 + }, + { + "epoch": 5.915251712838844, + "grad_norm": 8.042108535766602, + "learning_rate": 4.4387110464865066e-05, + "loss": 0.1739, + "num_input_tokens_seen": 23037600, + "step": 39715 + }, + { + "epoch": 5.915996425379803, + "grad_norm": 1.4215888977050781, + "learning_rate": 4.4385058724589925e-05, + "loss": 0.0719, + "num_input_tokens_seen": 23040544, + "step": 39720 + }, + { + "epoch": 5.916741137920763, + "grad_norm": 20.488882064819336, + "learning_rate": 4.438300665682391e-05, + "loss": 0.1959, + "num_input_tokens_seen": 23043584, + "step": 39725 + }, + { + "epoch": 5.917485850461722, + "grad_norm": 0.6210819482803345, + "learning_rate": 4.43809542616017e-05, + "loss": 0.1677, + "num_input_tokens_seen": 23046560, + "step": 39730 + }, + { + "epoch": 5.918230563002681, + "grad_norm": 32.543704986572266, + "learning_rate": 4.437890153895797e-05, + "loss": 0.4227, + "num_input_tokens_seen": 23049344, + "step": 39735 + }, + { + "epoch": 5.91897527554364, + "grad_norm": 0.06227029487490654, + "learning_rate": 4.437684848892739e-05, + "loss": 0.2677, + "num_input_tokens_seen": 23052224, + "step": 39740 + }, + { + "epoch": 5.9197199880846, + "grad_norm": 0.20099849998950958, + "learning_rate": 4.437479511154465e-05, + "loss": 0.395, + "num_input_tokens_seen": 23054880, + "step": 39745 + }, + { + "epoch": 5.920464700625558, + "grad_norm": 53.808349609375, + "learning_rate": 4.4372741406844434e-05, + "loss": 0.3717, + "num_input_tokens_seen": 23057728, + "step": 39750 + }, + { + "epoch": 5.921209413166518, + "grad_norm": 24.027021408081055, + "learning_rate": 4.437068737486145e-05, + "loss": 0.2288, + "num_input_tokens_seen": 23060736, + "step": 39755 + }, + { + "epoch": 5.921954125707477, + "grad_norm": 18.011667251586914, + "learning_rate": 4.4368633015630385e-05, + "loss": 0.2956, + "num_input_tokens_seen": 23063744, + "step": 39760 + }, + { + "epoch": 5.9226988382484365, + "grad_norm": 6.219000339508057, + "learning_rate": 4.436657832918595e-05, + "loss": 0.3601, + "num_input_tokens_seen": 23066656, + "step": 39765 + }, + { + "epoch": 5.923443550789395, + "grad_norm": 0.4050758481025696, + "learning_rate": 4.436452331556286e-05, + "loss": 0.1678, + "num_input_tokens_seen": 23069888, + "step": 39770 + }, + { + "epoch": 5.924188263330355, + "grad_norm": 24.59292221069336, + "learning_rate": 4.436246797479582e-05, + "loss": 0.2957, + "num_input_tokens_seen": 23072448, + "step": 39775 + }, + { + "epoch": 5.924932975871314, + "grad_norm": 5.736574649810791, + "learning_rate": 4.436041230691957e-05, + "loss": 0.2457, + "num_input_tokens_seen": 23075392, + "step": 39780 + }, + { + "epoch": 5.925677688412273, + "grad_norm": 16.856449127197266, + "learning_rate": 4.435835631196884e-05, + "loss": 0.4215, + "num_input_tokens_seen": 23078400, + "step": 39785 + }, + { + "epoch": 5.926422400953232, + "grad_norm": 1.7256048917770386, + "learning_rate": 4.435629998997835e-05, + "loss": 0.2211, + "num_input_tokens_seen": 23081280, + "step": 39790 + }, + { + "epoch": 5.927167113494191, + "grad_norm": 21.64481544494629, + "learning_rate": 4.435424334098284e-05, + "loss": 0.4368, + "num_input_tokens_seen": 23084000, + "step": 39795 + }, + { + "epoch": 5.92791182603515, + "grad_norm": 0.032076846808195114, + "learning_rate": 4.435218636501706e-05, + "loss": 0.2479, + "num_input_tokens_seen": 23086944, + "step": 39800 + }, + { + "epoch": 5.92865653857611, + "grad_norm": 0.1059778705239296, + "learning_rate": 4.435012906211576e-05, + "loss": 0.0462, + "num_input_tokens_seen": 23089600, + "step": 39805 + }, + { + "epoch": 5.929401251117069, + "grad_norm": 1.8991014957427979, + "learning_rate": 4.43480714323137e-05, + "loss": 0.1377, + "num_input_tokens_seen": 23092096, + "step": 39810 + }, + { + "epoch": 5.930145963658028, + "grad_norm": 3.1171188354492188, + "learning_rate": 4.434601347564563e-05, + "loss": 0.0979, + "num_input_tokens_seen": 23094816, + "step": 39815 + }, + { + "epoch": 5.930890676198987, + "grad_norm": 3.4230964183807373, + "learning_rate": 4.434395519214633e-05, + "loss": 0.304, + "num_input_tokens_seen": 23097632, + "step": 39820 + }, + { + "epoch": 5.931635388739946, + "grad_norm": 14.001443862915039, + "learning_rate": 4.4341896581850566e-05, + "loss": 0.6294, + "num_input_tokens_seen": 23100448, + "step": 39825 + }, + { + "epoch": 5.932380101280906, + "grad_norm": 0.10266993939876556, + "learning_rate": 4.433983764479312e-05, + "loss": 0.1108, + "num_input_tokens_seen": 23103424, + "step": 39830 + }, + { + "epoch": 5.933124813821864, + "grad_norm": 3.4657371044158936, + "learning_rate": 4.433777838100876e-05, + "loss": 0.1563, + "num_input_tokens_seen": 23106144, + "step": 39835 + }, + { + "epoch": 5.933869526362824, + "grad_norm": 0.18681560456752777, + "learning_rate": 4.433571879053231e-05, + "loss": 0.1403, + "num_input_tokens_seen": 23108864, + "step": 39840 + }, + { + "epoch": 5.934614238903783, + "grad_norm": 26.011747360229492, + "learning_rate": 4.433365887339853e-05, + "loss": 0.2888, + "num_input_tokens_seen": 23111904, + "step": 39845 + }, + { + "epoch": 5.9353589514447425, + "grad_norm": 7.262552738189697, + "learning_rate": 4.4331598629642235e-05, + "loss": 0.1985, + "num_input_tokens_seen": 23114944, + "step": 39850 + }, + { + "epoch": 5.936103663985701, + "grad_norm": 0.14116479456424713, + "learning_rate": 4.432953805929823e-05, + "loss": 0.2098, + "num_input_tokens_seen": 23117760, + "step": 39855 + }, + { + "epoch": 5.936848376526661, + "grad_norm": 10.618697166442871, + "learning_rate": 4.432747716240133e-05, + "loss": 0.3564, + "num_input_tokens_seen": 23120800, + "step": 39860 + }, + { + "epoch": 5.93759308906762, + "grad_norm": 15.78625202178955, + "learning_rate": 4.432541593898634e-05, + "loss": 0.4935, + "num_input_tokens_seen": 23123584, + "step": 39865 + }, + { + "epoch": 5.938337801608579, + "grad_norm": 40.16399383544922, + "learning_rate": 4.432335438908809e-05, + "loss": 0.4967, + "num_input_tokens_seen": 23126400, + "step": 39870 + }, + { + "epoch": 5.939082514149538, + "grad_norm": 10.513951301574707, + "learning_rate": 4.432129251274141e-05, + "loss": 0.3784, + "num_input_tokens_seen": 23129408, + "step": 39875 + }, + { + "epoch": 5.939827226690498, + "grad_norm": 16.810882568359375, + "learning_rate": 4.431923030998113e-05, + "loss": 0.2894, + "num_input_tokens_seen": 23132384, + "step": 39880 + }, + { + "epoch": 5.940571939231456, + "grad_norm": 27.999223709106445, + "learning_rate": 4.4317167780842086e-05, + "loss": 0.3044, + "num_input_tokens_seen": 23135360, + "step": 39885 + }, + { + "epoch": 5.941316651772416, + "grad_norm": 21.786718368530273, + "learning_rate": 4.4315104925359124e-05, + "loss": 0.1179, + "num_input_tokens_seen": 23138240, + "step": 39890 + }, + { + "epoch": 5.942061364313375, + "grad_norm": 135.30194091796875, + "learning_rate": 4.431304174356709e-05, + "loss": 0.3798, + "num_input_tokens_seen": 23141024, + "step": 39895 + }, + { + "epoch": 5.9428060768543345, + "grad_norm": 2.031420946121216, + "learning_rate": 4.431097823550086e-05, + "loss": 0.258, + "num_input_tokens_seen": 23143840, + "step": 39900 + }, + { + "epoch": 5.943550789395293, + "grad_norm": 13.804327011108398, + "learning_rate": 4.4308914401195275e-05, + "loss": 0.4503, + "num_input_tokens_seen": 23146816, + "step": 39905 + }, + { + "epoch": 5.944295501936253, + "grad_norm": 8.59709358215332, + "learning_rate": 4.43068502406852e-05, + "loss": 0.5322, + "num_input_tokens_seen": 23149632, + "step": 39910 + }, + { + "epoch": 5.945040214477212, + "grad_norm": 22.47564697265625, + "learning_rate": 4.4304785754005516e-05, + "loss": 0.2456, + "num_input_tokens_seen": 23152768, + "step": 39915 + }, + { + "epoch": 5.945784927018171, + "grad_norm": 41.285465240478516, + "learning_rate": 4.43027209411911e-05, + "loss": 0.1537, + "num_input_tokens_seen": 23155552, + "step": 39920 + }, + { + "epoch": 5.94652963955913, + "grad_norm": 0.9425075650215149, + "learning_rate": 4.430065580227683e-05, + "loss": 0.4131, + "num_input_tokens_seen": 23158496, + "step": 39925 + }, + { + "epoch": 5.94727435210009, + "grad_norm": 17.31528663635254, + "learning_rate": 4.4298590337297595e-05, + "loss": 0.2212, + "num_input_tokens_seen": 23161376, + "step": 39930 + }, + { + "epoch": 5.9480190646410485, + "grad_norm": 9.071615219116211, + "learning_rate": 4.4296524546288286e-05, + "loss": 0.2969, + "num_input_tokens_seen": 23164192, + "step": 39935 + }, + { + "epoch": 5.948763777182008, + "grad_norm": 0.7731643319129944, + "learning_rate": 4.429445842928382e-05, + "loss": 0.0629, + "num_input_tokens_seen": 23166848, + "step": 39940 + }, + { + "epoch": 5.949508489722967, + "grad_norm": 14.5902099609375, + "learning_rate": 4.4292391986319084e-05, + "loss": 0.2887, + "num_input_tokens_seen": 23169856, + "step": 39945 + }, + { + "epoch": 5.9502532022639265, + "grad_norm": 0.058354973793029785, + "learning_rate": 4.429032521742899e-05, + "loss": 0.2801, + "num_input_tokens_seen": 23172544, + "step": 39950 + }, + { + "epoch": 5.950997914804885, + "grad_norm": 6.795621871948242, + "learning_rate": 4.428825812264845e-05, + "loss": 0.1884, + "num_input_tokens_seen": 23175840, + "step": 39955 + }, + { + "epoch": 5.951742627345844, + "grad_norm": 10.887886047363281, + "learning_rate": 4.4286190702012405e-05, + "loss": 0.4096, + "num_input_tokens_seen": 23178976, + "step": 39960 + }, + { + "epoch": 5.952487339886804, + "grad_norm": 0.26523157954216003, + "learning_rate": 4.428412295555576e-05, + "loss": 0.0068, + "num_input_tokens_seen": 23182112, + "step": 39965 + }, + { + "epoch": 5.953232052427763, + "grad_norm": 20.678144454956055, + "learning_rate": 4.4282054883313464e-05, + "loss": 0.2671, + "num_input_tokens_seen": 23184864, + "step": 39970 + }, + { + "epoch": 5.953976764968722, + "grad_norm": 20.407978057861328, + "learning_rate": 4.427998648532045e-05, + "loss": 0.2597, + "num_input_tokens_seen": 23188128, + "step": 39975 + }, + { + "epoch": 5.954721477509681, + "grad_norm": 11.772546768188477, + "learning_rate": 4.427791776161165e-05, + "loss": 0.1755, + "num_input_tokens_seen": 23190880, + "step": 39980 + }, + { + "epoch": 5.9554661900506405, + "grad_norm": 7.730973243713379, + "learning_rate": 4.4275848712222035e-05, + "loss": 0.2289, + "num_input_tokens_seen": 23193760, + "step": 39985 + }, + { + "epoch": 5.9562109025916, + "grad_norm": 0.03921213373541832, + "learning_rate": 4.4273779337186536e-05, + "loss": 0.0681, + "num_input_tokens_seen": 23196576, + "step": 39990 + }, + { + "epoch": 5.956955615132559, + "grad_norm": 0.8676679134368896, + "learning_rate": 4.427170963654013e-05, + "loss": 0.2144, + "num_input_tokens_seen": 23199424, + "step": 39995 + }, + { + "epoch": 5.957700327673518, + "grad_norm": 12.72353744506836, + "learning_rate": 4.426963961031777e-05, + "loss": 0.2109, + "num_input_tokens_seen": 23202272, + "step": 40000 + }, + { + "epoch": 5.958445040214477, + "grad_norm": 14.778027534484863, + "learning_rate": 4.426756925855444e-05, + "loss": 0.0576, + "num_input_tokens_seen": 23205248, + "step": 40005 + }, + { + "epoch": 5.959189752755436, + "grad_norm": 21.31448745727539, + "learning_rate": 4.4265498581285114e-05, + "loss": 0.3096, + "num_input_tokens_seen": 23208448, + "step": 40010 + }, + { + "epoch": 5.959934465296396, + "grad_norm": 0.15371763706207275, + "learning_rate": 4.426342757854476e-05, + "loss": 0.1041, + "num_input_tokens_seen": 23211488, + "step": 40015 + }, + { + "epoch": 5.9606791778373545, + "grad_norm": 50.78289794921875, + "learning_rate": 4.4261356250368386e-05, + "loss": 0.6096, + "num_input_tokens_seen": 23214176, + "step": 40020 + }, + { + "epoch": 5.961423890378314, + "grad_norm": 32.604034423828125, + "learning_rate": 4.4259284596790976e-05, + "loss": 0.2614, + "num_input_tokens_seen": 23216864, + "step": 40025 + }, + { + "epoch": 5.962168602919273, + "grad_norm": 58.0198860168457, + "learning_rate": 4.425721261784751e-05, + "loss": 0.4427, + "num_input_tokens_seen": 23219392, + "step": 40030 + }, + { + "epoch": 5.9629133154602325, + "grad_norm": 10.673432350158691, + "learning_rate": 4.425514031357302e-05, + "loss": 0.1775, + "num_input_tokens_seen": 23222272, + "step": 40035 + }, + { + "epoch": 5.963658028001191, + "grad_norm": 40.8205680847168, + "learning_rate": 4.4253067684002505e-05, + "loss": 0.5049, + "num_input_tokens_seen": 23225088, + "step": 40040 + }, + { + "epoch": 5.964402740542151, + "grad_norm": 115.59820556640625, + "learning_rate": 4.425099472917098e-05, + "loss": 0.1729, + "num_input_tokens_seen": 23227840, + "step": 40045 + }, + { + "epoch": 5.96514745308311, + "grad_norm": 19.004335403442383, + "learning_rate": 4.4248921449113464e-05, + "loss": 0.2759, + "num_input_tokens_seen": 23230592, + "step": 40050 + }, + { + "epoch": 5.965892165624069, + "grad_norm": 18.672134399414062, + "learning_rate": 4.424684784386498e-05, + "loss": 0.395, + "num_input_tokens_seen": 23233504, + "step": 40055 + }, + { + "epoch": 5.966636878165028, + "grad_norm": 26.26060676574707, + "learning_rate": 4.424477391346057e-05, + "loss": 0.3666, + "num_input_tokens_seen": 23236480, + "step": 40060 + }, + { + "epoch": 5.967381590705988, + "grad_norm": 1.4983173608779907, + "learning_rate": 4.424269965793526e-05, + "loss": 0.0957, + "num_input_tokens_seen": 23239200, + "step": 40065 + }, + { + "epoch": 5.9681263032469465, + "grad_norm": 20.498449325561523, + "learning_rate": 4.424062507732409e-05, + "loss": 0.3833, + "num_input_tokens_seen": 23242080, + "step": 40070 + }, + { + "epoch": 5.968871015787906, + "grad_norm": 17.836244583129883, + "learning_rate": 4.4238550171662127e-05, + "loss": 0.1112, + "num_input_tokens_seen": 23245088, + "step": 40075 + }, + { + "epoch": 5.969615728328865, + "grad_norm": 4.778038501739502, + "learning_rate": 4.423647494098441e-05, + "loss": 0.0498, + "num_input_tokens_seen": 23247904, + "step": 40080 + }, + { + "epoch": 5.9703604408698245, + "grad_norm": 16.084070205688477, + "learning_rate": 4.423439938532599e-05, + "loss": 0.1427, + "num_input_tokens_seen": 23251104, + "step": 40085 + }, + { + "epoch": 5.971105153410783, + "grad_norm": 6.291694641113281, + "learning_rate": 4.423232350472195e-05, + "loss": 0.2686, + "num_input_tokens_seen": 23254080, + "step": 40090 + }, + { + "epoch": 5.971849865951743, + "grad_norm": 8.280017852783203, + "learning_rate": 4.423024729920735e-05, + "loss": 0.3549, + "num_input_tokens_seen": 23257312, + "step": 40095 + }, + { + "epoch": 5.972594578492702, + "grad_norm": 7.794059753417969, + "learning_rate": 4.422817076881726e-05, + "loss": 0.0296, + "num_input_tokens_seen": 23260064, + "step": 40100 + }, + { + "epoch": 5.973339291033661, + "grad_norm": 66.01786804199219, + "learning_rate": 4.422609391358677e-05, + "loss": 0.2688, + "num_input_tokens_seen": 23263040, + "step": 40105 + }, + { + "epoch": 5.97408400357462, + "grad_norm": 0.013018245808780193, + "learning_rate": 4.4224016733550975e-05, + "loss": 0.1194, + "num_input_tokens_seen": 23265984, + "step": 40110 + }, + { + "epoch": 5.97482871611558, + "grad_norm": 30.347774505615234, + "learning_rate": 4.4221939228744945e-05, + "loss": 0.2894, + "num_input_tokens_seen": 23268768, + "step": 40115 + }, + { + "epoch": 5.9755734286565385, + "grad_norm": 0.021802689880132675, + "learning_rate": 4.421986139920379e-05, + "loss": 0.2101, + "num_input_tokens_seen": 23271456, + "step": 40120 + }, + { + "epoch": 5.976318141197497, + "grad_norm": 25.393211364746094, + "learning_rate": 4.4217783244962615e-05, + "loss": 0.333, + "num_input_tokens_seen": 23274496, + "step": 40125 + }, + { + "epoch": 5.977062853738457, + "grad_norm": 7.782171249389648, + "learning_rate": 4.421570476605652e-05, + "loss": 0.4952, + "num_input_tokens_seen": 23277504, + "step": 40130 + }, + { + "epoch": 5.977807566279417, + "grad_norm": 29.781295776367188, + "learning_rate": 4.421362596252062e-05, + "loss": 0.408, + "num_input_tokens_seen": 23280448, + "step": 40135 + }, + { + "epoch": 5.978552278820375, + "grad_norm": 19.70213508605957, + "learning_rate": 4.4211546834390046e-05, + "loss": 0.1835, + "num_input_tokens_seen": 23283104, + "step": 40140 + }, + { + "epoch": 5.979296991361334, + "grad_norm": 0.2876449525356293, + "learning_rate": 4.420946738169991e-05, + "loss": 0.3097, + "num_input_tokens_seen": 23286176, + "step": 40145 + }, + { + "epoch": 5.980041703902294, + "grad_norm": 0.17860618233680725, + "learning_rate": 4.4207387604485345e-05, + "loss": 0.1191, + "num_input_tokens_seen": 23289184, + "step": 40150 + }, + { + "epoch": 5.980786416443253, + "grad_norm": 25.588634490966797, + "learning_rate": 4.420530750278149e-05, + "loss": 0.5054, + "num_input_tokens_seen": 23292288, + "step": 40155 + }, + { + "epoch": 5.981531128984212, + "grad_norm": 0.018204206600785255, + "learning_rate": 4.420322707662348e-05, + "loss": 0.1824, + "num_input_tokens_seen": 23296192, + "step": 40160 + }, + { + "epoch": 5.982275841525171, + "grad_norm": 56.632625579833984, + "learning_rate": 4.420114632604647e-05, + "loss": 0.1085, + "num_input_tokens_seen": 23299200, + "step": 40165 + }, + { + "epoch": 5.9830205540661305, + "grad_norm": 39.1386833190918, + "learning_rate": 4.41990652510856e-05, + "loss": 0.2311, + "num_input_tokens_seen": 23302208, + "step": 40170 + }, + { + "epoch": 5.983765266607089, + "grad_norm": 15.700273513793945, + "learning_rate": 4.4196983851776044e-05, + "loss": 0.5677, + "num_input_tokens_seen": 23304960, + "step": 40175 + }, + { + "epoch": 5.984509979148049, + "grad_norm": 9.646729469299316, + "learning_rate": 4.419490212815296e-05, + "loss": 0.3341, + "num_input_tokens_seen": 23308032, + "step": 40180 + }, + { + "epoch": 5.985254691689008, + "grad_norm": 0.6590297818183899, + "learning_rate": 4.419282008025151e-05, + "loss": 0.3506, + "num_input_tokens_seen": 23311040, + "step": 40185 + }, + { + "epoch": 5.985999404229967, + "grad_norm": 12.45244312286377, + "learning_rate": 4.4190737708106864e-05, + "loss": 0.1172, + "num_input_tokens_seen": 23314080, + "step": 40190 + }, + { + "epoch": 5.986744116770926, + "grad_norm": 0.7417761087417603, + "learning_rate": 4.418865501175422e-05, + "loss": 0.1048, + "num_input_tokens_seen": 23316992, + "step": 40195 + }, + { + "epoch": 5.987488829311886, + "grad_norm": 0.715812087059021, + "learning_rate": 4.418657199122874e-05, + "loss": 0.0319, + "num_input_tokens_seen": 23319904, + "step": 40200 + }, + { + "epoch": 5.9882335418528445, + "grad_norm": 41.226593017578125, + "learning_rate": 4.418448864656564e-05, + "loss": 0.7093, + "num_input_tokens_seen": 23322720, + "step": 40205 + }, + { + "epoch": 5.988978254393804, + "grad_norm": 45.16706085205078, + "learning_rate": 4.418240497780009e-05, + "loss": 0.2856, + "num_input_tokens_seen": 23325632, + "step": 40210 + }, + { + "epoch": 5.989722966934763, + "grad_norm": 7.882386207580566, + "learning_rate": 4.4180320984967305e-05, + "loss": 0.1257, + "num_input_tokens_seen": 23328608, + "step": 40215 + }, + { + "epoch": 5.990467679475723, + "grad_norm": 28.427499771118164, + "learning_rate": 4.4178236668102504e-05, + "loss": 0.5667, + "num_input_tokens_seen": 23331648, + "step": 40220 + }, + { + "epoch": 5.991212392016681, + "grad_norm": 9.294988632202148, + "learning_rate": 4.417615202724087e-05, + "loss": 0.2104, + "num_input_tokens_seen": 23334560, + "step": 40225 + }, + { + "epoch": 5.991957104557641, + "grad_norm": 14.931066513061523, + "learning_rate": 4.4174067062417645e-05, + "loss": 0.1657, + "num_input_tokens_seen": 23337376, + "step": 40230 + }, + { + "epoch": 5.9927018170986, + "grad_norm": 24.482547760009766, + "learning_rate": 4.417198177366805e-05, + "loss": 0.5895, + "num_input_tokens_seen": 23340352, + "step": 40235 + }, + { + "epoch": 5.993446529639559, + "grad_norm": 72.51160430908203, + "learning_rate": 4.41698961610273e-05, + "loss": 0.3753, + "num_input_tokens_seen": 23343328, + "step": 40240 + }, + { + "epoch": 5.994191242180518, + "grad_norm": 29.35824966430664, + "learning_rate": 4.416781022453064e-05, + "loss": 0.1883, + "num_input_tokens_seen": 23345984, + "step": 40245 + }, + { + "epoch": 5.994935954721478, + "grad_norm": 9.895700454711914, + "learning_rate": 4.4165723964213314e-05, + "loss": 0.5237, + "num_input_tokens_seen": 23348896, + "step": 40250 + }, + { + "epoch": 5.9956806672624365, + "grad_norm": 0.8720644116401672, + "learning_rate": 4.4163637380110555e-05, + "loss": 0.1081, + "num_input_tokens_seen": 23352000, + "step": 40255 + }, + { + "epoch": 5.996425379803396, + "grad_norm": 28.431976318359375, + "learning_rate": 4.416155047225762e-05, + "loss": 0.3697, + "num_input_tokens_seen": 23354880, + "step": 40260 + }, + { + "epoch": 5.997170092344355, + "grad_norm": 24.292251586914062, + "learning_rate": 4.415946324068976e-05, + "loss": 0.5102, + "num_input_tokens_seen": 23357792, + "step": 40265 + }, + { + "epoch": 5.997914804885315, + "grad_norm": 4.196498394012451, + "learning_rate": 4.4157375685442246e-05, + "loss": 0.4116, + "num_input_tokens_seen": 23360736, + "step": 40270 + }, + { + "epoch": 5.998659517426273, + "grad_norm": 7.5796942710876465, + "learning_rate": 4.415528780655034e-05, + "loss": 0.1243, + "num_input_tokens_seen": 23363744, + "step": 40275 + }, + { + "epoch": 5.999404229967233, + "grad_norm": 20.28716278076172, + "learning_rate": 4.4153199604049315e-05, + "loss": 0.1427, + "num_input_tokens_seen": 23366464, + "step": 40280 + }, + { + "epoch": 6.0, + "eval_loss": 1.0435552597045898, + "eval_runtime": 51.2674, + "eval_samples_per_second": 58.205, + "eval_steps_per_second": 14.551, + "num_input_tokens_seen": 23368392, + "step": 40284 + }, + { + "epoch": 6.000148942508192, + "grad_norm": 6.9963812828063965, + "learning_rate": 4.415111107797445e-05, + "loss": 0.4045, + "num_input_tokens_seen": 23368904, + "step": 40285 + }, + { + "epoch": 6.000893655049151, + "grad_norm": 12.156858444213867, + "learning_rate": 4.414902222836103e-05, + "loss": 0.1161, + "num_input_tokens_seen": 23371816, + "step": 40290 + }, + { + "epoch": 6.00163836759011, + "grad_norm": 10.190969467163086, + "learning_rate": 4.414693305524434e-05, + "loss": 0.0301, + "num_input_tokens_seen": 23374728, + "step": 40295 + }, + { + "epoch": 6.00238308013107, + "grad_norm": 6.210923194885254, + "learning_rate": 4.4144843558659675e-05, + "loss": 0.1121, + "num_input_tokens_seen": 23377544, + "step": 40300 + }, + { + "epoch": 6.003127792672029, + "grad_norm": 0.1851644665002823, + "learning_rate": 4.414275373864234e-05, + "loss": 0.2017, + "num_input_tokens_seen": 23380488, + "step": 40305 + }, + { + "epoch": 6.003872505212988, + "grad_norm": 0.060081951320171356, + "learning_rate": 4.4140663595227624e-05, + "loss": 0.1519, + "num_input_tokens_seen": 23383176, + "step": 40310 + }, + { + "epoch": 6.004617217753947, + "grad_norm": 7.679230690002441, + "learning_rate": 4.413857312845086e-05, + "loss": 0.318, + "num_input_tokens_seen": 23386152, + "step": 40315 + }, + { + "epoch": 6.005361930294906, + "grad_norm": 16.207265853881836, + "learning_rate": 4.4136482338347356e-05, + "loss": 0.3017, + "num_input_tokens_seen": 23389448, + "step": 40320 + }, + { + "epoch": 6.006106642835865, + "grad_norm": 0.6925609707832336, + "learning_rate": 4.413439122495243e-05, + "loss": 0.0108, + "num_input_tokens_seen": 23392328, + "step": 40325 + }, + { + "epoch": 6.006851355376824, + "grad_norm": 9.215225219726562, + "learning_rate": 4.413229978830141e-05, + "loss": 0.0428, + "num_input_tokens_seen": 23395272, + "step": 40330 + }, + { + "epoch": 6.007596067917784, + "grad_norm": 0.06450334191322327, + "learning_rate": 4.413020802842963e-05, + "loss": 0.0128, + "num_input_tokens_seen": 23398184, + "step": 40335 + }, + { + "epoch": 6.0083407804587425, + "grad_norm": 14.198707580566406, + "learning_rate": 4.412811594537243e-05, + "loss": 0.267, + "num_input_tokens_seen": 23401160, + "step": 40340 + }, + { + "epoch": 6.009085492999702, + "grad_norm": 3.6379003524780273, + "learning_rate": 4.4126023539165155e-05, + "loss": 0.0509, + "num_input_tokens_seen": 23404232, + "step": 40345 + }, + { + "epoch": 6.009830205540661, + "grad_norm": 0.040171414613723755, + "learning_rate": 4.412393080984315e-05, + "loss": 0.3243, + "num_input_tokens_seen": 23407176, + "step": 40350 + }, + { + "epoch": 6.010574918081621, + "grad_norm": 0.020190149545669556, + "learning_rate": 4.412183775744177e-05, + "loss": 0.0603, + "num_input_tokens_seen": 23410024, + "step": 40355 + }, + { + "epoch": 6.011319630622579, + "grad_norm": 4.342523097991943, + "learning_rate": 4.411974438199637e-05, + "loss": 0.0081, + "num_input_tokens_seen": 23413384, + "step": 40360 + }, + { + "epoch": 6.012064343163539, + "grad_norm": 11.754827499389648, + "learning_rate": 4.411765068354233e-05, + "loss": 0.4202, + "num_input_tokens_seen": 23416360, + "step": 40365 + }, + { + "epoch": 6.012809055704498, + "grad_norm": 1.436729073524475, + "learning_rate": 4.4115556662115004e-05, + "loss": 0.116, + "num_input_tokens_seen": 23419304, + "step": 40370 + }, + { + "epoch": 6.013553768245457, + "grad_norm": 0.2460569143295288, + "learning_rate": 4.411346231774978e-05, + "loss": 0.0502, + "num_input_tokens_seen": 23422088, + "step": 40375 + }, + { + "epoch": 6.014298480786416, + "grad_norm": 1.724708080291748, + "learning_rate": 4.411136765048204e-05, + "loss": 0.0067, + "num_input_tokens_seen": 23424968, + "step": 40380 + }, + { + "epoch": 6.015043193327376, + "grad_norm": 14.445097923278809, + "learning_rate": 4.410927266034716e-05, + "loss": 0.4107, + "num_input_tokens_seen": 23428168, + "step": 40385 + }, + { + "epoch": 6.015787905868335, + "grad_norm": 0.04497925937175751, + "learning_rate": 4.4107177347380545e-05, + "loss": 0.1803, + "num_input_tokens_seen": 23431112, + "step": 40390 + }, + { + "epoch": 6.016532618409294, + "grad_norm": 0.3726789951324463, + "learning_rate": 4.4105081711617594e-05, + "loss": 0.1544, + "num_input_tokens_seen": 23433640, + "step": 40395 + }, + { + "epoch": 6.017277330950253, + "grad_norm": 0.07995807379484177, + "learning_rate": 4.410298575309369e-05, + "loss": 0.0042, + "num_input_tokens_seen": 23436552, + "step": 40400 + }, + { + "epoch": 6.018022043491213, + "grad_norm": 30.31894874572754, + "learning_rate": 4.4100889471844263e-05, + "loss": 0.5098, + "num_input_tokens_seen": 23439336, + "step": 40405 + }, + { + "epoch": 6.018766756032171, + "grad_norm": 0.13199050724506378, + "learning_rate": 4.4098792867904724e-05, + "loss": 0.1342, + "num_input_tokens_seen": 23442120, + "step": 40410 + }, + { + "epoch": 6.019511468573131, + "grad_norm": 11.481090545654297, + "learning_rate": 4.409669594131049e-05, + "loss": 0.3369, + "num_input_tokens_seen": 23444936, + "step": 40415 + }, + { + "epoch": 6.02025618111409, + "grad_norm": 0.006001392845064402, + "learning_rate": 4.409459869209699e-05, + "loss": 0.0671, + "num_input_tokens_seen": 23447720, + "step": 40420 + }, + { + "epoch": 6.021000893655049, + "grad_norm": 12.74742317199707, + "learning_rate": 4.409250112029965e-05, + "loss": 0.2717, + "num_input_tokens_seen": 23450568, + "step": 40425 + }, + { + "epoch": 6.021745606196008, + "grad_norm": 0.02406616322696209, + "learning_rate": 4.4090403225953915e-05, + "loss": 0.0751, + "num_input_tokens_seen": 23453320, + "step": 40430 + }, + { + "epoch": 6.022490318736968, + "grad_norm": 1.4802745580673218, + "learning_rate": 4.408830500909521e-05, + "loss": 0.0074, + "num_input_tokens_seen": 23456136, + "step": 40435 + }, + { + "epoch": 6.023235031277927, + "grad_norm": 0.30729737877845764, + "learning_rate": 4.408620646975899e-05, + "loss": 0.1372, + "num_input_tokens_seen": 23459272, + "step": 40440 + }, + { + "epoch": 6.023979743818886, + "grad_norm": 33.99259948730469, + "learning_rate": 4.408410760798072e-05, + "loss": 0.1924, + "num_input_tokens_seen": 23462056, + "step": 40445 + }, + { + "epoch": 6.024724456359845, + "grad_norm": 38.169883728027344, + "learning_rate": 4.408200842379584e-05, + "loss": 0.0364, + "num_input_tokens_seen": 23465224, + "step": 40450 + }, + { + "epoch": 6.025469168900805, + "grad_norm": 0.01677101105451584, + "learning_rate": 4.407990891723983e-05, + "loss": 0.3187, + "num_input_tokens_seen": 23467848, + "step": 40455 + }, + { + "epoch": 6.026213881441763, + "grad_norm": 1.4328856468200684, + "learning_rate": 4.407780908834814e-05, + "loss": 0.2343, + "num_input_tokens_seen": 23470472, + "step": 40460 + }, + { + "epoch": 6.026958593982723, + "grad_norm": 56.020938873291016, + "learning_rate": 4.407570893715627e-05, + "loss": 0.255, + "num_input_tokens_seen": 23472936, + "step": 40465 + }, + { + "epoch": 6.027703306523682, + "grad_norm": 0.12208814173936844, + "learning_rate": 4.4073608463699676e-05, + "loss": 0.012, + "num_input_tokens_seen": 23475816, + "step": 40470 + }, + { + "epoch": 6.0284480190646414, + "grad_norm": 30.837377548217773, + "learning_rate": 4.4071507668013854e-05, + "loss": 0.1305, + "num_input_tokens_seen": 23478632, + "step": 40475 + }, + { + "epoch": 6.0291927316056, + "grad_norm": 1.990241527557373, + "learning_rate": 4.406940655013429e-05, + "loss": 0.091, + "num_input_tokens_seen": 23481448, + "step": 40480 + }, + { + "epoch": 6.02993744414656, + "grad_norm": 0.01506676897406578, + "learning_rate": 4.406730511009649e-05, + "loss": 0.01, + "num_input_tokens_seen": 23484552, + "step": 40485 + }, + { + "epoch": 6.030682156687519, + "grad_norm": 0.0021300038788467646, + "learning_rate": 4.406520334793595e-05, + "loss": 0.1505, + "num_input_tokens_seen": 23487336, + "step": 40490 + }, + { + "epoch": 6.031426869228477, + "grad_norm": 6.94592809677124, + "learning_rate": 4.4063101263688164e-05, + "loss": 0.2365, + "num_input_tokens_seen": 23490088, + "step": 40495 + }, + { + "epoch": 6.032171581769437, + "grad_norm": 0.040899887681007385, + "learning_rate": 4.406099885738866e-05, + "loss": 0.0987, + "num_input_tokens_seen": 23493096, + "step": 40500 + }, + { + "epoch": 6.032916294310396, + "grad_norm": 13.732873916625977, + "learning_rate": 4.405889612907296e-05, + "loss": 0.0631, + "num_input_tokens_seen": 23496392, + "step": 40505 + }, + { + "epoch": 6.033661006851355, + "grad_norm": 5.159622669219971, + "learning_rate": 4.405679307877658e-05, + "loss": 0.171, + "num_input_tokens_seen": 23499432, + "step": 40510 + }, + { + "epoch": 6.034405719392314, + "grad_norm": 0.173588365316391, + "learning_rate": 4.4054689706535044e-05, + "loss": 0.0393, + "num_input_tokens_seen": 23502184, + "step": 40515 + }, + { + "epoch": 6.035150431933274, + "grad_norm": 1.1346383094787598, + "learning_rate": 4.40525860123839e-05, + "loss": 0.3481, + "num_input_tokens_seen": 23504840, + "step": 40520 + }, + { + "epoch": 6.035895144474233, + "grad_norm": 64.66565704345703, + "learning_rate": 4.405048199635868e-05, + "loss": 0.0921, + "num_input_tokens_seen": 23507528, + "step": 40525 + }, + { + "epoch": 6.036639857015192, + "grad_norm": 0.002811715705320239, + "learning_rate": 4.404837765849492e-05, + "loss": 0.1019, + "num_input_tokens_seen": 23510184, + "step": 40530 + }, + { + "epoch": 6.037384569556151, + "grad_norm": 0.868832528591156, + "learning_rate": 4.4046272998828186e-05, + "loss": 0.1417, + "num_input_tokens_seen": 23512744, + "step": 40535 + }, + { + "epoch": 6.038129282097111, + "grad_norm": 4.252509117126465, + "learning_rate": 4.4044168017394025e-05, + "loss": 0.0261, + "num_input_tokens_seen": 23515400, + "step": 40540 + }, + { + "epoch": 6.038873994638069, + "grad_norm": 0.34406211972236633, + "learning_rate": 4.4042062714228e-05, + "loss": 0.0089, + "num_input_tokens_seen": 23518440, + "step": 40545 + }, + { + "epoch": 6.039618707179029, + "grad_norm": 0.025747543200850487, + "learning_rate": 4.403995708936568e-05, + "loss": 0.1329, + "num_input_tokens_seen": 23521320, + "step": 40550 + }, + { + "epoch": 6.040363419719988, + "grad_norm": 0.03182005137205124, + "learning_rate": 4.403785114284263e-05, + "loss": 0.1859, + "num_input_tokens_seen": 23524200, + "step": 40555 + }, + { + "epoch": 6.0411081322609474, + "grad_norm": 0.007018308620899916, + "learning_rate": 4.4035744874694444e-05, + "loss": 0.1058, + "num_input_tokens_seen": 23526888, + "step": 40560 + }, + { + "epoch": 6.041852844801906, + "grad_norm": 0.04592566564679146, + "learning_rate": 4.403363828495669e-05, + "loss": 0.1094, + "num_input_tokens_seen": 23529800, + "step": 40565 + }, + { + "epoch": 6.042597557342866, + "grad_norm": 5.126532077789307, + "learning_rate": 4.403153137366497e-05, + "loss": 0.1914, + "num_input_tokens_seen": 23532584, + "step": 40570 + }, + { + "epoch": 6.043342269883825, + "grad_norm": 0.11071977764368057, + "learning_rate": 4.402942414085486e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23535368, + "step": 40575 + }, + { + "epoch": 6.044086982424784, + "grad_norm": 0.11942103505134583, + "learning_rate": 4.4027316586561976e-05, + "loss": 0.1915, + "num_input_tokens_seen": 23538344, + "step": 40580 + }, + { + "epoch": 6.044831694965743, + "grad_norm": 2.1455090045928955, + "learning_rate": 4.402520871082191e-05, + "loss": 0.3191, + "num_input_tokens_seen": 23541352, + "step": 40585 + }, + { + "epoch": 6.045576407506703, + "grad_norm": 7.63059139251709, + "learning_rate": 4.402310051367029e-05, + "loss": 0.1626, + "num_input_tokens_seen": 23544392, + "step": 40590 + }, + { + "epoch": 6.046321120047661, + "grad_norm": 0.05066308006644249, + "learning_rate": 4.4020991995142716e-05, + "loss": 0.1827, + "num_input_tokens_seen": 23547336, + "step": 40595 + }, + { + "epoch": 6.047065832588621, + "grad_norm": 25.991491317749023, + "learning_rate": 4.401888315527481e-05, + "loss": 0.2344, + "num_input_tokens_seen": 23549896, + "step": 40600 + }, + { + "epoch": 6.04781054512958, + "grad_norm": 1.2302764654159546, + "learning_rate": 4.40167739941022e-05, + "loss": 0.1693, + "num_input_tokens_seen": 23552680, + "step": 40605 + }, + { + "epoch": 6.0485552576705395, + "grad_norm": 0.21424971520900726, + "learning_rate": 4.401466451166053e-05, + "loss": 0.0302, + "num_input_tokens_seen": 23555368, + "step": 40610 + }, + { + "epoch": 6.049299970211498, + "grad_norm": 0.009517617523670197, + "learning_rate": 4.401255470798543e-05, + "loss": 0.2637, + "num_input_tokens_seen": 23558344, + "step": 40615 + }, + { + "epoch": 6.050044682752458, + "grad_norm": 109.0364990234375, + "learning_rate": 4.401044458311254e-05, + "loss": 0.3437, + "num_input_tokens_seen": 23561640, + "step": 40620 + }, + { + "epoch": 6.050789395293417, + "grad_norm": 151.56271362304688, + "learning_rate": 4.40083341370775e-05, + "loss": 0.1678, + "num_input_tokens_seen": 23564456, + "step": 40625 + }, + { + "epoch": 6.051534107834376, + "grad_norm": 0.005943663883954287, + "learning_rate": 4.400622336991599e-05, + "loss": 0.1409, + "num_input_tokens_seen": 23567112, + "step": 40630 + }, + { + "epoch": 6.052278820375335, + "grad_norm": 7.530261516571045, + "learning_rate": 4.400411228166364e-05, + "loss": 0.2005, + "num_input_tokens_seen": 23570312, + "step": 40635 + }, + { + "epoch": 6.053023532916295, + "grad_norm": 0.01114603877067566, + "learning_rate": 4.400200087235613e-05, + "loss": 0.1758, + "num_input_tokens_seen": 23573128, + "step": 40640 + }, + { + "epoch": 6.0537682454572534, + "grad_norm": 4.671849727630615, + "learning_rate": 4.399988914202913e-05, + "loss": 0.0029, + "num_input_tokens_seen": 23575816, + "step": 40645 + }, + { + "epoch": 6.054512957998213, + "grad_norm": 72.42615509033203, + "learning_rate": 4.399777709071832e-05, + "loss": 0.2308, + "num_input_tokens_seen": 23578792, + "step": 40650 + }, + { + "epoch": 6.055257670539172, + "grad_norm": 77.65678405761719, + "learning_rate": 4.399566471845937e-05, + "loss": 0.1972, + "num_input_tokens_seen": 23581480, + "step": 40655 + }, + { + "epoch": 6.0560023830801315, + "grad_norm": 4.021240711212158, + "learning_rate": 4.3993552025287966e-05, + "loss": 0.0025, + "num_input_tokens_seen": 23584520, + "step": 40660 + }, + { + "epoch": 6.05674709562109, + "grad_norm": 0.016395537182688713, + "learning_rate": 4.399143901123981e-05, + "loss": 0.3405, + "num_input_tokens_seen": 23587464, + "step": 40665 + }, + { + "epoch": 6.057491808162049, + "grad_norm": 0.009114238433539867, + "learning_rate": 4.398932567635059e-05, + "loss": 0.8108, + "num_input_tokens_seen": 23590440, + "step": 40670 + }, + { + "epoch": 6.058236520703009, + "grad_norm": 25.728574752807617, + "learning_rate": 4.398721202065602e-05, + "loss": 0.3408, + "num_input_tokens_seen": 23593032, + "step": 40675 + }, + { + "epoch": 6.058981233243967, + "grad_norm": 4.0742106437683105, + "learning_rate": 4.398509804419179e-05, + "loss": 0.1425, + "num_input_tokens_seen": 23595752, + "step": 40680 + }, + { + "epoch": 6.059725945784927, + "grad_norm": 77.02873992919922, + "learning_rate": 4.3982983746993636e-05, + "loss": 0.1097, + "num_input_tokens_seen": 23598440, + "step": 40685 + }, + { + "epoch": 6.060470658325886, + "grad_norm": 1.1371636390686035, + "learning_rate": 4.398086912909726e-05, + "loss": 0.1523, + "num_input_tokens_seen": 23601224, + "step": 40690 + }, + { + "epoch": 6.0612153708668455, + "grad_norm": 54.53486633300781, + "learning_rate": 4.397875419053838e-05, + "loss": 0.2229, + "num_input_tokens_seen": 23604264, + "step": 40695 + }, + { + "epoch": 6.061960083407804, + "grad_norm": 1.4761344194412231, + "learning_rate": 4.397663893135275e-05, + "loss": 0.2128, + "num_input_tokens_seen": 23607144, + "step": 40700 + }, + { + "epoch": 6.062704795948764, + "grad_norm": 22.49771499633789, + "learning_rate": 4.397452335157609e-05, + "loss": 0.261, + "num_input_tokens_seen": 23610024, + "step": 40705 + }, + { + "epoch": 6.063449508489723, + "grad_norm": 0.020110027864575386, + "learning_rate": 4.397240745124414e-05, + "loss": 0.2813, + "num_input_tokens_seen": 23612840, + "step": 40710 + }, + { + "epoch": 6.064194221030682, + "grad_norm": 28.18691062927246, + "learning_rate": 4.397029123039266e-05, + "loss": 0.2512, + "num_input_tokens_seen": 23615656, + "step": 40715 + }, + { + "epoch": 6.064938933571641, + "grad_norm": 0.835342526435852, + "learning_rate": 4.396817468905738e-05, + "loss": 0.0032, + "num_input_tokens_seen": 23618536, + "step": 40720 + }, + { + "epoch": 6.065683646112601, + "grad_norm": 0.011872738599777222, + "learning_rate": 4.396605782727406e-05, + "loss": 0.1295, + "num_input_tokens_seen": 23621512, + "step": 40725 + }, + { + "epoch": 6.0664283586535594, + "grad_norm": 0.3440881073474884, + "learning_rate": 4.3963940645078484e-05, + "loss": 0.06, + "num_input_tokens_seen": 23624136, + "step": 40730 + }, + { + "epoch": 6.067173071194519, + "grad_norm": 12.384922981262207, + "learning_rate": 4.3961823142506395e-05, + "loss": 0.1853, + "num_input_tokens_seen": 23627624, + "step": 40735 + }, + { + "epoch": 6.067917783735478, + "grad_norm": 50.66135025024414, + "learning_rate": 4.395970531959358e-05, + "loss": 0.0799, + "num_input_tokens_seen": 23630888, + "step": 40740 + }, + { + "epoch": 6.0686624962764375, + "grad_norm": 74.35688018798828, + "learning_rate": 4.395758717637581e-05, + "loss": 0.058, + "num_input_tokens_seen": 23633992, + "step": 40745 + }, + { + "epoch": 6.069407208817396, + "grad_norm": 0.24427004158496857, + "learning_rate": 4.3955468712888884e-05, + "loss": 0.088, + "num_input_tokens_seen": 23636872, + "step": 40750 + }, + { + "epoch": 6.070151921358356, + "grad_norm": 0.19383424520492554, + "learning_rate": 4.395334992916857e-05, + "loss": 0.163, + "num_input_tokens_seen": 23639656, + "step": 40755 + }, + { + "epoch": 6.070896633899315, + "grad_norm": 75.76861572265625, + "learning_rate": 4.395123082525067e-05, + "loss": 0.3598, + "num_input_tokens_seen": 23642696, + "step": 40760 + }, + { + "epoch": 6.071641346440274, + "grad_norm": 0.19357958436012268, + "learning_rate": 4.394911140117099e-05, + "loss": 0.0231, + "num_input_tokens_seen": 23645480, + "step": 40765 + }, + { + "epoch": 6.072386058981233, + "grad_norm": 0.02410905249416828, + "learning_rate": 4.3946991656965334e-05, + "loss": 0.2345, + "num_input_tokens_seen": 23648456, + "step": 40770 + }, + { + "epoch": 6.073130771522193, + "grad_norm": 2.403144598007202, + "learning_rate": 4.394487159266951e-05, + "loss": 0.0027, + "num_input_tokens_seen": 23651208, + "step": 40775 + }, + { + "epoch": 6.0738754840631515, + "grad_norm": 17.01058006286621, + "learning_rate": 4.394275120831933e-05, + "loss": 0.2999, + "num_input_tokens_seen": 23654344, + "step": 40780 + }, + { + "epoch": 6.074620196604111, + "grad_norm": 23.7237606048584, + "learning_rate": 4.394063050395063e-05, + "loss": 0.3227, + "num_input_tokens_seen": 23657096, + "step": 40785 + }, + { + "epoch": 6.07536490914507, + "grad_norm": 0.40165701508522034, + "learning_rate": 4.393850947959922e-05, + "loss": 0.0365, + "num_input_tokens_seen": 23659976, + "step": 40790 + }, + { + "epoch": 6.0761096216860295, + "grad_norm": 33.4273796081543, + "learning_rate": 4.3936388135300946e-05, + "loss": 0.068, + "num_input_tokens_seen": 23663016, + "step": 40795 + }, + { + "epoch": 6.076854334226988, + "grad_norm": 0.030454356223344803, + "learning_rate": 4.3934266471091635e-05, + "loss": 0.0307, + "num_input_tokens_seen": 23666184, + "step": 40800 + }, + { + "epoch": 6.077599046767948, + "grad_norm": 1.0867048501968384, + "learning_rate": 4.393214448700713e-05, + "loss": 0.2796, + "num_input_tokens_seen": 23669032, + "step": 40805 + }, + { + "epoch": 6.078343759308907, + "grad_norm": 65.54883575439453, + "learning_rate": 4.39300221830833e-05, + "loss": 0.0481, + "num_input_tokens_seen": 23671976, + "step": 40810 + }, + { + "epoch": 6.079088471849866, + "grad_norm": 0.3687579035758972, + "learning_rate": 4.392789955935598e-05, + "loss": 0.0014, + "num_input_tokens_seen": 23674952, + "step": 40815 + }, + { + "epoch": 6.079833184390825, + "grad_norm": 0.04714901000261307, + "learning_rate": 4.3925776615861034e-05, + "loss": 0.0637, + "num_input_tokens_seen": 23677928, + "step": 40820 + }, + { + "epoch": 6.080577896931785, + "grad_norm": 34.95066452026367, + "learning_rate": 4.392365335263432e-05, + "loss": 0.7645, + "num_input_tokens_seen": 23680808, + "step": 40825 + }, + { + "epoch": 6.0813226094727435, + "grad_norm": 14.802106857299805, + "learning_rate": 4.392152976971173e-05, + "loss": 0.3648, + "num_input_tokens_seen": 23683688, + "step": 40830 + }, + { + "epoch": 6.082067322013703, + "grad_norm": 0.6739334464073181, + "learning_rate": 4.3919405867129114e-05, + "loss": 0.1358, + "num_input_tokens_seen": 23686376, + "step": 40835 + }, + { + "epoch": 6.082812034554662, + "grad_norm": 20.2442569732666, + "learning_rate": 4.391728164492237e-05, + "loss": 0.1862, + "num_input_tokens_seen": 23689416, + "step": 40840 + }, + { + "epoch": 6.083556747095621, + "grad_norm": 0.03779458627104759, + "learning_rate": 4.391515710312738e-05, + "loss": 0.1711, + "num_input_tokens_seen": 23692296, + "step": 40845 + }, + { + "epoch": 6.08430145963658, + "grad_norm": 27.375612258911133, + "learning_rate": 4.391303224178003e-05, + "loss": 0.148, + "num_input_tokens_seen": 23695080, + "step": 40850 + }, + { + "epoch": 6.085046172177539, + "grad_norm": 154.91030883789062, + "learning_rate": 4.391090706091623e-05, + "loss": 0.2323, + "num_input_tokens_seen": 23697992, + "step": 40855 + }, + { + "epoch": 6.085790884718499, + "grad_norm": 0.02857438288629055, + "learning_rate": 4.390878156057186e-05, + "loss": 0.0626, + "num_input_tokens_seen": 23700616, + "step": 40860 + }, + { + "epoch": 6.0865355972594575, + "grad_norm": 0.05308985710144043, + "learning_rate": 4.390665574078286e-05, + "loss": 0.2531, + "num_input_tokens_seen": 23703464, + "step": 40865 + }, + { + "epoch": 6.087280309800417, + "grad_norm": 0.18890437483787537, + "learning_rate": 4.390452960158512e-05, + "loss": 0.2425, + "num_input_tokens_seen": 23706792, + "step": 40870 + }, + { + "epoch": 6.088025022341376, + "grad_norm": 41.65673828125, + "learning_rate": 4.390240314301457e-05, + "loss": 0.3393, + "num_input_tokens_seen": 23709896, + "step": 40875 + }, + { + "epoch": 6.0887697348823355, + "grad_norm": 13.675878524780273, + "learning_rate": 4.3900276365107126e-05, + "loss": 0.1583, + "num_input_tokens_seen": 23712872, + "step": 40880 + }, + { + "epoch": 6.089514447423294, + "grad_norm": 10.156351089477539, + "learning_rate": 4.3898149267898727e-05, + "loss": 0.3162, + "num_input_tokens_seen": 23715784, + "step": 40885 + }, + { + "epoch": 6.090259159964254, + "grad_norm": 21.707996368408203, + "learning_rate": 4.3896021851425306e-05, + "loss": 0.1856, + "num_input_tokens_seen": 23718792, + "step": 40890 + }, + { + "epoch": 6.091003872505213, + "grad_norm": 0.11659275740385056, + "learning_rate": 4.389389411572279e-05, + "loss": 0.0468, + "num_input_tokens_seen": 23721416, + "step": 40895 + }, + { + "epoch": 6.091748585046172, + "grad_norm": 0.12388381361961365, + "learning_rate": 4.389176606082714e-05, + "loss": 0.0178, + "num_input_tokens_seen": 23725416, + "step": 40900 + }, + { + "epoch": 6.092493297587131, + "grad_norm": 120.75321960449219, + "learning_rate": 4.388963768677431e-05, + "loss": 0.4772, + "num_input_tokens_seen": 23728232, + "step": 40905 + }, + { + "epoch": 6.093238010128091, + "grad_norm": 1.1373399496078491, + "learning_rate": 4.388750899360025e-05, + "loss": 0.358, + "num_input_tokens_seen": 23731272, + "step": 40910 + }, + { + "epoch": 6.0939827226690495, + "grad_norm": 0.08749667555093765, + "learning_rate": 4.3885379981340905e-05, + "loss": 0.0068, + "num_input_tokens_seen": 23734248, + "step": 40915 + }, + { + "epoch": 6.094727435210009, + "grad_norm": 21.562223434448242, + "learning_rate": 4.388325065003228e-05, + "loss": 0.4368, + "num_input_tokens_seen": 23737448, + "step": 40920 + }, + { + "epoch": 6.095472147750968, + "grad_norm": 0.050211913883686066, + "learning_rate": 4.3881120999710315e-05, + "loss": 0.1032, + "num_input_tokens_seen": 23740456, + "step": 40925 + }, + { + "epoch": 6.0962168602919276, + "grad_norm": 0.036344680935144424, + "learning_rate": 4.3878991030411e-05, + "loss": 0.216, + "num_input_tokens_seen": 23743208, + "step": 40930 + }, + { + "epoch": 6.096961572832886, + "grad_norm": 12.734701156616211, + "learning_rate": 4.387686074217032e-05, + "loss": 0.0644, + "num_input_tokens_seen": 23745864, + "step": 40935 + }, + { + "epoch": 6.097706285373846, + "grad_norm": 0.2961043119430542, + "learning_rate": 4.387473013502427e-05, + "loss": 0.1316, + "num_input_tokens_seen": 23748808, + "step": 40940 + }, + { + "epoch": 6.098450997914805, + "grad_norm": 35.74739074707031, + "learning_rate": 4.387259920900884e-05, + "loss": 0.5205, + "num_input_tokens_seen": 23751592, + "step": 40945 + }, + { + "epoch": 6.099195710455764, + "grad_norm": 33.36870574951172, + "learning_rate": 4.3870467964160015e-05, + "loss": 0.213, + "num_input_tokens_seen": 23754440, + "step": 40950 + }, + { + "epoch": 6.099940422996723, + "grad_norm": 0.3168962001800537, + "learning_rate": 4.3868336400513823e-05, + "loss": 0.1014, + "num_input_tokens_seen": 23757480, + "step": 40955 + }, + { + "epoch": 6.100685135537683, + "grad_norm": 51.30059051513672, + "learning_rate": 4.386620451810626e-05, + "loss": 0.4266, + "num_input_tokens_seen": 23760392, + "step": 40960 + }, + { + "epoch": 6.1014298480786415, + "grad_norm": 24.929237365722656, + "learning_rate": 4.3864072316973345e-05, + "loss": 0.2939, + "num_input_tokens_seen": 23763496, + "step": 40965 + }, + { + "epoch": 6.102174560619601, + "grad_norm": 86.93768310546875, + "learning_rate": 4.386193979715111e-05, + "loss": 0.1698, + "num_input_tokens_seen": 23766088, + "step": 40970 + }, + { + "epoch": 6.10291927316056, + "grad_norm": 7.10188627243042, + "learning_rate": 4.385980695867556e-05, + "loss": 0.0424, + "num_input_tokens_seen": 23769064, + "step": 40975 + }, + { + "epoch": 6.10366398570152, + "grad_norm": 11.50451946258545, + "learning_rate": 4.385767380158275e-05, + "loss": 0.3484, + "num_input_tokens_seen": 23771848, + "step": 40980 + }, + { + "epoch": 6.104408698242478, + "grad_norm": 140.32424926757812, + "learning_rate": 4.38555403259087e-05, + "loss": 0.1948, + "num_input_tokens_seen": 23774952, + "step": 40985 + }, + { + "epoch": 6.105153410783438, + "grad_norm": 12.0862455368042, + "learning_rate": 4.3853406531689465e-05, + "loss": 0.1391, + "num_input_tokens_seen": 23777640, + "step": 40990 + }, + { + "epoch": 6.105898123324397, + "grad_norm": 4.969802379608154, + "learning_rate": 4.3851272418961085e-05, + "loss": 0.1597, + "num_input_tokens_seen": 23780296, + "step": 40995 + }, + { + "epoch": 6.106642835865356, + "grad_norm": 0.04849926754832268, + "learning_rate": 4.384913798775962e-05, + "loss": 0.1999, + "num_input_tokens_seen": 23783176, + "step": 41000 + }, + { + "epoch": 6.107387548406315, + "grad_norm": 0.23380924761295319, + "learning_rate": 4.384700323812112e-05, + "loss": 0.0189, + "num_input_tokens_seen": 23785672, + "step": 41005 + }, + { + "epoch": 6.108132260947274, + "grad_norm": 1.8756020069122314, + "learning_rate": 4.3844868170081665e-05, + "loss": 0.3248, + "num_input_tokens_seen": 23788552, + "step": 41010 + }, + { + "epoch": 6.1088769734882336, + "grad_norm": 21.13489532470703, + "learning_rate": 4.384273278367731e-05, + "loss": 0.2536, + "num_input_tokens_seen": 23791400, + "step": 41015 + }, + { + "epoch": 6.109621686029192, + "grad_norm": 17.813831329345703, + "learning_rate": 4.3840597078944135e-05, + "loss": 0.1661, + "num_input_tokens_seen": 23794088, + "step": 41020 + }, + { + "epoch": 6.110366398570152, + "grad_norm": 0.06339464336633682, + "learning_rate": 4.3838461055918226e-05, + "loss": 0.0797, + "num_input_tokens_seen": 23796808, + "step": 41025 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 20.685359954833984, + "learning_rate": 4.383632471463566e-05, + "loss": 0.077, + "num_input_tokens_seen": 23800168, + "step": 41030 + }, + { + "epoch": 6.11185582365207, + "grad_norm": 0.3546464741230011, + "learning_rate": 4.383418805513253e-05, + "loss": 0.3841, + "num_input_tokens_seen": 23803080, + "step": 41035 + }, + { + "epoch": 6.112600536193029, + "grad_norm": 5.57728385925293, + "learning_rate": 4.3832051077444937e-05, + "loss": 0.3126, + "num_input_tokens_seen": 23805960, + "step": 41040 + }, + { + "epoch": 6.113345248733989, + "grad_norm": 18.118946075439453, + "learning_rate": 4.382991378160898e-05, + "loss": 0.2195, + "num_input_tokens_seen": 23808904, + "step": 41045 + }, + { + "epoch": 6.1140899612749475, + "grad_norm": 0.8654698133468628, + "learning_rate": 4.3827776167660775e-05, + "loss": 0.0122, + "num_input_tokens_seen": 23811656, + "step": 41050 + }, + { + "epoch": 6.114834673815907, + "grad_norm": 3.8937036991119385, + "learning_rate": 4.382563823563642e-05, + "loss": 0.005, + "num_input_tokens_seen": 23814632, + "step": 41055 + }, + { + "epoch": 6.115579386356866, + "grad_norm": 37.813560485839844, + "learning_rate": 4.382349998557204e-05, + "loss": 0.1379, + "num_input_tokens_seen": 23817672, + "step": 41060 + }, + { + "epoch": 6.116324098897826, + "grad_norm": 0.751005232334137, + "learning_rate": 4.382136141750376e-05, + "loss": 0.0036, + "num_input_tokens_seen": 23820680, + "step": 41065 + }, + { + "epoch": 6.117068811438784, + "grad_norm": 0.4566943347454071, + "learning_rate": 4.381922253146771e-05, + "loss": 0.0677, + "num_input_tokens_seen": 23823816, + "step": 41070 + }, + { + "epoch": 6.117813523979744, + "grad_norm": 0.03090193308889866, + "learning_rate": 4.381708332750002e-05, + "loss": 0.1246, + "num_input_tokens_seen": 23826792, + "step": 41075 + }, + { + "epoch": 6.118558236520703, + "grad_norm": 0.1259007453918457, + "learning_rate": 4.381494380563683e-05, + "loss": 0.1807, + "num_input_tokens_seen": 23829352, + "step": 41080 + }, + { + "epoch": 6.119302949061662, + "grad_norm": 76.34284973144531, + "learning_rate": 4.3812803965914296e-05, + "loss": 0.4695, + "num_input_tokens_seen": 23832200, + "step": 41085 + }, + { + "epoch": 6.120047661602621, + "grad_norm": 17.042552947998047, + "learning_rate": 4.381066380836855e-05, + "loss": 0.0735, + "num_input_tokens_seen": 23834824, + "step": 41090 + }, + { + "epoch": 6.120792374143581, + "grad_norm": 9.206459045410156, + "learning_rate": 4.380852333303576e-05, + "loss": 0.2805, + "num_input_tokens_seen": 23837640, + "step": 41095 + }, + { + "epoch": 6.1215370866845396, + "grad_norm": 0.04645337164402008, + "learning_rate": 4.380638253995209e-05, + "loss": 0.2242, + "num_input_tokens_seen": 23840616, + "step": 41100 + }, + { + "epoch": 6.122281799225499, + "grad_norm": 18.75489044189453, + "learning_rate": 4.380424142915369e-05, + "loss": 0.1482, + "num_input_tokens_seen": 23843560, + "step": 41105 + }, + { + "epoch": 6.123026511766458, + "grad_norm": 13.307377815246582, + "learning_rate": 4.380210000067675e-05, + "loss": 0.0158, + "num_input_tokens_seen": 23846792, + "step": 41110 + }, + { + "epoch": 6.123771224307418, + "grad_norm": 0.01261444017291069, + "learning_rate": 4.379995825455744e-05, + "loss": 0.1129, + "num_input_tokens_seen": 23849576, + "step": 41115 + }, + { + "epoch": 6.124515936848376, + "grad_norm": 0.07432108372449875, + "learning_rate": 4.379781619083195e-05, + "loss": 0.0101, + "num_input_tokens_seen": 23852520, + "step": 41120 + }, + { + "epoch": 6.125260649389336, + "grad_norm": 31.309816360473633, + "learning_rate": 4.379567380953645e-05, + "loss": 0.181, + "num_input_tokens_seen": 23855464, + "step": 41125 + }, + { + "epoch": 6.126005361930295, + "grad_norm": 1.9235327243804932, + "learning_rate": 4.3793531110707143e-05, + "loss": 0.0871, + "num_input_tokens_seen": 23858408, + "step": 41130 + }, + { + "epoch": 6.126750074471254, + "grad_norm": 17.906227111816406, + "learning_rate": 4.3791388094380236e-05, + "loss": 0.1285, + "num_input_tokens_seen": 23861864, + "step": 41135 + }, + { + "epoch": 6.127494787012213, + "grad_norm": 0.1032840833067894, + "learning_rate": 4.378924476059192e-05, + "loss": 0.2838, + "num_input_tokens_seen": 23864616, + "step": 41140 + }, + { + "epoch": 6.128239499553173, + "grad_norm": 2.88053560256958, + "learning_rate": 4.378710110937842e-05, + "loss": 0.1796, + "num_input_tokens_seen": 23867688, + "step": 41145 + }, + { + "epoch": 6.128984212094132, + "grad_norm": 22.517480850219727, + "learning_rate": 4.378495714077593e-05, + "loss": 0.7478, + "num_input_tokens_seen": 23870600, + "step": 41150 + }, + { + "epoch": 6.129728924635091, + "grad_norm": 3.40334415435791, + "learning_rate": 4.3782812854820687e-05, + "loss": 0.0049, + "num_input_tokens_seen": 23873416, + "step": 41155 + }, + { + "epoch": 6.13047363717605, + "grad_norm": 13.69579792022705, + "learning_rate": 4.378066825154891e-05, + "loss": 0.2394, + "num_input_tokens_seen": 23876456, + "step": 41160 + }, + { + "epoch": 6.13121834971701, + "grad_norm": 38.94145584106445, + "learning_rate": 4.3778523330996824e-05, + "loss": 0.4553, + "num_input_tokens_seen": 23879176, + "step": 41165 + }, + { + "epoch": 6.131963062257968, + "grad_norm": 0.05297612398862839, + "learning_rate": 4.377637809320068e-05, + "loss": 0.3903, + "num_input_tokens_seen": 23882280, + "step": 41170 + }, + { + "epoch": 6.132707774798928, + "grad_norm": 0.11837146431207657, + "learning_rate": 4.377423253819671e-05, + "loss": 0.0049, + "num_input_tokens_seen": 23884968, + "step": 41175 + }, + { + "epoch": 6.133452487339887, + "grad_norm": 0.014801052398979664, + "learning_rate": 4.377208666602116e-05, + "loss": 0.1925, + "num_input_tokens_seen": 23887880, + "step": 41180 + }, + { + "epoch": 6.134197199880846, + "grad_norm": 2.1462669372558594, + "learning_rate": 4.3769940476710284e-05, + "loss": 0.061, + "num_input_tokens_seen": 23890600, + "step": 41185 + }, + { + "epoch": 6.134941912421805, + "grad_norm": 23.89448356628418, + "learning_rate": 4.376779397030034e-05, + "loss": 0.0324, + "num_input_tokens_seen": 23893608, + "step": 41190 + }, + { + "epoch": 6.135686624962764, + "grad_norm": 0.05905583128333092, + "learning_rate": 4.376564714682761e-05, + "loss": 0.6157, + "num_input_tokens_seen": 23896616, + "step": 41195 + }, + { + "epoch": 6.136431337503724, + "grad_norm": 29.325702667236328, + "learning_rate": 4.376350000632832e-05, + "loss": 0.3881, + "num_input_tokens_seen": 23899848, + "step": 41200 + }, + { + "epoch": 6.137176050044682, + "grad_norm": 30.275806427001953, + "learning_rate": 4.376135254883877e-05, + "loss": 0.2164, + "num_input_tokens_seen": 23902760, + "step": 41205 + }, + { + "epoch": 6.137920762585642, + "grad_norm": 27.441558837890625, + "learning_rate": 4.375920477439525e-05, + "loss": 0.3236, + "num_input_tokens_seen": 23905256, + "step": 41210 + }, + { + "epoch": 6.138665475126601, + "grad_norm": 0.06318824738264084, + "learning_rate": 4.375705668303403e-05, + "loss": 0.1876, + "num_input_tokens_seen": 23907976, + "step": 41215 + }, + { + "epoch": 6.13941018766756, + "grad_norm": 32.47362518310547, + "learning_rate": 4.37549082747914e-05, + "loss": 0.1133, + "num_input_tokens_seen": 23910856, + "step": 41220 + }, + { + "epoch": 6.140154900208519, + "grad_norm": 0.9742967486381531, + "learning_rate": 4.375275954970364e-05, + "loss": 0.1756, + "num_input_tokens_seen": 23913544, + "step": 41225 + }, + { + "epoch": 6.140899612749479, + "grad_norm": 0.23880252242088318, + "learning_rate": 4.3750610507807075e-05, + "loss": 0.0599, + "num_input_tokens_seen": 23916456, + "step": 41230 + }, + { + "epoch": 6.141644325290438, + "grad_norm": 47.63466262817383, + "learning_rate": 4.3748461149138016e-05, + "loss": 0.1189, + "num_input_tokens_seen": 23919208, + "step": 41235 + }, + { + "epoch": 6.142389037831397, + "grad_norm": 0.35079142451286316, + "learning_rate": 4.374631147373275e-05, + "loss": 0.0791, + "num_input_tokens_seen": 23922152, + "step": 41240 + }, + { + "epoch": 6.143133750372356, + "grad_norm": 4.9898600578308105, + "learning_rate": 4.374416148162761e-05, + "loss": 0.3182, + "num_input_tokens_seen": 23925256, + "step": 41245 + }, + { + "epoch": 6.143878462913316, + "grad_norm": 8.050034523010254, + "learning_rate": 4.374201117285891e-05, + "loss": 0.2362, + "num_input_tokens_seen": 23928072, + "step": 41250 + }, + { + "epoch": 6.144623175454274, + "grad_norm": 1.7215555906295776, + "learning_rate": 4.3739860547462976e-05, + "loss": 0.0741, + "num_input_tokens_seen": 23930952, + "step": 41255 + }, + { + "epoch": 6.145367887995234, + "grad_norm": 56.95254898071289, + "learning_rate": 4.373770960547614e-05, + "loss": 0.1308, + "num_input_tokens_seen": 23933992, + "step": 41260 + }, + { + "epoch": 6.146112600536193, + "grad_norm": 21.559112548828125, + "learning_rate": 4.3735558346934755e-05, + "loss": 0.319, + "num_input_tokens_seen": 23936968, + "step": 41265 + }, + { + "epoch": 6.146857313077152, + "grad_norm": 22.053638458251953, + "learning_rate": 4.373340677187515e-05, + "loss": 0.1791, + "num_input_tokens_seen": 23940008, + "step": 41270 + }, + { + "epoch": 6.147602025618111, + "grad_norm": 16.143348693847656, + "learning_rate": 4.373125488033368e-05, + "loss": 0.2897, + "num_input_tokens_seen": 23943336, + "step": 41275 + }, + { + "epoch": 6.148346738159071, + "grad_norm": 0.02880394272506237, + "learning_rate": 4.372910267234669e-05, + "loss": 0.3351, + "num_input_tokens_seen": 23946216, + "step": 41280 + }, + { + "epoch": 6.14909145070003, + "grad_norm": 20.112279891967773, + "learning_rate": 4.3726950147950554e-05, + "loss": 0.2434, + "num_input_tokens_seen": 23949320, + "step": 41285 + }, + { + "epoch": 6.149836163240989, + "grad_norm": 25.08089256286621, + "learning_rate": 4.372479730718162e-05, + "loss": 0.3787, + "num_input_tokens_seen": 23952136, + "step": 41290 + }, + { + "epoch": 6.150580875781948, + "grad_norm": 0.11127405613660812, + "learning_rate": 4.3722644150076275e-05, + "loss": 0.0479, + "num_input_tokens_seen": 23954984, + "step": 41295 + }, + { + "epoch": 6.151325588322908, + "grad_norm": 0.07197767496109009, + "learning_rate": 4.3720490676670886e-05, + "loss": 0.2945, + "num_input_tokens_seen": 23957736, + "step": 41300 + }, + { + "epoch": 6.152070300863866, + "grad_norm": 12.945802688598633, + "learning_rate": 4.371833688700182e-05, + "loss": 0.1305, + "num_input_tokens_seen": 23960488, + "step": 41305 + }, + { + "epoch": 6.152815013404826, + "grad_norm": 0.28479018807411194, + "learning_rate": 4.3716182781105484e-05, + "loss": 0.0883, + "num_input_tokens_seen": 23963112, + "step": 41310 + }, + { + "epoch": 6.153559725945785, + "grad_norm": 33.671424865722656, + "learning_rate": 4.3714028359018274e-05, + "loss": 0.2356, + "num_input_tokens_seen": 23965768, + "step": 41315 + }, + { + "epoch": 6.1543044384867445, + "grad_norm": 1.3813642263412476, + "learning_rate": 4.3711873620776566e-05, + "loss": 0.5776, + "num_input_tokens_seen": 23968840, + "step": 41320 + }, + { + "epoch": 6.155049151027703, + "grad_norm": 27.584182739257812, + "learning_rate": 4.370971856641677e-05, + "loss": 0.1062, + "num_input_tokens_seen": 23971624, + "step": 41325 + }, + { + "epoch": 6.155793863568663, + "grad_norm": 14.060227394104004, + "learning_rate": 4.3707563195975296e-05, + "loss": 0.1422, + "num_input_tokens_seen": 23974472, + "step": 41330 + }, + { + "epoch": 6.156538576109622, + "grad_norm": 0.04216865450143814, + "learning_rate": 4.370540750948855e-05, + "loss": 0.0751, + "num_input_tokens_seen": 23977320, + "step": 41335 + }, + { + "epoch": 6.157283288650581, + "grad_norm": 7.528720855712891, + "learning_rate": 4.370325150699296e-05, + "loss": 0.1448, + "num_input_tokens_seen": 23980328, + "step": 41340 + }, + { + "epoch": 6.15802800119154, + "grad_norm": 26.072425842285156, + "learning_rate": 4.3701095188524943e-05, + "loss": 0.3292, + "num_input_tokens_seen": 23983240, + "step": 41345 + }, + { + "epoch": 6.1587727137325, + "grad_norm": 51.93379211425781, + "learning_rate": 4.369893855412093e-05, + "loss": 0.1787, + "num_input_tokens_seen": 23986216, + "step": 41350 + }, + { + "epoch": 6.159517426273458, + "grad_norm": 34.506683349609375, + "learning_rate": 4.369678160381736e-05, + "loss": 0.1861, + "num_input_tokens_seen": 23989544, + "step": 41355 + }, + { + "epoch": 6.160262138814417, + "grad_norm": 4.647797584533691, + "learning_rate": 4.3694624337650656e-05, + "loss": 0.0317, + "num_input_tokens_seen": 23992392, + "step": 41360 + }, + { + "epoch": 6.161006851355377, + "grad_norm": 1.375787615776062, + "learning_rate": 4.369246675565729e-05, + "loss": 0.1025, + "num_input_tokens_seen": 23995400, + "step": 41365 + }, + { + "epoch": 6.161751563896336, + "grad_norm": 0.1819893717765808, + "learning_rate": 4.369030885787369e-05, + "loss": 0.4738, + "num_input_tokens_seen": 23998184, + "step": 41370 + }, + { + "epoch": 6.162496276437295, + "grad_norm": 2.470825433731079, + "learning_rate": 4.368815064433631e-05, + "loss": 0.1569, + "num_input_tokens_seen": 24001096, + "step": 41375 + }, + { + "epoch": 6.163240988978254, + "grad_norm": 52.31004333496094, + "learning_rate": 4.368599211508162e-05, + "loss": 0.3049, + "num_input_tokens_seen": 24004168, + "step": 41380 + }, + { + "epoch": 6.163985701519214, + "grad_norm": 2.7827706336975098, + "learning_rate": 4.3683833270146095e-05, + "loss": 0.1228, + "num_input_tokens_seen": 24007144, + "step": 41385 + }, + { + "epoch": 6.164730414060172, + "grad_norm": 7.622616767883301, + "learning_rate": 4.368167410956619e-05, + "loss": 0.2959, + "num_input_tokens_seen": 24010216, + "step": 41390 + }, + { + "epoch": 6.165475126601132, + "grad_norm": 0.18416348099708557, + "learning_rate": 4.367951463337839e-05, + "loss": 0.4147, + "num_input_tokens_seen": 24013032, + "step": 41395 + }, + { + "epoch": 6.166219839142091, + "grad_norm": 60.615882873535156, + "learning_rate": 4.367735484161918e-05, + "loss": 0.374, + "num_input_tokens_seen": 24015816, + "step": 41400 + }, + { + "epoch": 6.1669645516830505, + "grad_norm": 0.025535468012094498, + "learning_rate": 4.367519473432503e-05, + "loss": 0.2139, + "num_input_tokens_seen": 24018568, + "step": 41405 + }, + { + "epoch": 6.167709264224009, + "grad_norm": 4.568099498748779, + "learning_rate": 4.367303431153245e-05, + "loss": 0.1169, + "num_input_tokens_seen": 24021480, + "step": 41410 + }, + { + "epoch": 6.168453976764969, + "grad_norm": 3.6769275665283203, + "learning_rate": 4.367087357327794e-05, + "loss": 0.0663, + "num_input_tokens_seen": 24023944, + "step": 41415 + }, + { + "epoch": 6.169198689305928, + "grad_norm": 0.011036762036383152, + "learning_rate": 4.366871251959799e-05, + "loss": 0.0662, + "num_input_tokens_seen": 24026920, + "step": 41420 + }, + { + "epoch": 6.169943401846887, + "grad_norm": 55.39677810668945, + "learning_rate": 4.3666551150529124e-05, + "loss": 0.5707, + "num_input_tokens_seen": 24029992, + "step": 41425 + }, + { + "epoch": 6.170688114387846, + "grad_norm": 110.55061340332031, + "learning_rate": 4.366438946610784e-05, + "loss": 0.2234, + "num_input_tokens_seen": 24032840, + "step": 41430 + }, + { + "epoch": 6.171432826928806, + "grad_norm": 0.014043092727661133, + "learning_rate": 4.366222746637067e-05, + "loss": 0.2311, + "num_input_tokens_seen": 24035880, + "step": 41435 + }, + { + "epoch": 6.172177539469764, + "grad_norm": 0.6955795884132385, + "learning_rate": 4.366006515135413e-05, + "loss": 0.2731, + "num_input_tokens_seen": 24039464, + "step": 41440 + }, + { + "epoch": 6.172922252010724, + "grad_norm": 2.296672821044922, + "learning_rate": 4.3657902521094764e-05, + "loss": 0.3416, + "num_input_tokens_seen": 24042536, + "step": 41445 + }, + { + "epoch": 6.173666964551683, + "grad_norm": 1.4026445150375366, + "learning_rate": 4.365573957562909e-05, + "loss": 0.1851, + "num_input_tokens_seen": 24045512, + "step": 41450 + }, + { + "epoch": 6.1744116770926425, + "grad_norm": 0.012314834631979465, + "learning_rate": 4.365357631499366e-05, + "loss": 0.0228, + "num_input_tokens_seen": 24048392, + "step": 41455 + }, + { + "epoch": 6.175156389633601, + "grad_norm": 9.361709594726562, + "learning_rate": 4.365141273922502e-05, + "loss": 0.3244, + "num_input_tokens_seen": 24051272, + "step": 41460 + }, + { + "epoch": 6.175901102174561, + "grad_norm": 0.14169687032699585, + "learning_rate": 4.3649248848359706e-05, + "loss": 0.2216, + "num_input_tokens_seen": 24053928, + "step": 41465 + }, + { + "epoch": 6.17664581471552, + "grad_norm": 0.1254865676164627, + "learning_rate": 4.36470846424343e-05, + "loss": 0.3043, + "num_input_tokens_seen": 24056648, + "step": 41470 + }, + { + "epoch": 6.177390527256479, + "grad_norm": 27.613615036010742, + "learning_rate": 4.364492012148534e-05, + "loss": 0.3571, + "num_input_tokens_seen": 24059464, + "step": 41475 + }, + { + "epoch": 6.178135239797438, + "grad_norm": 25.938735961914062, + "learning_rate": 4.364275528554941e-05, + "loss": 0.191, + "num_input_tokens_seen": 24062376, + "step": 41480 + }, + { + "epoch": 6.178879952338398, + "grad_norm": 6.503528118133545, + "learning_rate": 4.3640590134663076e-05, + "loss": 0.0663, + "num_input_tokens_seen": 24065256, + "step": 41485 + }, + { + "epoch": 6.1796246648793565, + "grad_norm": 0.29364868998527527, + "learning_rate": 4.363842466886292e-05, + "loss": 0.1707, + "num_input_tokens_seen": 24067976, + "step": 41490 + }, + { + "epoch": 6.180369377420316, + "grad_norm": 2.0816690921783447, + "learning_rate": 4.363625888818552e-05, + "loss": 0.2183, + "num_input_tokens_seen": 24071144, + "step": 41495 + }, + { + "epoch": 6.181114089961275, + "grad_norm": 0.20641210675239563, + "learning_rate": 4.363409279266747e-05, + "loss": 0.1935, + "num_input_tokens_seen": 24074248, + "step": 41500 + }, + { + "epoch": 6.1818588025022345, + "grad_norm": 32.7744140625, + "learning_rate": 4.3631926382345356e-05, + "loss": 0.3913, + "num_input_tokens_seen": 24077160, + "step": 41505 + }, + { + "epoch": 6.182603515043193, + "grad_norm": 1.8143248558044434, + "learning_rate": 4.3629759657255786e-05, + "loss": 0.0429, + "num_input_tokens_seen": 24079880, + "step": 41510 + }, + { + "epoch": 6.183348227584153, + "grad_norm": 25.448678970336914, + "learning_rate": 4.3627592617435363e-05, + "loss": 0.229, + "num_input_tokens_seen": 24082856, + "step": 41515 + }, + { + "epoch": 6.184092940125112, + "grad_norm": 25.550111770629883, + "learning_rate": 4.362542526292069e-05, + "loss": 0.2792, + "num_input_tokens_seen": 24085704, + "step": 41520 + }, + { + "epoch": 6.18483765266607, + "grad_norm": 73.40092468261719, + "learning_rate": 4.362325759374839e-05, + "loss": 0.5383, + "num_input_tokens_seen": 24088616, + "step": 41525 + }, + { + "epoch": 6.18558236520703, + "grad_norm": 18.410173416137695, + "learning_rate": 4.3621089609955084e-05, + "loss": 0.38, + "num_input_tokens_seen": 24091592, + "step": 41530 + }, + { + "epoch": 6.18632707774799, + "grad_norm": 0.05818317085504532, + "learning_rate": 4.3618921311577384e-05, + "loss": 0.1293, + "num_input_tokens_seen": 24094504, + "step": 41535 + }, + { + "epoch": 6.1870717902889485, + "grad_norm": 34.711063385009766, + "learning_rate": 4.361675269865194e-05, + "loss": 0.1769, + "num_input_tokens_seen": 24097448, + "step": 41540 + }, + { + "epoch": 6.187816502829907, + "grad_norm": 1.8357490301132202, + "learning_rate": 4.361458377121538e-05, + "loss": 0.0531, + "num_input_tokens_seen": 24100296, + "step": 41545 + }, + { + "epoch": 6.188561215370867, + "grad_norm": 11.828250885009766, + "learning_rate": 4.3612414529304344e-05, + "loss": 0.541, + "num_input_tokens_seen": 24103240, + "step": 41550 + }, + { + "epoch": 6.189305927911826, + "grad_norm": 16.310863494873047, + "learning_rate": 4.3610244972955486e-05, + "loss": 0.1951, + "num_input_tokens_seen": 24106248, + "step": 41555 + }, + { + "epoch": 6.190050640452785, + "grad_norm": 9.59363079071045, + "learning_rate": 4.3608075102205454e-05, + "loss": 0.1558, + "num_input_tokens_seen": 24109064, + "step": 41560 + }, + { + "epoch": 6.190795352993744, + "grad_norm": 0.0840824767947197, + "learning_rate": 4.36059049170909e-05, + "loss": 0.0919, + "num_input_tokens_seen": 24111560, + "step": 41565 + }, + { + "epoch": 6.191540065534704, + "grad_norm": 0.7261965870857239, + "learning_rate": 4.36037344176485e-05, + "loss": 0.007, + "num_input_tokens_seen": 24114312, + "step": 41570 + }, + { + "epoch": 6.1922847780756625, + "grad_norm": 0.15134769678115845, + "learning_rate": 4.3601563603914906e-05, + "loss": 0.0652, + "num_input_tokens_seen": 24117096, + "step": 41575 + }, + { + "epoch": 6.193029490616622, + "grad_norm": 0.10773324221372604, + "learning_rate": 4.3599392475926806e-05, + "loss": 0.0994, + "num_input_tokens_seen": 24120104, + "step": 41580 + }, + { + "epoch": 6.193774203157581, + "grad_norm": 0.017377914860844612, + "learning_rate": 4.359722103372087e-05, + "loss": 0.319, + "num_input_tokens_seen": 24122920, + "step": 41585 + }, + { + "epoch": 6.1945189156985405, + "grad_norm": 14.018250465393066, + "learning_rate": 4.3595049277333785e-05, + "loss": 0.166, + "num_input_tokens_seen": 24125800, + "step": 41590 + }, + { + "epoch": 6.195263628239499, + "grad_norm": 0.11270676553249359, + "learning_rate": 4.359287720680225e-05, + "loss": 0.1048, + "num_input_tokens_seen": 24128488, + "step": 41595 + }, + { + "epoch": 6.196008340780459, + "grad_norm": 0.05729251727461815, + "learning_rate": 4.359070482216295e-05, + "loss": 0.3459, + "num_input_tokens_seen": 24131528, + "step": 41600 + }, + { + "epoch": 6.196753053321418, + "grad_norm": 13.314724922180176, + "learning_rate": 4.358853212345258e-05, + "loss": 0.3428, + "num_input_tokens_seen": 24134344, + "step": 41605 + }, + { + "epoch": 6.197497765862377, + "grad_norm": 25.524673461914062, + "learning_rate": 4.358635911070785e-05, + "loss": 0.2023, + "num_input_tokens_seen": 24137256, + "step": 41610 + }, + { + "epoch": 6.198242478403336, + "grad_norm": 4.700993061065674, + "learning_rate": 4.3584185783965484e-05, + "loss": 0.1181, + "num_input_tokens_seen": 24140328, + "step": 41615 + }, + { + "epoch": 6.198987190944296, + "grad_norm": 35.96528244018555, + "learning_rate": 4.358201214326218e-05, + "loss": 0.1332, + "num_input_tokens_seen": 24143080, + "step": 41620 + }, + { + "epoch": 6.1997319034852545, + "grad_norm": 29.866512298583984, + "learning_rate": 4.357983818863467e-05, + "loss": 0.0368, + "num_input_tokens_seen": 24145992, + "step": 41625 + }, + { + "epoch": 6.200476616026214, + "grad_norm": 64.03241729736328, + "learning_rate": 4.357766392011968e-05, + "loss": 0.3132, + "num_input_tokens_seen": 24149096, + "step": 41630 + }, + { + "epoch": 6.201221328567173, + "grad_norm": 28.123985290527344, + "learning_rate": 4.357548933775393e-05, + "loss": 0.0787, + "num_input_tokens_seen": 24152008, + "step": 41635 + }, + { + "epoch": 6.2019660411081325, + "grad_norm": 12.547198295593262, + "learning_rate": 4.3573314441574176e-05, + "loss": 0.0486, + "num_input_tokens_seen": 24154920, + "step": 41640 + }, + { + "epoch": 6.202710753649091, + "grad_norm": 4.040307521820068, + "learning_rate": 4.357113923161715e-05, + "loss": 0.3666, + "num_input_tokens_seen": 24157832, + "step": 41645 + }, + { + "epoch": 6.203455466190051, + "grad_norm": 6.715031623840332, + "learning_rate": 4.35689637079196e-05, + "loss": 0.6768, + "num_input_tokens_seen": 24160776, + "step": 41650 + }, + { + "epoch": 6.20420017873101, + "grad_norm": 17.01056480407715, + "learning_rate": 4.356678787051828e-05, + "loss": 0.0679, + "num_input_tokens_seen": 24163976, + "step": 41655 + }, + { + "epoch": 6.204944891271969, + "grad_norm": 11.080587387084961, + "learning_rate": 4.356461171944994e-05, + "loss": 0.0272, + "num_input_tokens_seen": 24167080, + "step": 41660 + }, + { + "epoch": 6.205689603812928, + "grad_norm": 0.4241239130496979, + "learning_rate": 4.356243525475137e-05, + "loss": 0.0142, + "num_input_tokens_seen": 24169704, + "step": 41665 + }, + { + "epoch": 6.206434316353888, + "grad_norm": 0.048057496547698975, + "learning_rate": 4.3560258476459315e-05, + "loss": 0.2484, + "num_input_tokens_seen": 24172712, + "step": 41670 + }, + { + "epoch": 6.2071790288948465, + "grad_norm": 0.4505770206451416, + "learning_rate": 4.355808138461056e-05, + "loss": 0.1931, + "num_input_tokens_seen": 24175464, + "step": 41675 + }, + { + "epoch": 6.207923741435806, + "grad_norm": 0.7701317667961121, + "learning_rate": 4.355590397924188e-05, + "loss": 0.2592, + "num_input_tokens_seen": 24178408, + "step": 41680 + }, + { + "epoch": 6.208668453976765, + "grad_norm": 0.48814332485198975, + "learning_rate": 4.355372626039006e-05, + "loss": 0.3243, + "num_input_tokens_seen": 24181160, + "step": 41685 + }, + { + "epoch": 6.209413166517725, + "grad_norm": 23.201719284057617, + "learning_rate": 4.355154822809189e-05, + "loss": 0.2828, + "num_input_tokens_seen": 24184200, + "step": 41690 + }, + { + "epoch": 6.210157879058683, + "grad_norm": 0.08493795990943909, + "learning_rate": 4.3549369882384174e-05, + "loss": 0.1797, + "num_input_tokens_seen": 24186952, + "step": 41695 + }, + { + "epoch": 6.210902591599643, + "grad_norm": 34.118934631347656, + "learning_rate": 4.35471912233037e-05, + "loss": 0.5984, + "num_input_tokens_seen": 24190024, + "step": 41700 + }, + { + "epoch": 6.211647304140602, + "grad_norm": 0.014832554385066032, + "learning_rate": 4.3545012250887286e-05, + "loss": 0.3765, + "num_input_tokens_seen": 24193480, + "step": 41705 + }, + { + "epoch": 6.2123920166815605, + "grad_norm": 0.17668451368808746, + "learning_rate": 4.354283296517173e-05, + "loss": 0.2533, + "num_input_tokens_seen": 24196200, + "step": 41710 + }, + { + "epoch": 6.21313672922252, + "grad_norm": 0.4358573257923126, + "learning_rate": 4.354065336619387e-05, + "loss": 0.0985, + "num_input_tokens_seen": 24199432, + "step": 41715 + }, + { + "epoch": 6.213881441763479, + "grad_norm": 7.427865028381348, + "learning_rate": 4.3538473453990506e-05, + "loss": 0.2079, + "num_input_tokens_seen": 24202056, + "step": 41720 + }, + { + "epoch": 6.2146261543044385, + "grad_norm": 0.332182377576828, + "learning_rate": 4.353629322859848e-05, + "loss": 0.1251, + "num_input_tokens_seen": 24205128, + "step": 41725 + }, + { + "epoch": 6.215370866845397, + "grad_norm": 0.09059522300958633, + "learning_rate": 4.353411269005462e-05, + "loss": 0.3819, + "num_input_tokens_seen": 24208072, + "step": 41730 + }, + { + "epoch": 6.216115579386357, + "grad_norm": 23.163835525512695, + "learning_rate": 4.353193183839576e-05, + "loss": 0.0269, + "num_input_tokens_seen": 24211016, + "step": 41735 + }, + { + "epoch": 6.216860291927316, + "grad_norm": 0.019734058529138565, + "learning_rate": 4.352975067365874e-05, + "loss": 0.0624, + "num_input_tokens_seen": 24213800, + "step": 41740 + }, + { + "epoch": 6.217605004468275, + "grad_norm": 0.06348968297243118, + "learning_rate": 4.352756919588042e-05, + "loss": 0.0308, + "num_input_tokens_seen": 24216808, + "step": 41745 + }, + { + "epoch": 6.218349717009234, + "grad_norm": 10.392279624938965, + "learning_rate": 4.3525387405097654e-05, + "loss": 0.1991, + "num_input_tokens_seen": 24219624, + "step": 41750 + }, + { + "epoch": 6.219094429550194, + "grad_norm": 0.07342587411403656, + "learning_rate": 4.352320530134729e-05, + "loss": 0.0242, + "num_input_tokens_seen": 24222632, + "step": 41755 + }, + { + "epoch": 6.2198391420911525, + "grad_norm": 15.810691833496094, + "learning_rate": 4.35210228846662e-05, + "loss": 0.0329, + "num_input_tokens_seen": 24225192, + "step": 41760 + }, + { + "epoch": 6.220583854632112, + "grad_norm": 84.31188201904297, + "learning_rate": 4.3518840155091255e-05, + "loss": 0.2075, + "num_input_tokens_seen": 24228008, + "step": 41765 + }, + { + "epoch": 6.221328567173071, + "grad_norm": 49.74447250366211, + "learning_rate": 4.351665711265933e-05, + "loss": 0.1816, + "num_input_tokens_seen": 24231176, + "step": 41770 + }, + { + "epoch": 6.222073279714031, + "grad_norm": 0.08205479383468628, + "learning_rate": 4.351447375740729e-05, + "loss": 0.142, + "num_input_tokens_seen": 24233832, + "step": 41775 + }, + { + "epoch": 6.222817992254989, + "grad_norm": 18.270906448364258, + "learning_rate": 4.351229008937205e-05, + "loss": 0.0769, + "num_input_tokens_seen": 24236456, + "step": 41780 + }, + { + "epoch": 6.223562704795949, + "grad_norm": 0.28426218032836914, + "learning_rate": 4.3510106108590476e-05, + "loss": 0.0023, + "num_input_tokens_seen": 24239208, + "step": 41785 + }, + { + "epoch": 6.224307417336908, + "grad_norm": 0.43928372859954834, + "learning_rate": 4.350792181509947e-05, + "loss": 0.1478, + "num_input_tokens_seen": 24241960, + "step": 41790 + }, + { + "epoch": 6.225052129877867, + "grad_norm": 0.1070980578660965, + "learning_rate": 4.350573720893594e-05, + "loss": 0.0728, + "num_input_tokens_seen": 24244840, + "step": 41795 + }, + { + "epoch": 6.225796842418826, + "grad_norm": 0.0716143324971199, + "learning_rate": 4.350355229013679e-05, + "loss": 0.1507, + "num_input_tokens_seen": 24247848, + "step": 41800 + }, + { + "epoch": 6.226541554959786, + "grad_norm": 22.219219207763672, + "learning_rate": 4.3501367058738916e-05, + "loss": 0.3768, + "num_input_tokens_seen": 24250856, + "step": 41805 + }, + { + "epoch": 6.2272862675007445, + "grad_norm": 8.743675231933594, + "learning_rate": 4.3499181514779266e-05, + "loss": 0.1745, + "num_input_tokens_seen": 24253928, + "step": 41810 + }, + { + "epoch": 6.228030980041704, + "grad_norm": 22.103534698486328, + "learning_rate": 4.3496995658294735e-05, + "loss": 0.0739, + "num_input_tokens_seen": 24256744, + "step": 41815 + }, + { + "epoch": 6.228775692582663, + "grad_norm": 4.046358108520508, + "learning_rate": 4.349480948932226e-05, + "loss": 0.2121, + "num_input_tokens_seen": 24259976, + "step": 41820 + }, + { + "epoch": 6.229520405123623, + "grad_norm": 51.4623908996582, + "learning_rate": 4.3492623007898786e-05, + "loss": 0.2304, + "num_input_tokens_seen": 24262728, + "step": 41825 + }, + { + "epoch": 6.230265117664581, + "grad_norm": 0.032989490777254105, + "learning_rate": 4.3490436214061236e-05, + "loss": 0.2867, + "num_input_tokens_seen": 24265640, + "step": 41830 + }, + { + "epoch": 6.231009830205541, + "grad_norm": 0.06590045243501663, + "learning_rate": 4.348824910784656e-05, + "loss": 0.1636, + "num_input_tokens_seen": 24268456, + "step": 41835 + }, + { + "epoch": 6.2317545427465, + "grad_norm": 0.24142061173915863, + "learning_rate": 4.348606168929171e-05, + "loss": 0.0758, + "num_input_tokens_seen": 24271464, + "step": 41840 + }, + { + "epoch": 6.232499255287459, + "grad_norm": 7.132147312164307, + "learning_rate": 4.348387395843363e-05, + "loss": 0.01, + "num_input_tokens_seen": 24274280, + "step": 41845 + }, + { + "epoch": 6.233243967828418, + "grad_norm": 57.48949432373047, + "learning_rate": 4.348168591530929e-05, + "loss": 0.1135, + "num_input_tokens_seen": 24277384, + "step": 41850 + }, + { + "epoch": 6.233988680369378, + "grad_norm": 0.16017763316631317, + "learning_rate": 4.3479497559955654e-05, + "loss": 0.2406, + "num_input_tokens_seen": 24280392, + "step": 41855 + }, + { + "epoch": 6.234733392910337, + "grad_norm": 0.05346764624118805, + "learning_rate": 4.347730889240968e-05, + "loss": 0.0209, + "num_input_tokens_seen": 24283144, + "step": 41860 + }, + { + "epoch": 6.235478105451296, + "grad_norm": 0.056199174374341965, + "learning_rate": 4.347511991270835e-05, + "loss": 0.0888, + "num_input_tokens_seen": 24285928, + "step": 41865 + }, + { + "epoch": 6.236222817992255, + "grad_norm": 0.017817817628383636, + "learning_rate": 4.347293062088865e-05, + "loss": 0.4119, + "num_input_tokens_seen": 24288712, + "step": 41870 + }, + { + "epoch": 6.236967530533214, + "grad_norm": 0.017468435689806938, + "learning_rate": 4.3470741016987574e-05, + "loss": 0.0183, + "num_input_tokens_seen": 24291368, + "step": 41875 + }, + { + "epoch": 6.237712243074173, + "grad_norm": 14.454157829284668, + "learning_rate": 4.3468551101042084e-05, + "loss": 0.3571, + "num_input_tokens_seen": 24294440, + "step": 41880 + }, + { + "epoch": 6.238456955615132, + "grad_norm": 60.451171875, + "learning_rate": 4.3466360873089204e-05, + "loss": 0.848, + "num_input_tokens_seen": 24297096, + "step": 41885 + }, + { + "epoch": 6.239201668156092, + "grad_norm": 0.027141939848661423, + "learning_rate": 4.346417033316592e-05, + "loss": 0.745, + "num_input_tokens_seen": 24300136, + "step": 41890 + }, + { + "epoch": 6.2399463806970505, + "grad_norm": 36.82902526855469, + "learning_rate": 4.346197948130925e-05, + "loss": 0.1544, + "num_input_tokens_seen": 24302856, + "step": 41895 + }, + { + "epoch": 6.24069109323801, + "grad_norm": 33.64652633666992, + "learning_rate": 4.34597883175562e-05, + "loss": 0.1916, + "num_input_tokens_seen": 24305832, + "step": 41900 + }, + { + "epoch": 6.241435805778969, + "grad_norm": 38.22701644897461, + "learning_rate": 4.3457596841943775e-05, + "loss": 0.2288, + "num_input_tokens_seen": 24308520, + "step": 41905 + }, + { + "epoch": 6.242180518319929, + "grad_norm": 87.46688842773438, + "learning_rate": 4.345540505450902e-05, + "loss": 0.1966, + "num_input_tokens_seen": 24311624, + "step": 41910 + }, + { + "epoch": 6.242925230860887, + "grad_norm": 0.036375340074300766, + "learning_rate": 4.345321295528896e-05, + "loss": 0.1574, + "num_input_tokens_seen": 24314472, + "step": 41915 + }, + { + "epoch": 6.243669943401847, + "grad_norm": 0.08285675942897797, + "learning_rate": 4.345102054432061e-05, + "loss": 0.2134, + "num_input_tokens_seen": 24317256, + "step": 41920 + }, + { + "epoch": 6.244414655942806, + "grad_norm": 42.071266174316406, + "learning_rate": 4.344882782164103e-05, + "loss": 0.3178, + "num_input_tokens_seen": 24320040, + "step": 41925 + }, + { + "epoch": 6.245159368483765, + "grad_norm": 35.3112907409668, + "learning_rate": 4.344663478728725e-05, + "loss": 0.3226, + "num_input_tokens_seen": 24323144, + "step": 41930 + }, + { + "epoch": 6.245904081024724, + "grad_norm": 1.3917832374572754, + "learning_rate": 4.3444441441296324e-05, + "loss": 0.4311, + "num_input_tokens_seen": 24325768, + "step": 41935 + }, + { + "epoch": 6.246648793565684, + "grad_norm": 2.2099812030792236, + "learning_rate": 4.344224778370531e-05, + "loss": 0.1303, + "num_input_tokens_seen": 24328808, + "step": 41940 + }, + { + "epoch": 6.247393506106643, + "grad_norm": 58.66209411621094, + "learning_rate": 4.344005381455126e-05, + "loss": 0.2118, + "num_input_tokens_seen": 24331720, + "step": 41945 + }, + { + "epoch": 6.248138218647602, + "grad_norm": 26.743816375732422, + "learning_rate": 4.343785953387125e-05, + "loss": 0.2893, + "num_input_tokens_seen": 24334664, + "step": 41950 + }, + { + "epoch": 6.248882931188561, + "grad_norm": 69.30728912353516, + "learning_rate": 4.343566494170233e-05, + "loss": 0.2318, + "num_input_tokens_seen": 24337576, + "step": 41955 + }, + { + "epoch": 6.249627643729521, + "grad_norm": 20.705364227294922, + "learning_rate": 4.34334700380816e-05, + "loss": 0.2265, + "num_input_tokens_seen": 24340360, + "step": 41960 + }, + { + "epoch": 6.250372356270479, + "grad_norm": 0.33284586668014526, + "learning_rate": 4.343127482304612e-05, + "loss": 0.0261, + "num_input_tokens_seen": 24343112, + "step": 41965 + }, + { + "epoch": 6.251117068811439, + "grad_norm": 0.01101006381213665, + "learning_rate": 4.342907929663299e-05, + "loss": 0.1236, + "num_input_tokens_seen": 24345832, + "step": 41970 + }, + { + "epoch": 6.251861781352398, + "grad_norm": 16.166553497314453, + "learning_rate": 4.342688345887929e-05, + "loss": 0.4199, + "num_input_tokens_seen": 24348904, + "step": 41975 + }, + { + "epoch": 6.252606493893357, + "grad_norm": 1.718324899673462, + "learning_rate": 4.342468730982212e-05, + "loss": 0.3126, + "num_input_tokens_seen": 24351752, + "step": 41980 + }, + { + "epoch": 6.253351206434316, + "grad_norm": 44.89921188354492, + "learning_rate": 4.342249084949859e-05, + "loss": 0.2961, + "num_input_tokens_seen": 24354472, + "step": 41985 + }, + { + "epoch": 6.254095918975276, + "grad_norm": 1.2731547355651855, + "learning_rate": 4.34202940779458e-05, + "loss": 0.0518, + "num_input_tokens_seen": 24357224, + "step": 41990 + }, + { + "epoch": 6.254840631516235, + "grad_norm": 45.8010368347168, + "learning_rate": 4.341809699520086e-05, + "loss": 0.2785, + "num_input_tokens_seen": 24359912, + "step": 41995 + }, + { + "epoch": 6.255585344057194, + "grad_norm": 14.769761085510254, + "learning_rate": 4.34158996013009e-05, + "loss": 0.1328, + "num_input_tokens_seen": 24362728, + "step": 42000 + }, + { + "epoch": 6.256330056598153, + "grad_norm": 221.41946411132812, + "learning_rate": 4.3413701896283024e-05, + "loss": 0.678, + "num_input_tokens_seen": 24365736, + "step": 42005 + }, + { + "epoch": 6.257074769139113, + "grad_norm": 0.026309844106435776, + "learning_rate": 4.341150388018437e-05, + "loss": 0.0324, + "num_input_tokens_seen": 24368488, + "step": 42010 + }, + { + "epoch": 6.257819481680071, + "grad_norm": 20.291357040405273, + "learning_rate": 4.340930555304208e-05, + "loss": 0.2775, + "num_input_tokens_seen": 24371240, + "step": 42015 + }, + { + "epoch": 6.258564194221031, + "grad_norm": 0.051599472761154175, + "learning_rate": 4.340710691489327e-05, + "loss": 0.0601, + "num_input_tokens_seen": 24374216, + "step": 42020 + }, + { + "epoch": 6.25930890676199, + "grad_norm": 29.732723236083984, + "learning_rate": 4.340490796577511e-05, + "loss": 0.3739, + "num_input_tokens_seen": 24377128, + "step": 42025 + }, + { + "epoch": 6.2600536193029495, + "grad_norm": 55.87013244628906, + "learning_rate": 4.340270870572472e-05, + "loss": 0.2151, + "num_input_tokens_seen": 24380360, + "step": 42030 + }, + { + "epoch": 6.260798331843908, + "grad_norm": 19.88693618774414, + "learning_rate": 4.340050913477928e-05, + "loss": 0.4724, + "num_input_tokens_seen": 24383368, + "step": 42035 + }, + { + "epoch": 6.261543044384867, + "grad_norm": 0.03466501832008362, + "learning_rate": 4.339830925297594e-05, + "loss": 0.3255, + "num_input_tokens_seen": 24386472, + "step": 42040 + }, + { + "epoch": 6.262287756925827, + "grad_norm": 48.849674224853516, + "learning_rate": 4.3396109060351864e-05, + "loss": 0.1128, + "num_input_tokens_seen": 24389416, + "step": 42045 + }, + { + "epoch": 6.263032469466786, + "grad_norm": 0.01327168196439743, + "learning_rate": 4.339390855694422e-05, + "loss": 0.2569, + "num_input_tokens_seen": 24392392, + "step": 42050 + }, + { + "epoch": 6.263777182007745, + "grad_norm": 56.80584716796875, + "learning_rate": 4.339170774279019e-05, + "loss": 0.1681, + "num_input_tokens_seen": 24395112, + "step": 42055 + }, + { + "epoch": 6.264521894548704, + "grad_norm": 0.05794544890522957, + "learning_rate": 4.3389506617926945e-05, + "loss": 0.1426, + "num_input_tokens_seen": 24397896, + "step": 42060 + }, + { + "epoch": 6.265266607089663, + "grad_norm": 0.3781164586544037, + "learning_rate": 4.3387305182391677e-05, + "loss": 0.0966, + "num_input_tokens_seen": 24400936, + "step": 42065 + }, + { + "epoch": 6.266011319630622, + "grad_norm": 33.25581359863281, + "learning_rate": 4.3385103436221575e-05, + "loss": 0.5722, + "num_input_tokens_seen": 24403784, + "step": 42070 + }, + { + "epoch": 6.266756032171582, + "grad_norm": 42.83914566040039, + "learning_rate": 4.338290137945384e-05, + "loss": 0.3001, + "num_input_tokens_seen": 24406472, + "step": 42075 + }, + { + "epoch": 6.267500744712541, + "grad_norm": 0.17462658882141113, + "learning_rate": 4.338069901212567e-05, + "loss": 0.0953, + "num_input_tokens_seen": 24409128, + "step": 42080 + }, + { + "epoch": 6.2682454572535, + "grad_norm": 1.5553734302520752, + "learning_rate": 4.337849633427427e-05, + "loss": 0.1405, + "num_input_tokens_seen": 24411880, + "step": 42085 + }, + { + "epoch": 6.268990169794459, + "grad_norm": 3.3321945667266846, + "learning_rate": 4.337629334593685e-05, + "loss": 0.3524, + "num_input_tokens_seen": 24414824, + "step": 42090 + }, + { + "epoch": 6.269734882335419, + "grad_norm": 32.08964920043945, + "learning_rate": 4.337409004715063e-05, + "loss": 0.2074, + "num_input_tokens_seen": 24417576, + "step": 42095 + }, + { + "epoch": 6.270479594876377, + "grad_norm": 43.55760955810547, + "learning_rate": 4.337188643795284e-05, + "loss": 0.0296, + "num_input_tokens_seen": 24420296, + "step": 42100 + }, + { + "epoch": 6.271224307417337, + "grad_norm": 7.92293643951416, + "learning_rate": 4.33696825183807e-05, + "loss": 0.0327, + "num_input_tokens_seen": 24423112, + "step": 42105 + }, + { + "epoch": 6.271969019958296, + "grad_norm": 0.090919129550457, + "learning_rate": 4.3367478288471444e-05, + "loss": 0.565, + "num_input_tokens_seen": 24425800, + "step": 42110 + }, + { + "epoch": 6.2727137324992555, + "grad_norm": 2.620617389678955, + "learning_rate": 4.33652737482623e-05, + "loss": 0.2529, + "num_input_tokens_seen": 24428808, + "step": 42115 + }, + { + "epoch": 6.273458445040214, + "grad_norm": 22.469867706298828, + "learning_rate": 4.336306889779054e-05, + "loss": 0.2216, + "num_input_tokens_seen": 24432008, + "step": 42120 + }, + { + "epoch": 6.274203157581174, + "grad_norm": 4.096005916595459, + "learning_rate": 4.3360863737093375e-05, + "loss": 0.216, + "num_input_tokens_seen": 24434760, + "step": 42125 + }, + { + "epoch": 6.274947870122133, + "grad_norm": 0.04068036004900932, + "learning_rate": 4.335865826620809e-05, + "loss": 0.2757, + "num_input_tokens_seen": 24437672, + "step": 42130 + }, + { + "epoch": 6.275692582663092, + "grad_norm": 13.027242660522461, + "learning_rate": 4.335645248517193e-05, + "loss": 0.1299, + "num_input_tokens_seen": 24440712, + "step": 42135 + }, + { + "epoch": 6.276437295204051, + "grad_norm": 48.12158966064453, + "learning_rate": 4.335424639402216e-05, + "loss": 0.1923, + "num_input_tokens_seen": 24444008, + "step": 42140 + }, + { + "epoch": 6.277182007745011, + "grad_norm": 0.03975812718272209, + "learning_rate": 4.3352039992796056e-05, + "loss": 0.0118, + "num_input_tokens_seen": 24446696, + "step": 42145 + }, + { + "epoch": 6.277926720285969, + "grad_norm": 74.12731170654297, + "learning_rate": 4.334983328153088e-05, + "loss": 0.232, + "num_input_tokens_seen": 24449864, + "step": 42150 + }, + { + "epoch": 6.278671432826929, + "grad_norm": 0.19783730804920197, + "learning_rate": 4.334762626026393e-05, + "loss": 0.0735, + "num_input_tokens_seen": 24452520, + "step": 42155 + }, + { + "epoch": 6.279416145367888, + "grad_norm": 0.8764669895172119, + "learning_rate": 4.3345418929032475e-05, + "loss": 0.0203, + "num_input_tokens_seen": 24455528, + "step": 42160 + }, + { + "epoch": 6.2801608579088475, + "grad_norm": 20.73948097229004, + "learning_rate": 4.334321128787382e-05, + "loss": 0.48, + "num_input_tokens_seen": 24458536, + "step": 42165 + }, + { + "epoch": 6.280905570449806, + "grad_norm": 0.5413720607757568, + "learning_rate": 4.3341003336825246e-05, + "loss": 0.1423, + "num_input_tokens_seen": 24461512, + "step": 42170 + }, + { + "epoch": 6.281650282990766, + "grad_norm": 18.640104293823242, + "learning_rate": 4.333879507592407e-05, + "loss": 0.2605, + "num_input_tokens_seen": 24464232, + "step": 42175 + }, + { + "epoch": 6.282394995531725, + "grad_norm": 22.470273971557617, + "learning_rate": 4.3336586505207587e-05, + "loss": 0.3288, + "num_input_tokens_seen": 24467208, + "step": 42180 + }, + { + "epoch": 6.283139708072684, + "grad_norm": 65.38337707519531, + "learning_rate": 4.3334377624713104e-05, + "loss": 0.1938, + "num_input_tokens_seen": 24470376, + "step": 42185 + }, + { + "epoch": 6.283884420613643, + "grad_norm": 1.5071685314178467, + "learning_rate": 4.333216843447795e-05, + "loss": 0.094, + "num_input_tokens_seen": 24473256, + "step": 42190 + }, + { + "epoch": 6.284629133154603, + "grad_norm": 33.736454010009766, + "learning_rate": 4.332995893453945e-05, + "loss": 0.1104, + "num_input_tokens_seen": 24475944, + "step": 42195 + }, + { + "epoch": 6.2853738456955615, + "grad_norm": 10.802249908447266, + "learning_rate": 4.3327749124934916e-05, + "loss": 0.2363, + "num_input_tokens_seen": 24478696, + "step": 42200 + }, + { + "epoch": 6.286118558236521, + "grad_norm": 23.347640991210938, + "learning_rate": 4.332553900570169e-05, + "loss": 0.4526, + "num_input_tokens_seen": 24481576, + "step": 42205 + }, + { + "epoch": 6.28686327077748, + "grad_norm": 0.051174454391002655, + "learning_rate": 4.3323328576877104e-05, + "loss": 0.0031, + "num_input_tokens_seen": 24484680, + "step": 42210 + }, + { + "epoch": 6.2876079833184395, + "grad_norm": 20.560047149658203, + "learning_rate": 4.33211178384985e-05, + "loss": 0.4189, + "num_input_tokens_seen": 24488104, + "step": 42215 + }, + { + "epoch": 6.288352695859398, + "grad_norm": 36.8958625793457, + "learning_rate": 4.331890679060324e-05, + "loss": 0.2668, + "num_input_tokens_seen": 24491080, + "step": 42220 + }, + { + "epoch": 6.289097408400357, + "grad_norm": 31.558094024658203, + "learning_rate": 4.331669543322867e-05, + "loss": 0.3714, + "num_input_tokens_seen": 24493768, + "step": 42225 + }, + { + "epoch": 6.289842120941317, + "grad_norm": 32.16224670410156, + "learning_rate": 4.331448376641214e-05, + "loss": 0.1546, + "num_input_tokens_seen": 24496424, + "step": 42230 + }, + { + "epoch": 6.290586833482275, + "grad_norm": 15.3507661819458, + "learning_rate": 4.331227179019103e-05, + "loss": 0.1986, + "num_input_tokens_seen": 24499240, + "step": 42235 + }, + { + "epoch": 6.291331546023235, + "grad_norm": 2.6575684547424316, + "learning_rate": 4.3310059504602685e-05, + "loss": 0.0122, + "num_input_tokens_seen": 24502312, + "step": 42240 + }, + { + "epoch": 6.292076258564194, + "grad_norm": 55.09074020385742, + "learning_rate": 4.330784690968451e-05, + "loss": 0.0771, + "num_input_tokens_seen": 24505448, + "step": 42245 + }, + { + "epoch": 6.2928209711051535, + "grad_norm": 22.72606658935547, + "learning_rate": 4.330563400547386e-05, + "loss": 0.1911, + "num_input_tokens_seen": 24508360, + "step": 42250 + }, + { + "epoch": 6.293565683646112, + "grad_norm": 25.950639724731445, + "learning_rate": 4.330342079200813e-05, + "loss": 0.2656, + "num_input_tokens_seen": 24511304, + "step": 42255 + }, + { + "epoch": 6.294310396187072, + "grad_norm": 0.25309038162231445, + "learning_rate": 4.330120726932471e-05, + "loss": 0.1012, + "num_input_tokens_seen": 24514280, + "step": 42260 + }, + { + "epoch": 6.295055108728031, + "grad_norm": 17.867128372192383, + "learning_rate": 4.329899343746099e-05, + "loss": 0.3105, + "num_input_tokens_seen": 24517224, + "step": 42265 + }, + { + "epoch": 6.29579982126899, + "grad_norm": 5.181252479553223, + "learning_rate": 4.3296779296454374e-05, + "loss": 0.1209, + "num_input_tokens_seen": 24520104, + "step": 42270 + }, + { + "epoch": 6.296544533809949, + "grad_norm": 0.11614697426557541, + "learning_rate": 4.3294564846342275e-05, + "loss": 0.1967, + "num_input_tokens_seen": 24523048, + "step": 42275 + }, + { + "epoch": 6.297289246350909, + "grad_norm": 58.26220703125, + "learning_rate": 4.329235008716209e-05, + "loss": 0.7007, + "num_input_tokens_seen": 24525896, + "step": 42280 + }, + { + "epoch": 6.2980339588918675, + "grad_norm": 0.2518426179885864, + "learning_rate": 4.329013501895125e-05, + "loss": 0.1103, + "num_input_tokens_seen": 24528648, + "step": 42285 + }, + { + "epoch": 6.298778671432827, + "grad_norm": 0.01869930885732174, + "learning_rate": 4.3287919641747155e-05, + "loss": 0.0339, + "num_input_tokens_seen": 24531592, + "step": 42290 + }, + { + "epoch": 6.299523383973786, + "grad_norm": 1.4078041315078735, + "learning_rate": 4.328570395558725e-05, + "loss": 0.0103, + "num_input_tokens_seen": 24534184, + "step": 42295 + }, + { + "epoch": 6.3002680965147455, + "grad_norm": 33.47146224975586, + "learning_rate": 4.328348796050896e-05, + "loss": 0.1397, + "num_input_tokens_seen": 24537224, + "step": 42300 + }, + { + "epoch": 6.301012809055704, + "grad_norm": 0.009977137669920921, + "learning_rate": 4.3281271656549734e-05, + "loss": 0.2738, + "num_input_tokens_seen": 24540136, + "step": 42305 + }, + { + "epoch": 6.301757521596664, + "grad_norm": 13.018135070800781, + "learning_rate": 4.3279055043746996e-05, + "loss": 0.4312, + "num_input_tokens_seen": 24542952, + "step": 42310 + }, + { + "epoch": 6.302502234137623, + "grad_norm": 0.056818887591362, + "learning_rate": 4.3276838122138196e-05, + "loss": 0.1654, + "num_input_tokens_seen": 24545736, + "step": 42315 + }, + { + "epoch": 6.303246946678582, + "grad_norm": 52.75068664550781, + "learning_rate": 4.3274620891760795e-05, + "loss": 0.2922, + "num_input_tokens_seen": 24548584, + "step": 42320 + }, + { + "epoch": 6.303991659219541, + "grad_norm": 0.3093812167644501, + "learning_rate": 4.327240335265226e-05, + "loss": 0.0976, + "num_input_tokens_seen": 24551496, + "step": 42325 + }, + { + "epoch": 6.304736371760501, + "grad_norm": 0.23455452919006348, + "learning_rate": 4.3270185504850024e-05, + "loss": 0.2805, + "num_input_tokens_seen": 24554248, + "step": 42330 + }, + { + "epoch": 6.3054810843014595, + "grad_norm": 44.287757873535156, + "learning_rate": 4.326796734839158e-05, + "loss": 0.281, + "num_input_tokens_seen": 24557096, + "step": 42335 + }, + { + "epoch": 6.306225796842419, + "grad_norm": 51.47099304199219, + "learning_rate": 4.32657488833144e-05, + "loss": 0.1609, + "num_input_tokens_seen": 24559784, + "step": 42340 + }, + { + "epoch": 6.306970509383378, + "grad_norm": 15.06644344329834, + "learning_rate": 4.326353010965595e-05, + "loss": 0.2686, + "num_input_tokens_seen": 24562728, + "step": 42345 + }, + { + "epoch": 6.3077152219243375, + "grad_norm": 0.04650973156094551, + "learning_rate": 4.326131102745372e-05, + "loss": 0.0999, + "num_input_tokens_seen": 24565704, + "step": 42350 + }, + { + "epoch": 6.308459934465296, + "grad_norm": 17.9698429107666, + "learning_rate": 4.3259091636745196e-05, + "loss": 0.1955, + "num_input_tokens_seen": 24568584, + "step": 42355 + }, + { + "epoch": 6.309204647006256, + "grad_norm": 33.48280715942383, + "learning_rate": 4.325687193756789e-05, + "loss": 0.189, + "num_input_tokens_seen": 24571272, + "step": 42360 + }, + { + "epoch": 6.309949359547215, + "grad_norm": 0.0973435714840889, + "learning_rate": 4.325465192995928e-05, + "loss": 0.1324, + "num_input_tokens_seen": 24573896, + "step": 42365 + }, + { + "epoch": 6.310694072088174, + "grad_norm": 1.9816287755966187, + "learning_rate": 4.325243161395688e-05, + "loss": 0.4537, + "num_input_tokens_seen": 24576680, + "step": 42370 + }, + { + "epoch": 6.311438784629133, + "grad_norm": 8.960389137268066, + "learning_rate": 4.3250210989598196e-05, + "loss": 0.0867, + "num_input_tokens_seen": 24579464, + "step": 42375 + }, + { + "epoch": 6.312183497170093, + "grad_norm": 8.671761512756348, + "learning_rate": 4.324799005692075e-05, + "loss": 0.1324, + "num_input_tokens_seen": 24582184, + "step": 42380 + }, + { + "epoch": 6.3129282097110515, + "grad_norm": 73.17781066894531, + "learning_rate": 4.3245768815962055e-05, + "loss": 0.4875, + "num_input_tokens_seen": 24585416, + "step": 42385 + }, + { + "epoch": 6.31367292225201, + "grad_norm": 129.76649475097656, + "learning_rate": 4.3243547266759646e-05, + "loss": 0.2026, + "num_input_tokens_seen": 24588360, + "step": 42390 + }, + { + "epoch": 6.31441763479297, + "grad_norm": 0.5303490161895752, + "learning_rate": 4.3241325409351044e-05, + "loss": 0.0098, + "num_input_tokens_seen": 24591336, + "step": 42395 + }, + { + "epoch": 6.31516234733393, + "grad_norm": 0.032591383904218674, + "learning_rate": 4.323910324377379e-05, + "loss": 0.1542, + "num_input_tokens_seen": 24594152, + "step": 42400 + }, + { + "epoch": 6.315907059874888, + "grad_norm": 43.95285415649414, + "learning_rate": 4.3236880770065426e-05, + "loss": 0.1908, + "num_input_tokens_seen": 24597224, + "step": 42405 + }, + { + "epoch": 6.316651772415847, + "grad_norm": 0.49260759353637695, + "learning_rate": 4.323465798826349e-05, + "loss": 0.0018, + "num_input_tokens_seen": 24600072, + "step": 42410 + }, + { + "epoch": 6.317396484956807, + "grad_norm": 75.14295959472656, + "learning_rate": 4.323243489840554e-05, + "loss": 0.1792, + "num_input_tokens_seen": 24603080, + "step": 42415 + }, + { + "epoch": 6.3181411974977655, + "grad_norm": 26.56979751586914, + "learning_rate": 4.323021150052914e-05, + "loss": 0.1718, + "num_input_tokens_seen": 24605928, + "step": 42420 + }, + { + "epoch": 6.318885910038725, + "grad_norm": 12.05189323425293, + "learning_rate": 4.322798779467184e-05, + "loss": 0.1282, + "num_input_tokens_seen": 24608936, + "step": 42425 + }, + { + "epoch": 6.319630622579684, + "grad_norm": 0.03479640930891037, + "learning_rate": 4.322576378087121e-05, + "loss": 0.2249, + "num_input_tokens_seen": 24611752, + "step": 42430 + }, + { + "epoch": 6.3203753351206435, + "grad_norm": 3.8348522186279297, + "learning_rate": 4.322353945916483e-05, + "loss": 0.1806, + "num_input_tokens_seen": 24614280, + "step": 42435 + }, + { + "epoch": 6.321120047661602, + "grad_norm": 29.664878845214844, + "learning_rate": 4.322131482959027e-05, + "loss": 0.6422, + "num_input_tokens_seen": 24617544, + "step": 42440 + }, + { + "epoch": 6.321864760202562, + "grad_norm": 0.06585371494293213, + "learning_rate": 4.321908989218512e-05, + "loss": 0.2719, + "num_input_tokens_seen": 24620232, + "step": 42445 + }, + { + "epoch": 6.322609472743521, + "grad_norm": 0.06640651822090149, + "learning_rate": 4.321686464698696e-05, + "loss": 0.0089, + "num_input_tokens_seen": 24622984, + "step": 42450 + }, + { + "epoch": 6.32335418528448, + "grad_norm": 0.21163251996040344, + "learning_rate": 4.321463909403338e-05, + "loss": 0.0836, + "num_input_tokens_seen": 24625640, + "step": 42455 + }, + { + "epoch": 6.324098897825439, + "grad_norm": 36.22929000854492, + "learning_rate": 4.3212413233362e-05, + "loss": 0.1576, + "num_input_tokens_seen": 24628552, + "step": 42460 + }, + { + "epoch": 6.324843610366399, + "grad_norm": 2.1794304847717285, + "learning_rate": 4.32101870650104e-05, + "loss": 0.0359, + "num_input_tokens_seen": 24631368, + "step": 42465 + }, + { + "epoch": 6.3255883229073575, + "grad_norm": 0.10336357355117798, + "learning_rate": 4.3207960589016196e-05, + "loss": 0.1164, + "num_input_tokens_seen": 24634184, + "step": 42470 + }, + { + "epoch": 6.326333035448317, + "grad_norm": 0.03870416060090065, + "learning_rate": 4.3205733805417e-05, + "loss": 0.0671, + "num_input_tokens_seen": 24637096, + "step": 42475 + }, + { + "epoch": 6.327077747989276, + "grad_norm": 1.5125095844268799, + "learning_rate": 4.320350671425044e-05, + "loss": 0.0608, + "num_input_tokens_seen": 24640072, + "step": 42480 + }, + { + "epoch": 6.327822460530236, + "grad_norm": 19.624549865722656, + "learning_rate": 4.320127931555415e-05, + "loss": 0.0242, + "num_input_tokens_seen": 24643112, + "step": 42485 + }, + { + "epoch": 6.328567173071194, + "grad_norm": 0.06040140613913536, + "learning_rate": 4.319905160936572e-05, + "loss": 0.4977, + "num_input_tokens_seen": 24645864, + "step": 42490 + }, + { + "epoch": 6.329311885612154, + "grad_norm": 0.01983853243291378, + "learning_rate": 4.319682359572282e-05, + "loss": 0.2601, + "num_input_tokens_seen": 24648552, + "step": 42495 + }, + { + "epoch": 6.330056598153113, + "grad_norm": 0.3110817074775696, + "learning_rate": 4.319459527466308e-05, + "loss": 0.2304, + "num_input_tokens_seen": 24651400, + "step": 42500 + }, + { + "epoch": 6.330801310694072, + "grad_norm": 0.037393711507320404, + "learning_rate": 4.3192366646224146e-05, + "loss": 0.1605, + "num_input_tokens_seen": 24654376, + "step": 42505 + }, + { + "epoch": 6.331546023235031, + "grad_norm": 140.4270782470703, + "learning_rate": 4.3190137710443666e-05, + "loss": 0.4851, + "num_input_tokens_seen": 24657448, + "step": 42510 + }, + { + "epoch": 6.332290735775991, + "grad_norm": 14.889965057373047, + "learning_rate": 4.3187908467359294e-05, + "loss": 0.238, + "num_input_tokens_seen": 24660264, + "step": 42515 + }, + { + "epoch": 6.3330354483169495, + "grad_norm": 0.01896519772708416, + "learning_rate": 4.31856789170087e-05, + "loss": 0.0041, + "num_input_tokens_seen": 24662984, + "step": 42520 + }, + { + "epoch": 6.333780160857909, + "grad_norm": 0.11543238162994385, + "learning_rate": 4.318344905942954e-05, + "loss": 0.0036, + "num_input_tokens_seen": 24665928, + "step": 42525 + }, + { + "epoch": 6.334524873398868, + "grad_norm": 0.004682311322540045, + "learning_rate": 4.318121889465949e-05, + "loss": 0.3468, + "num_input_tokens_seen": 24669064, + "step": 42530 + }, + { + "epoch": 6.335269585939828, + "grad_norm": 74.58206939697266, + "learning_rate": 4.317898842273622e-05, + "loss": 0.1534, + "num_input_tokens_seen": 24671720, + "step": 42535 + }, + { + "epoch": 6.336014298480786, + "grad_norm": 0.6918045878410339, + "learning_rate": 4.317675764369743e-05, + "loss": 0.0101, + "num_input_tokens_seen": 24674536, + "step": 42540 + }, + { + "epoch": 6.336759011021746, + "grad_norm": 11.108869552612305, + "learning_rate": 4.3174526557580785e-05, + "loss": 0.1724, + "num_input_tokens_seen": 24677704, + "step": 42545 + }, + { + "epoch": 6.337503723562705, + "grad_norm": 0.2639837861061096, + "learning_rate": 4.317229516442398e-05, + "loss": 0.1076, + "num_input_tokens_seen": 24680808, + "step": 42550 + }, + { + "epoch": 6.338248436103664, + "grad_norm": 13.868010520935059, + "learning_rate": 4.317006346426473e-05, + "loss": 0.2513, + "num_input_tokens_seen": 24683528, + "step": 42555 + }, + { + "epoch": 6.338993148644623, + "grad_norm": 0.18015074729919434, + "learning_rate": 4.3167831457140715e-05, + "loss": 0.0792, + "num_input_tokens_seen": 24686792, + "step": 42560 + }, + { + "epoch": 6.339737861185583, + "grad_norm": 0.01774289831519127, + "learning_rate": 4.316559914308966e-05, + "loss": 0.3984, + "num_input_tokens_seen": 24689768, + "step": 42565 + }, + { + "epoch": 6.340482573726542, + "grad_norm": 61.402713775634766, + "learning_rate": 4.316336652214926e-05, + "loss": 0.2046, + "num_input_tokens_seen": 24692840, + "step": 42570 + }, + { + "epoch": 6.3412272862675, + "grad_norm": 8.805076599121094, + "learning_rate": 4.316113359435725e-05, + "loss": 0.4241, + "num_input_tokens_seen": 24695784, + "step": 42575 + }, + { + "epoch": 6.34197199880846, + "grad_norm": 41.03611755371094, + "learning_rate": 4.315890035975135e-05, + "loss": 0.4857, + "num_input_tokens_seen": 24698344, + "step": 42580 + }, + { + "epoch": 6.342716711349419, + "grad_norm": 29.569580078125, + "learning_rate": 4.315666681836928e-05, + "loss": 0.2761, + "num_input_tokens_seen": 24700904, + "step": 42585 + }, + { + "epoch": 6.343461423890378, + "grad_norm": 86.35845947265625, + "learning_rate": 4.315443297024878e-05, + "loss": 0.1357, + "num_input_tokens_seen": 24703400, + "step": 42590 + }, + { + "epoch": 6.344206136431337, + "grad_norm": 63.667686462402344, + "learning_rate": 4.315219881542758e-05, + "loss": 0.3581, + "num_input_tokens_seen": 24706408, + "step": 42595 + }, + { + "epoch": 6.344950848972297, + "grad_norm": 0.37295693159103394, + "learning_rate": 4.314996435394344e-05, + "loss": 0.1214, + "num_input_tokens_seen": 24709320, + "step": 42600 + }, + { + "epoch": 6.3456955615132555, + "grad_norm": 1.8973137140274048, + "learning_rate": 4.314772958583408e-05, + "loss": 0.0071, + "num_input_tokens_seen": 24712456, + "step": 42605 + }, + { + "epoch": 6.346440274054215, + "grad_norm": 20.03074073791504, + "learning_rate": 4.3145494511137294e-05, + "loss": 0.2627, + "num_input_tokens_seen": 24715592, + "step": 42610 + }, + { + "epoch": 6.347184986595174, + "grad_norm": 82.42427825927734, + "learning_rate": 4.3143259129890814e-05, + "loss": 0.3063, + "num_input_tokens_seen": 24718728, + "step": 42615 + }, + { + "epoch": 6.347929699136134, + "grad_norm": 0.31431567668914795, + "learning_rate": 4.314102344213241e-05, + "loss": 0.0054, + "num_input_tokens_seen": 24721640, + "step": 42620 + }, + { + "epoch": 6.348674411677092, + "grad_norm": 15.532307624816895, + "learning_rate": 4.3138787447899854e-05, + "loss": 0.2422, + "num_input_tokens_seen": 24724616, + "step": 42625 + }, + { + "epoch": 6.349419124218052, + "grad_norm": 0.12212593108415604, + "learning_rate": 4.313655114723092e-05, + "loss": 0.0011, + "num_input_tokens_seen": 24727336, + "step": 42630 + }, + { + "epoch": 6.350163836759011, + "grad_norm": 0.06668291240930557, + "learning_rate": 4.3134314540163376e-05, + "loss": 0.1461, + "num_input_tokens_seen": 24730440, + "step": 42635 + }, + { + "epoch": 6.35090854929997, + "grad_norm": 0.41457369923591614, + "learning_rate": 4.3132077626735036e-05, + "loss": 0.2785, + "num_input_tokens_seen": 24733352, + "step": 42640 + }, + { + "epoch": 6.351653261840929, + "grad_norm": 45.57217788696289, + "learning_rate": 4.312984040698366e-05, + "loss": 0.1123, + "num_input_tokens_seen": 24736296, + "step": 42645 + }, + { + "epoch": 6.352397974381889, + "grad_norm": 0.005381811410188675, + "learning_rate": 4.3127602880947065e-05, + "loss": 0.0018, + "num_input_tokens_seen": 24739496, + "step": 42650 + }, + { + "epoch": 6.353142686922848, + "grad_norm": 1.0915340185165405, + "learning_rate": 4.3125365048663035e-05, + "loss": 0.1376, + "num_input_tokens_seen": 24742312, + "step": 42655 + }, + { + "epoch": 6.353887399463807, + "grad_norm": 55.15922927856445, + "learning_rate": 4.31231269101694e-05, + "loss": 0.1222, + "num_input_tokens_seen": 24745000, + "step": 42660 + }, + { + "epoch": 6.354632112004766, + "grad_norm": 0.23700517416000366, + "learning_rate": 4.312088846550394e-05, + "loss": 0.229, + "num_input_tokens_seen": 24747912, + "step": 42665 + }, + { + "epoch": 6.355376824545726, + "grad_norm": 0.04593627154827118, + "learning_rate": 4.311864971470449e-05, + "loss": 0.7683, + "num_input_tokens_seen": 24750824, + "step": 42670 + }, + { + "epoch": 6.356121537086684, + "grad_norm": 0.05674158036708832, + "learning_rate": 4.311641065780887e-05, + "loss": 0.3291, + "num_input_tokens_seen": 24753704, + "step": 42675 + }, + { + "epoch": 6.356866249627644, + "grad_norm": 7.817122936248779, + "learning_rate": 4.31141712948549e-05, + "loss": 0.3282, + "num_input_tokens_seen": 24756296, + "step": 42680 + }, + { + "epoch": 6.357610962168603, + "grad_norm": 105.94880676269531, + "learning_rate": 4.311193162588043e-05, + "loss": 0.2186, + "num_input_tokens_seen": 24759176, + "step": 42685 + }, + { + "epoch": 6.358355674709562, + "grad_norm": 0.2894797921180725, + "learning_rate": 4.3109691650923265e-05, + "loss": 0.1731, + "num_input_tokens_seen": 24762120, + "step": 42690 + }, + { + "epoch": 6.359100387250521, + "grad_norm": 0.1915186494588852, + "learning_rate": 4.310745137002128e-05, + "loss": 0.258, + "num_input_tokens_seen": 24764936, + "step": 42695 + }, + { + "epoch": 6.359845099791481, + "grad_norm": 1.9002768993377686, + "learning_rate": 4.3105210783212304e-05, + "loss": 0.0399, + "num_input_tokens_seen": 24767592, + "step": 42700 + }, + { + "epoch": 6.36058981233244, + "grad_norm": 0.7581705451011658, + "learning_rate": 4.310296989053419e-05, + "loss": 0.348, + "num_input_tokens_seen": 24770472, + "step": 42705 + }, + { + "epoch": 6.361334524873399, + "grad_norm": 0.07602132111787796, + "learning_rate": 4.31007286920248e-05, + "loss": 0.0026, + "num_input_tokens_seen": 24773352, + "step": 42710 + }, + { + "epoch": 6.362079237414358, + "grad_norm": 0.04057875648140907, + "learning_rate": 4.3098487187721995e-05, + "loss": 0.4045, + "num_input_tokens_seen": 24776456, + "step": 42715 + }, + { + "epoch": 6.362823949955318, + "grad_norm": 3.7753684520721436, + "learning_rate": 4.3096245377663645e-05, + "loss": 0.1588, + "num_input_tokens_seen": 24779336, + "step": 42720 + }, + { + "epoch": 6.363568662496276, + "grad_norm": 30.792631149291992, + "learning_rate": 4.3094003261887625e-05, + "loss": 0.1399, + "num_input_tokens_seen": 24782440, + "step": 42725 + }, + { + "epoch": 6.364313375037236, + "grad_norm": 11.768232345581055, + "learning_rate": 4.30917608404318e-05, + "loss": 0.0611, + "num_input_tokens_seen": 24785448, + "step": 42730 + }, + { + "epoch": 6.365058087578195, + "grad_norm": 0.01622992753982544, + "learning_rate": 4.308951811333407e-05, + "loss": 0.0125, + "num_input_tokens_seen": 24788840, + "step": 42735 + }, + { + "epoch": 6.365802800119154, + "grad_norm": 0.006809365935623646, + "learning_rate": 4.3087275080632314e-05, + "loss": 0.0367, + "num_input_tokens_seen": 24791720, + "step": 42740 + }, + { + "epoch": 6.366547512660113, + "grad_norm": 29.98943328857422, + "learning_rate": 4.308503174236443e-05, + "loss": 0.2343, + "num_input_tokens_seen": 24794728, + "step": 42745 + }, + { + "epoch": 6.367292225201073, + "grad_norm": 0.02070963941514492, + "learning_rate": 4.308278809856832e-05, + "loss": 0.2347, + "num_input_tokens_seen": 24797352, + "step": 42750 + }, + { + "epoch": 6.368036937742032, + "grad_norm": 0.07322923839092255, + "learning_rate": 4.3080544149281875e-05, + "loss": 0.0973, + "num_input_tokens_seen": 24800264, + "step": 42755 + }, + { + "epoch": 6.36878165028299, + "grad_norm": 61.1244010925293, + "learning_rate": 4.307829989454302e-05, + "loss": 0.1869, + "num_input_tokens_seen": 24803272, + "step": 42760 + }, + { + "epoch": 6.36952636282395, + "grad_norm": 0.01367418747395277, + "learning_rate": 4.307605533438965e-05, + "loss": 0.3842, + "num_input_tokens_seen": 24806344, + "step": 42765 + }, + { + "epoch": 6.370271075364909, + "grad_norm": 0.7901753187179565, + "learning_rate": 4.307381046885971e-05, + "loss": 0.3566, + "num_input_tokens_seen": 24809256, + "step": 42770 + }, + { + "epoch": 6.371015787905868, + "grad_norm": 72.09607696533203, + "learning_rate": 4.307156529799111e-05, + "loss": 0.1983, + "num_input_tokens_seen": 24812040, + "step": 42775 + }, + { + "epoch": 6.371760500446827, + "grad_norm": 11.023384094238281, + "learning_rate": 4.306931982182178e-05, + "loss": 0.0555, + "num_input_tokens_seen": 24814920, + "step": 42780 + }, + { + "epoch": 6.372505212987787, + "grad_norm": 0.0520060732960701, + "learning_rate": 4.306707404038966e-05, + "loss": 0.1235, + "num_input_tokens_seen": 24817672, + "step": 42785 + }, + { + "epoch": 6.373249925528746, + "grad_norm": 0.05150270834565163, + "learning_rate": 4.306482795373268e-05, + "loss": 0.5767, + "num_input_tokens_seen": 24820616, + "step": 42790 + }, + { + "epoch": 6.373994638069705, + "grad_norm": 0.9232523441314697, + "learning_rate": 4.306258156188879e-05, + "loss": 0.0033, + "num_input_tokens_seen": 24823528, + "step": 42795 + }, + { + "epoch": 6.374739350610664, + "grad_norm": 0.912929892539978, + "learning_rate": 4.306033486489595e-05, + "loss": 0.1981, + "num_input_tokens_seen": 24826280, + "step": 42800 + }, + { + "epoch": 6.375484063151624, + "grad_norm": 9.872383117675781, + "learning_rate": 4.30580878627921e-05, + "loss": 0.3658, + "num_input_tokens_seen": 24829448, + "step": 42805 + }, + { + "epoch": 6.376228775692582, + "grad_norm": 0.015914445742964745, + "learning_rate": 4.305584055561522e-05, + "loss": 0.3497, + "num_input_tokens_seen": 24832232, + "step": 42810 + }, + { + "epoch": 6.376973488233542, + "grad_norm": 64.81697845458984, + "learning_rate": 4.3053592943403256e-05, + "loss": 0.3523, + "num_input_tokens_seen": 24835048, + "step": 42815 + }, + { + "epoch": 6.377718200774501, + "grad_norm": 0.014673737809062004, + "learning_rate": 4.305134502619419e-05, + "loss": 0.0312, + "num_input_tokens_seen": 24837800, + "step": 42820 + }, + { + "epoch": 6.3784629133154604, + "grad_norm": 6.0430216789245605, + "learning_rate": 4.3049096804026e-05, + "loss": 0.0364, + "num_input_tokens_seen": 24840872, + "step": 42825 + }, + { + "epoch": 6.379207625856419, + "grad_norm": 34.539222717285156, + "learning_rate": 4.304684827693666e-05, + "loss": 0.1716, + "num_input_tokens_seen": 24843656, + "step": 42830 + }, + { + "epoch": 6.379952338397379, + "grad_norm": 18.117860794067383, + "learning_rate": 4.304459944496416e-05, + "loss": 0.1369, + "num_input_tokens_seen": 24846504, + "step": 42835 + }, + { + "epoch": 6.380697050938338, + "grad_norm": 39.7913703918457, + "learning_rate": 4.3042350308146496e-05, + "loss": 0.3997, + "num_input_tokens_seen": 24849672, + "step": 42840 + }, + { + "epoch": 6.381441763479297, + "grad_norm": 14.385627746582031, + "learning_rate": 4.304010086652165e-05, + "loss": 0.3083, + "num_input_tokens_seen": 24852488, + "step": 42845 + }, + { + "epoch": 6.382186476020256, + "grad_norm": 2.3591861724853516, + "learning_rate": 4.3037851120127645e-05, + "loss": 0.0111, + "num_input_tokens_seen": 24855368, + "step": 42850 + }, + { + "epoch": 6.382931188561216, + "grad_norm": 1.8535727262496948, + "learning_rate": 4.3035601069002476e-05, + "loss": 0.1181, + "num_input_tokens_seen": 24858088, + "step": 42855 + }, + { + "epoch": 6.383675901102174, + "grad_norm": 14.314022064208984, + "learning_rate": 4.303335071318416e-05, + "loss": 0.129, + "num_input_tokens_seen": 24861320, + "step": 42860 + }, + { + "epoch": 6.384420613643134, + "grad_norm": 14.305952072143555, + "learning_rate": 4.303110005271071e-05, + "loss": 0.2975, + "num_input_tokens_seen": 24864328, + "step": 42865 + }, + { + "epoch": 6.385165326184093, + "grad_norm": 74.4726791381836, + "learning_rate": 4.302884908762015e-05, + "loss": 0.346, + "num_input_tokens_seen": 24867176, + "step": 42870 + }, + { + "epoch": 6.3859100387250525, + "grad_norm": 52.96616744995117, + "learning_rate": 4.302659781795051e-05, + "loss": 0.5203, + "num_input_tokens_seen": 24870152, + "step": 42875 + }, + { + "epoch": 6.386654751266011, + "grad_norm": 0.1880328208208084, + "learning_rate": 4.302434624373982e-05, + "loss": 0.4565, + "num_input_tokens_seen": 24872968, + "step": 42880 + }, + { + "epoch": 6.387399463806971, + "grad_norm": 64.96520233154297, + "learning_rate": 4.3022094365026124e-05, + "loss": 0.7247, + "num_input_tokens_seen": 24875912, + "step": 42885 + }, + { + "epoch": 6.38814417634793, + "grad_norm": 1.4976671934127808, + "learning_rate": 4.3019842181847456e-05, + "loss": 0.1917, + "num_input_tokens_seen": 24878984, + "step": 42890 + }, + { + "epoch": 6.388888888888889, + "grad_norm": 21.777589797973633, + "learning_rate": 4.301758969424187e-05, + "loss": 0.0811, + "num_input_tokens_seen": 24882440, + "step": 42895 + }, + { + "epoch": 6.389633601429848, + "grad_norm": 123.8636474609375, + "learning_rate": 4.301533690224741e-05, + "loss": 0.3065, + "num_input_tokens_seen": 24885416, + "step": 42900 + }, + { + "epoch": 6.390378313970807, + "grad_norm": 0.3228125274181366, + "learning_rate": 4.3013083805902156e-05, + "loss": 0.047, + "num_input_tokens_seen": 24888392, + "step": 42905 + }, + { + "epoch": 6.3911230265117664, + "grad_norm": 64.36266326904297, + "learning_rate": 4.301083040524415e-05, + "loss": 0.1624, + "num_input_tokens_seen": 24891368, + "step": 42910 + }, + { + "epoch": 6.391867739052726, + "grad_norm": 9.929183959960938, + "learning_rate": 4.3008576700311473e-05, + "loss": 0.2098, + "num_input_tokens_seen": 24894312, + "step": 42915 + }, + { + "epoch": 6.392612451593685, + "grad_norm": 42.59611892700195, + "learning_rate": 4.30063226911422e-05, + "loss": 0.2831, + "num_input_tokens_seen": 24897000, + "step": 42920 + }, + { + "epoch": 6.393357164134644, + "grad_norm": 0.030392851680517197, + "learning_rate": 4.30040683777744e-05, + "loss": 0.0279, + "num_input_tokens_seen": 24899528, + "step": 42925 + }, + { + "epoch": 6.394101876675603, + "grad_norm": 1.1963485479354858, + "learning_rate": 4.300181376024616e-05, + "loss": 0.0738, + "num_input_tokens_seen": 24902632, + "step": 42930 + }, + { + "epoch": 6.394846589216562, + "grad_norm": 0.01683652214705944, + "learning_rate": 4.299955883859558e-05, + "loss": 0.1324, + "num_input_tokens_seen": 24905352, + "step": 42935 + }, + { + "epoch": 6.395591301757522, + "grad_norm": 0.10160969942808151, + "learning_rate": 4.2997303612860746e-05, + "loss": 0.0554, + "num_input_tokens_seen": 24908520, + "step": 42940 + }, + { + "epoch": 6.39633601429848, + "grad_norm": 2.21073317527771, + "learning_rate": 4.299504808307976e-05, + "loss": 0.2077, + "num_input_tokens_seen": 24911240, + "step": 42945 + }, + { + "epoch": 6.39708072683944, + "grad_norm": 27.24057960510254, + "learning_rate": 4.299279224929072e-05, + "loss": 0.2329, + "num_input_tokens_seen": 24913960, + "step": 42950 + }, + { + "epoch": 6.397825439380399, + "grad_norm": 4.3699798583984375, + "learning_rate": 4.299053611153175e-05, + "loss": 0.0557, + "num_input_tokens_seen": 24917096, + "step": 42955 + }, + { + "epoch": 6.3985701519213585, + "grad_norm": 0.37696635723114014, + "learning_rate": 4.2988279669840945e-05, + "loss": 0.0187, + "num_input_tokens_seen": 24919944, + "step": 42960 + }, + { + "epoch": 6.399314864462317, + "grad_norm": 35.19267654418945, + "learning_rate": 4.298602292425645e-05, + "loss": 0.1554, + "num_input_tokens_seen": 24922696, + "step": 42965 + }, + { + "epoch": 6.400059577003277, + "grad_norm": 85.15020751953125, + "learning_rate": 4.298376587481637e-05, + "loss": 0.1475, + "num_input_tokens_seen": 24925448, + "step": 42970 + }, + { + "epoch": 6.400804289544236, + "grad_norm": 45.13504409790039, + "learning_rate": 4.2981508521558854e-05, + "loss": 0.0953, + "num_input_tokens_seen": 24927976, + "step": 42975 + }, + { + "epoch": 6.401549002085195, + "grad_norm": 17.76117706298828, + "learning_rate": 4.2979250864522016e-05, + "loss": 0.1968, + "num_input_tokens_seen": 24930824, + "step": 42980 + }, + { + "epoch": 6.402293714626154, + "grad_norm": 4.243358135223389, + "learning_rate": 4.297699290374401e-05, + "loss": 0.2478, + "num_input_tokens_seen": 24933448, + "step": 42985 + }, + { + "epoch": 6.403038427167114, + "grad_norm": 0.0726219117641449, + "learning_rate": 4.297473463926299e-05, + "loss": 0.265, + "num_input_tokens_seen": 24936616, + "step": 42990 + }, + { + "epoch": 6.4037831397080724, + "grad_norm": 0.03619968518614769, + "learning_rate": 4.2972476071117086e-05, + "loss": 0.3789, + "num_input_tokens_seen": 24939464, + "step": 42995 + }, + { + "epoch": 6.404527852249032, + "grad_norm": 26.194303512573242, + "learning_rate": 4.2970217199344465e-05, + "loss": 0.2274, + "num_input_tokens_seen": 24942792, + "step": 43000 + }, + { + "epoch": 6.405272564789991, + "grad_norm": 13.474360466003418, + "learning_rate": 4.296795802398329e-05, + "loss": 0.202, + "num_input_tokens_seen": 24945512, + "step": 43005 + }, + { + "epoch": 6.4060172773309505, + "grad_norm": 0.03233598545193672, + "learning_rate": 4.296569854507173e-05, + "loss": 0.1296, + "num_input_tokens_seen": 24948456, + "step": 43010 + }, + { + "epoch": 6.406761989871909, + "grad_norm": 0.025749802589416504, + "learning_rate": 4.2963438762647954e-05, + "loss": 0.0401, + "num_input_tokens_seen": 24951528, + "step": 43015 + }, + { + "epoch": 6.407506702412869, + "grad_norm": 1.9333240985870361, + "learning_rate": 4.2961178676750124e-05, + "loss": 0.1683, + "num_input_tokens_seen": 24954536, + "step": 43020 + }, + { + "epoch": 6.408251414953828, + "grad_norm": 1.6380256414413452, + "learning_rate": 4.295891828741645e-05, + "loss": 0.135, + "num_input_tokens_seen": 24957576, + "step": 43025 + }, + { + "epoch": 6.408996127494787, + "grad_norm": 9.65285587310791, + "learning_rate": 4.29566575946851e-05, + "loss": 0.1002, + "num_input_tokens_seen": 24960264, + "step": 43030 + }, + { + "epoch": 6.409740840035746, + "grad_norm": 10.764214515686035, + "learning_rate": 4.295439659859427e-05, + "loss": 0.5123, + "num_input_tokens_seen": 24963144, + "step": 43035 + }, + { + "epoch": 6.410485552576706, + "grad_norm": 0.03222702071070671, + "learning_rate": 4.2952135299182155e-05, + "loss": 0.1627, + "num_input_tokens_seen": 24965960, + "step": 43040 + }, + { + "epoch": 6.4112302651176645, + "grad_norm": 0.40820083022117615, + "learning_rate": 4.294987369648696e-05, + "loss": 0.4115, + "num_input_tokens_seen": 24968872, + "step": 43045 + }, + { + "epoch": 6.411974977658624, + "grad_norm": 5.070250988006592, + "learning_rate": 4.2947611790546894e-05, + "loss": 0.224, + "num_input_tokens_seen": 24971688, + "step": 43050 + }, + { + "epoch": 6.412719690199583, + "grad_norm": 0.5468531847000122, + "learning_rate": 4.2945349581400174e-05, + "loss": 0.0788, + "num_input_tokens_seen": 24974632, + "step": 43055 + }, + { + "epoch": 6.4134644027405425, + "grad_norm": 0.12345975637435913, + "learning_rate": 4.2943087069085e-05, + "loss": 0.1825, + "num_input_tokens_seen": 24977480, + "step": 43060 + }, + { + "epoch": 6.414209115281501, + "grad_norm": 32.276756286621094, + "learning_rate": 4.294082425363961e-05, + "loss": 0.4411, + "num_input_tokens_seen": 24980648, + "step": 43065 + }, + { + "epoch": 6.414953827822461, + "grad_norm": 16.34360122680664, + "learning_rate": 4.293856113510223e-05, + "loss": 0.07, + "num_input_tokens_seen": 24983624, + "step": 43070 + }, + { + "epoch": 6.41569854036342, + "grad_norm": 0.2918339967727661, + "learning_rate": 4.29362977135111e-05, + "loss": 0.0038, + "num_input_tokens_seen": 24986568, + "step": 43075 + }, + { + "epoch": 6.416443252904379, + "grad_norm": 35.41474151611328, + "learning_rate": 4.2934033988904437e-05, + "loss": 0.7265, + "num_input_tokens_seen": 24989128, + "step": 43080 + }, + { + "epoch": 6.417187965445338, + "grad_norm": 0.06159675866365433, + "learning_rate": 4.2931769961320504e-05, + "loss": 0.156, + "num_input_tokens_seen": 24991816, + "step": 43085 + }, + { + "epoch": 6.417932677986297, + "grad_norm": 4.8304219245910645, + "learning_rate": 4.292950563079754e-05, + "loss": 0.4902, + "num_input_tokens_seen": 24994792, + "step": 43090 + }, + { + "epoch": 6.4186773905272565, + "grad_norm": 5.476047039031982, + "learning_rate": 4.2927240997373795e-05, + "loss": 0.1862, + "num_input_tokens_seen": 24997512, + "step": 43095 + }, + { + "epoch": 6.419422103068215, + "grad_norm": 0.489362508058548, + "learning_rate": 4.292497606108754e-05, + "loss": 0.0243, + "num_input_tokens_seen": 25000264, + "step": 43100 + }, + { + "epoch": 6.420166815609175, + "grad_norm": 2.9381260871887207, + "learning_rate": 4.2922710821977044e-05, + "loss": 0.6236, + "num_input_tokens_seen": 25003208, + "step": 43105 + }, + { + "epoch": 6.420911528150134, + "grad_norm": 5.067258834838867, + "learning_rate": 4.2920445280080544e-05, + "loss": 0.2344, + "num_input_tokens_seen": 25006088, + "step": 43110 + }, + { + "epoch": 6.421656240691093, + "grad_norm": 0.7073650360107422, + "learning_rate": 4.291817943543634e-05, + "loss": 0.0194, + "num_input_tokens_seen": 25008712, + "step": 43115 + }, + { + "epoch": 6.422400953232052, + "grad_norm": 0.4996563494205475, + "learning_rate": 4.291591328808272e-05, + "loss": 0.0024, + "num_input_tokens_seen": 25011592, + "step": 43120 + }, + { + "epoch": 6.423145665773012, + "grad_norm": 2.3705108165740967, + "learning_rate": 4.291364683805794e-05, + "loss": 0.2148, + "num_input_tokens_seen": 25014376, + "step": 43125 + }, + { + "epoch": 6.4238903783139705, + "grad_norm": 27.956819534301758, + "learning_rate": 4.291138008540031e-05, + "loss": 0.2285, + "num_input_tokens_seen": 25017320, + "step": 43130 + }, + { + "epoch": 6.42463509085493, + "grad_norm": 0.15223269164562225, + "learning_rate": 4.2909113030148106e-05, + "loss": 0.0924, + "num_input_tokens_seen": 25020264, + "step": 43135 + }, + { + "epoch": 6.425379803395889, + "grad_norm": 0.5471738576889038, + "learning_rate": 4.290684567233965e-05, + "loss": 0.0061, + "num_input_tokens_seen": 25023144, + "step": 43140 + }, + { + "epoch": 6.4261245159368485, + "grad_norm": 60.62384796142578, + "learning_rate": 4.2904578012013233e-05, + "loss": 0.1319, + "num_input_tokens_seen": 25026216, + "step": 43145 + }, + { + "epoch": 6.426869228477807, + "grad_norm": 2.71195912361145, + "learning_rate": 4.290231004920717e-05, + "loss": 0.291, + "num_input_tokens_seen": 25029128, + "step": 43150 + }, + { + "epoch": 6.427613941018767, + "grad_norm": 0.05380573868751526, + "learning_rate": 4.2900041783959775e-05, + "loss": 0.0211, + "num_input_tokens_seen": 25032104, + "step": 43155 + }, + { + "epoch": 6.428358653559726, + "grad_norm": 80.77288818359375, + "learning_rate": 4.2897773216309366e-05, + "loss": 0.6253, + "num_input_tokens_seen": 25034856, + "step": 43160 + }, + { + "epoch": 6.429103366100685, + "grad_norm": 32.86398696899414, + "learning_rate": 4.289550434629426e-05, + "loss": 0.3437, + "num_input_tokens_seen": 25037768, + "step": 43165 + }, + { + "epoch": 6.429848078641644, + "grad_norm": 33.036861419677734, + "learning_rate": 4.2893235173952805e-05, + "loss": 0.0812, + "num_input_tokens_seen": 25040616, + "step": 43170 + }, + { + "epoch": 6.430592791182604, + "grad_norm": 10.168660163879395, + "learning_rate": 4.2890965699323335e-05, + "loss": 0.456, + "num_input_tokens_seen": 25043464, + "step": 43175 + }, + { + "epoch": 6.4313375037235625, + "grad_norm": 0.9890889525413513, + "learning_rate": 4.288869592244417e-05, + "loss": 0.2936, + "num_input_tokens_seen": 25046472, + "step": 43180 + }, + { + "epoch": 6.432082216264522, + "grad_norm": 194.2267608642578, + "learning_rate": 4.288642584335367e-05, + "loss": 0.355, + "num_input_tokens_seen": 25049512, + "step": 43185 + }, + { + "epoch": 6.432826928805481, + "grad_norm": 0.065342977643013, + "learning_rate": 4.2884155462090194e-05, + "loss": 0.2515, + "num_input_tokens_seen": 25052296, + "step": 43190 + }, + { + "epoch": 6.4335716413464406, + "grad_norm": 13.852252006530762, + "learning_rate": 4.2881884778692076e-05, + "loss": 0.2759, + "num_input_tokens_seen": 25055304, + "step": 43195 + }, + { + "epoch": 6.434316353887399, + "grad_norm": 0.014553923159837723, + "learning_rate": 4.287961379319769e-05, + "loss": 0.0497, + "num_input_tokens_seen": 25058024, + "step": 43200 + }, + { + "epoch": 6.435061066428359, + "grad_norm": 0.06597871333360672, + "learning_rate": 4.287734250564541e-05, + "loss": 0.2855, + "num_input_tokens_seen": 25060968, + "step": 43205 + }, + { + "epoch": 6.435805778969318, + "grad_norm": 8.2512788772583, + "learning_rate": 4.28750709160736e-05, + "loss": 0.2187, + "num_input_tokens_seen": 25063752, + "step": 43210 + }, + { + "epoch": 6.436550491510277, + "grad_norm": 2.3286564350128174, + "learning_rate": 4.2872799024520626e-05, + "loss": 0.2072, + "num_input_tokens_seen": 25066728, + "step": 43215 + }, + { + "epoch": 6.437295204051236, + "grad_norm": 0.03314682096242905, + "learning_rate": 4.287052683102488e-05, + "loss": 0.1652, + "num_input_tokens_seen": 25069512, + "step": 43220 + }, + { + "epoch": 6.438039916592196, + "grad_norm": 0.04169129580259323, + "learning_rate": 4.286825433562474e-05, + "loss": 0.0208, + "num_input_tokens_seen": 25072520, + "step": 43225 + }, + { + "epoch": 6.4387846291331545, + "grad_norm": 0.11293169111013412, + "learning_rate": 4.286598153835861e-05, + "loss": 0.3981, + "num_input_tokens_seen": 25075336, + "step": 43230 + }, + { + "epoch": 6.439529341674114, + "grad_norm": 14.91911792755127, + "learning_rate": 4.2863708439264886e-05, + "loss": 0.2461, + "num_input_tokens_seen": 25078152, + "step": 43235 + }, + { + "epoch": 6.440274054215073, + "grad_norm": 0.038943540304899216, + "learning_rate": 4.286143503838195e-05, + "loss": 0.4781, + "num_input_tokens_seen": 25080840, + "step": 43240 + }, + { + "epoch": 6.441018766756033, + "grad_norm": 3.9421231746673584, + "learning_rate": 4.285916133574823e-05, + "loss": 0.3233, + "num_input_tokens_seen": 25083624, + "step": 43245 + }, + { + "epoch": 6.441763479296991, + "grad_norm": 9.105741500854492, + "learning_rate": 4.2856887331402126e-05, + "loss": 0.3576, + "num_input_tokens_seen": 25086504, + "step": 43250 + }, + { + "epoch": 6.44250819183795, + "grad_norm": 0.08085062354803085, + "learning_rate": 4.285461302538207e-05, + "loss": 0.1118, + "num_input_tokens_seen": 25089384, + "step": 43255 + }, + { + "epoch": 6.44325290437891, + "grad_norm": 4.992698669433594, + "learning_rate": 4.285233841772647e-05, + "loss": 0.2283, + "num_input_tokens_seen": 25092232, + "step": 43260 + }, + { + "epoch": 6.443997616919869, + "grad_norm": 19.75090217590332, + "learning_rate": 4.2850063508473746e-05, + "loss": 0.2806, + "num_input_tokens_seen": 25095176, + "step": 43265 + }, + { + "epoch": 6.444742329460828, + "grad_norm": 0.2522783875465393, + "learning_rate": 4.284778829766235e-05, + "loss": 0.2405, + "num_input_tokens_seen": 25098184, + "step": 43270 + }, + { + "epoch": 6.445487042001787, + "grad_norm": 0.026331085711717606, + "learning_rate": 4.284551278533071e-05, + "loss": 0.1696, + "num_input_tokens_seen": 25101032, + "step": 43275 + }, + { + "epoch": 6.4462317545427466, + "grad_norm": 38.138309478759766, + "learning_rate": 4.284323697151726e-05, + "loss": 0.3599, + "num_input_tokens_seen": 25103752, + "step": 43280 + }, + { + "epoch": 6.446976467083705, + "grad_norm": 0.2874976396560669, + "learning_rate": 4.284096085626047e-05, + "loss": 0.1566, + "num_input_tokens_seen": 25106536, + "step": 43285 + }, + { + "epoch": 6.447721179624665, + "grad_norm": 31.486141204833984, + "learning_rate": 4.283868443959877e-05, + "loss": 0.3115, + "num_input_tokens_seen": 25109224, + "step": 43290 + }, + { + "epoch": 6.448465892165624, + "grad_norm": 0.061456311494112015, + "learning_rate": 4.283640772157064e-05, + "loss": 0.1033, + "num_input_tokens_seen": 25112392, + "step": 43295 + }, + { + "epoch": 6.449210604706583, + "grad_norm": 8.396854400634766, + "learning_rate": 4.283413070221452e-05, + "loss": 0.129, + "num_input_tokens_seen": 25115624, + "step": 43300 + }, + { + "epoch": 6.449955317247542, + "grad_norm": 8.946242332458496, + "learning_rate": 4.283185338156888e-05, + "loss": 0.3065, + "num_input_tokens_seen": 25118408, + "step": 43305 + }, + { + "epoch": 6.450700029788502, + "grad_norm": 53.587074279785156, + "learning_rate": 4.282957575967221e-05, + "loss": 0.6325, + "num_input_tokens_seen": 25121448, + "step": 43310 + }, + { + "epoch": 6.4514447423294605, + "grad_norm": 99.64778137207031, + "learning_rate": 4.282729783656298e-05, + "loss": 0.1764, + "num_input_tokens_seen": 25124424, + "step": 43315 + }, + { + "epoch": 6.45218945487042, + "grad_norm": 16.97294807434082, + "learning_rate": 4.2825019612279666e-05, + "loss": 0.6351, + "num_input_tokens_seen": 25127112, + "step": 43320 + }, + { + "epoch": 6.452934167411379, + "grad_norm": 0.06224555894732475, + "learning_rate": 4.282274108686076e-05, + "loss": 0.0054, + "num_input_tokens_seen": 25129672, + "step": 43325 + }, + { + "epoch": 6.453678879952339, + "grad_norm": 0.038814105093479156, + "learning_rate": 4.282046226034476e-05, + "loss": 0.0822, + "num_input_tokens_seen": 25132648, + "step": 43330 + }, + { + "epoch": 6.454423592493297, + "grad_norm": 29.14788246154785, + "learning_rate": 4.2818183132770175e-05, + "loss": 0.3921, + "num_input_tokens_seen": 25135752, + "step": 43335 + }, + { + "epoch": 6.455168305034257, + "grad_norm": 23.298736572265625, + "learning_rate": 4.281590370417548e-05, + "loss": 0.195, + "num_input_tokens_seen": 25138760, + "step": 43340 + }, + { + "epoch": 6.455913017575216, + "grad_norm": 0.25492364168167114, + "learning_rate": 4.28136239745992e-05, + "loss": 0.2475, + "num_input_tokens_seen": 25141832, + "step": 43345 + }, + { + "epoch": 6.456657730116175, + "grad_norm": 7.93395471572876, + "learning_rate": 4.2811343944079855e-05, + "loss": 0.5329, + "num_input_tokens_seen": 25144744, + "step": 43350 + }, + { + "epoch": 6.457402442657134, + "grad_norm": 26.296998977661133, + "learning_rate": 4.280906361265595e-05, + "loss": 0.3435, + "num_input_tokens_seen": 25147688, + "step": 43355 + }, + { + "epoch": 6.458147155198094, + "grad_norm": 16.362293243408203, + "learning_rate": 4.2806782980366025e-05, + "loss": 0.1049, + "num_input_tokens_seen": 25150664, + "step": 43360 + }, + { + "epoch": 6.4588918677390526, + "grad_norm": 33.00114822387695, + "learning_rate": 4.2804502047248594e-05, + "loss": 0.4561, + "num_input_tokens_seen": 25153544, + "step": 43365 + }, + { + "epoch": 6.459636580280012, + "grad_norm": 28.339366912841797, + "learning_rate": 4.2802220813342194e-05, + "loss": 0.3084, + "num_input_tokens_seen": 25156712, + "step": 43370 + }, + { + "epoch": 6.460381292820971, + "grad_norm": 20.134605407714844, + "learning_rate": 4.2799939278685376e-05, + "loss": 0.1528, + "num_input_tokens_seen": 25159720, + "step": 43375 + }, + { + "epoch": 6.461126005361931, + "grad_norm": 2.177978754043579, + "learning_rate": 4.279765744331666e-05, + "loss": 0.0703, + "num_input_tokens_seen": 25162600, + "step": 43380 + }, + { + "epoch": 6.461870717902889, + "grad_norm": 16.611949920654297, + "learning_rate": 4.2795375307274624e-05, + "loss": 0.0905, + "num_input_tokens_seen": 25165896, + "step": 43385 + }, + { + "epoch": 6.462615430443849, + "grad_norm": 11.253965377807617, + "learning_rate": 4.2793092870597804e-05, + "loss": 0.3537, + "num_input_tokens_seen": 25168680, + "step": 43390 + }, + { + "epoch": 6.463360142984808, + "grad_norm": 2.5986456871032715, + "learning_rate": 4.279081013332476e-05, + "loss": 0.1461, + "num_input_tokens_seen": 25171912, + "step": 43395 + }, + { + "epoch": 6.464104855525767, + "grad_norm": 3.647408962249756, + "learning_rate": 4.278852709549406e-05, + "loss": 0.1023, + "num_input_tokens_seen": 25174952, + "step": 43400 + }, + { + "epoch": 6.464849568066726, + "grad_norm": 0.03065156750380993, + "learning_rate": 4.2786243757144284e-05, + "loss": 0.1622, + "num_input_tokens_seen": 25177640, + "step": 43405 + }, + { + "epoch": 6.465594280607686, + "grad_norm": 3.7408645153045654, + "learning_rate": 4.278396011831399e-05, + "loss": 0.2542, + "num_input_tokens_seen": 25180584, + "step": 43410 + }, + { + "epoch": 6.466338993148645, + "grad_norm": 24.396642684936523, + "learning_rate": 4.2781676179041764e-05, + "loss": 0.397, + "num_input_tokens_seen": 25183432, + "step": 43415 + }, + { + "epoch": 6.467083705689604, + "grad_norm": 50.360755920410156, + "learning_rate": 4.2779391939366194e-05, + "loss": 0.2549, + "num_input_tokens_seen": 25186504, + "step": 43420 + }, + { + "epoch": 6.467828418230563, + "grad_norm": 20.90648078918457, + "learning_rate": 4.277710739932586e-05, + "loss": 0.0756, + "num_input_tokens_seen": 25188904, + "step": 43425 + }, + { + "epoch": 6.468573130771523, + "grad_norm": 0.9934497475624084, + "learning_rate": 4.277482255895937e-05, + "loss": 0.0344, + "num_input_tokens_seen": 25191688, + "step": 43430 + }, + { + "epoch": 6.469317843312481, + "grad_norm": 0.21331115067005157, + "learning_rate": 4.277253741830532e-05, + "loss": 0.0661, + "num_input_tokens_seen": 25194600, + "step": 43435 + }, + { + "epoch": 6.47006255585344, + "grad_norm": 2.144324541091919, + "learning_rate": 4.2770251977402314e-05, + "loss": 0.1436, + "num_input_tokens_seen": 25197288, + "step": 43440 + }, + { + "epoch": 6.4708072683944, + "grad_norm": 0.020559700205922127, + "learning_rate": 4.2767966236288956e-05, + "loss": 0.1402, + "num_input_tokens_seen": 25200456, + "step": 43445 + }, + { + "epoch": 6.4715519809353586, + "grad_norm": 0.39003026485443115, + "learning_rate": 4.276568019500388e-05, + "loss": 0.0034, + "num_input_tokens_seen": 25203304, + "step": 43450 + }, + { + "epoch": 6.472296693476318, + "grad_norm": 70.15709686279297, + "learning_rate": 4.276339385358568e-05, + "loss": 0.3766, + "num_input_tokens_seen": 25206024, + "step": 43455 + }, + { + "epoch": 6.473041406017277, + "grad_norm": 0.020858611911535263, + "learning_rate": 4.2761107212073e-05, + "loss": 0.0799, + "num_input_tokens_seen": 25208680, + "step": 43460 + }, + { + "epoch": 6.473786118558237, + "grad_norm": 0.0803077220916748, + "learning_rate": 4.275882027050446e-05, + "loss": 0.3423, + "num_input_tokens_seen": 25211688, + "step": 43465 + }, + { + "epoch": 6.474530831099195, + "grad_norm": 0.1655217409133911, + "learning_rate": 4.275653302891871e-05, + "loss": 0.0532, + "num_input_tokens_seen": 25214696, + "step": 43470 + }, + { + "epoch": 6.475275543640155, + "grad_norm": 25.91925811767578, + "learning_rate": 4.275424548735437e-05, + "loss": 0.036, + "num_input_tokens_seen": 25217480, + "step": 43475 + }, + { + "epoch": 6.476020256181114, + "grad_norm": 0.012370208278298378, + "learning_rate": 4.27519576458501e-05, + "loss": 0.042, + "num_input_tokens_seen": 25220360, + "step": 43480 + }, + { + "epoch": 6.476764968722073, + "grad_norm": 0.06084681674838066, + "learning_rate": 4.274966950444456e-05, + "loss": 0.1741, + "num_input_tokens_seen": 25223112, + "step": 43485 + }, + { + "epoch": 6.477509681263032, + "grad_norm": 4.925465106964111, + "learning_rate": 4.2747381063176384e-05, + "loss": 0.1322, + "num_input_tokens_seen": 25225960, + "step": 43490 + }, + { + "epoch": 6.478254393803992, + "grad_norm": 0.016326311975717545, + "learning_rate": 4.274509232208425e-05, + "loss": 0.2139, + "num_input_tokens_seen": 25228616, + "step": 43495 + }, + { + "epoch": 6.478999106344951, + "grad_norm": 2.1132092475891113, + "learning_rate": 4.274280328120681e-05, + "loss": 0.064, + "num_input_tokens_seen": 25231560, + "step": 43500 + }, + { + "epoch": 6.47974381888591, + "grad_norm": 20.5664005279541, + "learning_rate": 4.274051394058274e-05, + "loss": 0.3968, + "num_input_tokens_seen": 25234376, + "step": 43505 + }, + { + "epoch": 6.480488531426869, + "grad_norm": 28.97974395751953, + "learning_rate": 4.273822430025072e-05, + "loss": 0.2499, + "num_input_tokens_seen": 25237288, + "step": 43510 + }, + { + "epoch": 6.481233243967829, + "grad_norm": 0.009068859741091728, + "learning_rate": 4.2735934360249426e-05, + "loss": 0.0561, + "num_input_tokens_seen": 25239816, + "step": 43515 + }, + { + "epoch": 6.481977956508787, + "grad_norm": 0.04367467015981674, + "learning_rate": 4.2733644120617547e-05, + "loss": 0.0974, + "num_input_tokens_seen": 25242632, + "step": 43520 + }, + { + "epoch": 6.482722669049747, + "grad_norm": 8.054722785949707, + "learning_rate": 4.273135358139377e-05, + "loss": 0.1613, + "num_input_tokens_seen": 25245480, + "step": 43525 + }, + { + "epoch": 6.483467381590706, + "grad_norm": 44.44852066040039, + "learning_rate": 4.272906274261681e-05, + "loss": 0.1139, + "num_input_tokens_seen": 25248360, + "step": 43530 + }, + { + "epoch": 6.484212094131665, + "grad_norm": 4.22199821472168, + "learning_rate": 4.2726771604325346e-05, + "loss": 0.4507, + "num_input_tokens_seen": 25251208, + "step": 43535 + }, + { + "epoch": 6.484956806672624, + "grad_norm": 0.0549265556037426, + "learning_rate": 4.272448016655809e-05, + "loss": 0.1519, + "num_input_tokens_seen": 25254056, + "step": 43540 + }, + { + "epoch": 6.485701519213584, + "grad_norm": 0.009765123948454857, + "learning_rate": 4.272218842935376e-05, + "loss": 0.2259, + "num_input_tokens_seen": 25256808, + "step": 43545 + }, + { + "epoch": 6.486446231754543, + "grad_norm": 16.032840728759766, + "learning_rate": 4.271989639275107e-05, + "loss": 0.39, + "num_input_tokens_seen": 25259752, + "step": 43550 + }, + { + "epoch": 6.487190944295502, + "grad_norm": 0.48475295305252075, + "learning_rate": 4.271760405678874e-05, + "loss": 0.0823, + "num_input_tokens_seen": 25262984, + "step": 43555 + }, + { + "epoch": 6.487935656836461, + "grad_norm": 0.02361009269952774, + "learning_rate": 4.2715311421505486e-05, + "loss": 0.0492, + "num_input_tokens_seen": 25265608, + "step": 43560 + }, + { + "epoch": 6.488680369377421, + "grad_norm": 117.4397964477539, + "learning_rate": 4.271301848694006e-05, + "loss": 0.2361, + "num_input_tokens_seen": 25268680, + "step": 43565 + }, + { + "epoch": 6.489425081918379, + "grad_norm": 0.23173561692237854, + "learning_rate": 4.271072525313119e-05, + "loss": 0.2262, + "num_input_tokens_seen": 25271720, + "step": 43570 + }, + { + "epoch": 6.490169794459339, + "grad_norm": 2.688600540161133, + "learning_rate": 4.2708431720117614e-05, + "loss": 0.2183, + "num_input_tokens_seen": 25274504, + "step": 43575 + }, + { + "epoch": 6.490914507000298, + "grad_norm": 33.41346740722656, + "learning_rate": 4.270613788793808e-05, + "loss": 0.1805, + "num_input_tokens_seen": 25277416, + "step": 43580 + }, + { + "epoch": 6.4916592195412575, + "grad_norm": 0.03100133314728737, + "learning_rate": 4.2703843756631344e-05, + "loss": 0.1247, + "num_input_tokens_seen": 25280328, + "step": 43585 + }, + { + "epoch": 6.492403932082216, + "grad_norm": 0.2581264078617096, + "learning_rate": 4.270154932623617e-05, + "loss": 0.0114, + "num_input_tokens_seen": 25283240, + "step": 43590 + }, + { + "epoch": 6.493148644623176, + "grad_norm": 14.63115119934082, + "learning_rate": 4.26992545967913e-05, + "loss": 0.3159, + "num_input_tokens_seen": 25286376, + "step": 43595 + }, + { + "epoch": 6.493893357164135, + "grad_norm": 0.3721567392349243, + "learning_rate": 4.2696959568335515e-05, + "loss": 0.3486, + "num_input_tokens_seen": 25289768, + "step": 43600 + }, + { + "epoch": 6.494638069705093, + "grad_norm": 8.978968620300293, + "learning_rate": 4.2694664240907586e-05, + "loss": 0.2756, + "num_input_tokens_seen": 25292584, + "step": 43605 + }, + { + "epoch": 6.495382782246053, + "grad_norm": 51.965789794921875, + "learning_rate": 4.269236861454629e-05, + "loss": 0.1789, + "num_input_tokens_seen": 25295560, + "step": 43610 + }, + { + "epoch": 6.496127494787013, + "grad_norm": 21.270540237426758, + "learning_rate": 4.2690072689290405e-05, + "loss": 0.0615, + "num_input_tokens_seen": 25298408, + "step": 43615 + }, + { + "epoch": 6.496872207327971, + "grad_norm": 0.03238016366958618, + "learning_rate": 4.268777646517872e-05, + "loss": 0.0259, + "num_input_tokens_seen": 25301480, + "step": 43620 + }, + { + "epoch": 6.49761691986893, + "grad_norm": 0.03236425668001175, + "learning_rate": 4.268547994225003e-05, + "loss": 0.2388, + "num_input_tokens_seen": 25304264, + "step": 43625 + }, + { + "epoch": 6.49836163240989, + "grad_norm": 25.644001007080078, + "learning_rate": 4.2683183120543134e-05, + "loss": 0.1872, + "num_input_tokens_seen": 25307016, + "step": 43630 + }, + { + "epoch": 6.499106344950849, + "grad_norm": 1.738031268119812, + "learning_rate": 4.2680886000096834e-05, + "loss": 0.1504, + "num_input_tokens_seen": 25309864, + "step": 43635 + }, + { + "epoch": 6.499851057491808, + "grad_norm": 22.50457763671875, + "learning_rate": 4.267858858094993e-05, + "loss": 0.2163, + "num_input_tokens_seen": 25312648, + "step": 43640 + }, + { + "epoch": 6.500595770032767, + "grad_norm": 0.25769439339637756, + "learning_rate": 4.267629086314123e-05, + "loss": 0.2164, + "num_input_tokens_seen": 25315592, + "step": 43645 + }, + { + "epoch": 6.501340482573727, + "grad_norm": 49.394290924072266, + "learning_rate": 4.2673992846709574e-05, + "loss": 0.4762, + "num_input_tokens_seen": 25318568, + "step": 43650 + }, + { + "epoch": 6.502085195114685, + "grad_norm": 8.016697883605957, + "learning_rate": 4.267169453169377e-05, + "loss": 0.1573, + "num_input_tokens_seen": 25321384, + "step": 43655 + }, + { + "epoch": 6.502829907655645, + "grad_norm": 11.372432708740234, + "learning_rate": 4.266939591813265e-05, + "loss": 0.055, + "num_input_tokens_seen": 25324232, + "step": 43660 + }, + { + "epoch": 6.503574620196604, + "grad_norm": 0.025007907301187515, + "learning_rate": 4.266709700606504e-05, + "loss": 0.1282, + "num_input_tokens_seen": 25327208, + "step": 43665 + }, + { + "epoch": 6.5043193327375635, + "grad_norm": 0.2652042508125305, + "learning_rate": 4.266479779552979e-05, + "loss": 0.0286, + "num_input_tokens_seen": 25330024, + "step": 43670 + }, + { + "epoch": 6.505064045278522, + "grad_norm": 4.581120014190674, + "learning_rate": 4.266249828656572e-05, + "loss": 0.3529, + "num_input_tokens_seen": 25332936, + "step": 43675 + }, + { + "epoch": 6.505808757819482, + "grad_norm": 12.470114707946777, + "learning_rate": 4.2660198479211705e-05, + "loss": 0.2196, + "num_input_tokens_seen": 25335368, + "step": 43680 + }, + { + "epoch": 6.506553470360441, + "grad_norm": 9.072545051574707, + "learning_rate": 4.265789837350658e-05, + "loss": 0.0437, + "num_input_tokens_seen": 25338280, + "step": 43685 + }, + { + "epoch": 6.5072981829014, + "grad_norm": 14.535017967224121, + "learning_rate": 4.2655597969489216e-05, + "loss": 0.0843, + "num_input_tokens_seen": 25341320, + "step": 43690 + }, + { + "epoch": 6.508042895442359, + "grad_norm": 19.193862915039062, + "learning_rate": 4.265329726719845e-05, + "loss": 0.084, + "num_input_tokens_seen": 25344168, + "step": 43695 + }, + { + "epoch": 6.508787607983319, + "grad_norm": 9.682734489440918, + "learning_rate": 4.2650996266673197e-05, + "loss": 0.3499, + "num_input_tokens_seen": 25347048, + "step": 43700 + }, + { + "epoch": 6.509532320524277, + "grad_norm": 0.03117206133902073, + "learning_rate": 4.264869496795229e-05, + "loss": 0.2495, + "num_input_tokens_seen": 25349960, + "step": 43705 + }, + { + "epoch": 6.510277033065237, + "grad_norm": 0.017458025366067886, + "learning_rate": 4.264639337107461e-05, + "loss": 0.0837, + "num_input_tokens_seen": 25352552, + "step": 43710 + }, + { + "epoch": 6.511021745606196, + "grad_norm": 0.005059836897999048, + "learning_rate": 4.264409147607905e-05, + "loss": 0.0019, + "num_input_tokens_seen": 25355624, + "step": 43715 + }, + { + "epoch": 6.5117664581471555, + "grad_norm": 12.0726318359375, + "learning_rate": 4.264178928300451e-05, + "loss": 0.6823, + "num_input_tokens_seen": 25358344, + "step": 43720 + }, + { + "epoch": 6.512511170688114, + "grad_norm": 19.947647094726562, + "learning_rate": 4.263948679188986e-05, + "loss": 0.2524, + "num_input_tokens_seen": 25361032, + "step": 43725 + }, + { + "epoch": 6.513255883229074, + "grad_norm": 0.04606516659259796, + "learning_rate": 4.263718400277401e-05, + "loss": 0.0283, + "num_input_tokens_seen": 25363944, + "step": 43730 + }, + { + "epoch": 6.514000595770033, + "grad_norm": 40.522335052490234, + "learning_rate": 4.263488091569586e-05, + "loss": 0.2792, + "num_input_tokens_seen": 25366760, + "step": 43735 + }, + { + "epoch": 6.514745308310992, + "grad_norm": 36.809776306152344, + "learning_rate": 4.263257753069432e-05, + "loss": 0.3258, + "num_input_tokens_seen": 25369672, + "step": 43740 + }, + { + "epoch": 6.515490020851951, + "grad_norm": 23.22444725036621, + "learning_rate": 4.263027384780831e-05, + "loss": 0.0606, + "num_input_tokens_seen": 25372520, + "step": 43745 + }, + { + "epoch": 6.516234733392911, + "grad_norm": 21.998836517333984, + "learning_rate": 4.2627969867076736e-05, + "loss": 0.4091, + "num_input_tokens_seen": 25375176, + "step": 43750 + }, + { + "epoch": 6.5169794459338695, + "grad_norm": 128.1205596923828, + "learning_rate": 4.2625665588538534e-05, + "loss": 0.4765, + "num_input_tokens_seen": 25378216, + "step": 43755 + }, + { + "epoch": 6.517724158474829, + "grad_norm": 5.076225280761719, + "learning_rate": 4.262336101223262e-05, + "loss": 0.1145, + "num_input_tokens_seen": 25381160, + "step": 43760 + }, + { + "epoch": 6.518468871015788, + "grad_norm": 33.714664459228516, + "learning_rate": 4.2621056138197936e-05, + "loss": 0.1149, + "num_input_tokens_seen": 25383784, + "step": 43765 + }, + { + "epoch": 6.519213583556747, + "grad_norm": 20.086851119995117, + "learning_rate": 4.261875096647341e-05, + "loss": 0.1905, + "num_input_tokens_seen": 25386344, + "step": 43770 + }, + { + "epoch": 6.519958296097706, + "grad_norm": 14.852140426635742, + "learning_rate": 4.2616445497098e-05, + "loss": 0.2388, + "num_input_tokens_seen": 25389448, + "step": 43775 + }, + { + "epoch": 6.520703008638666, + "grad_norm": 42.22060012817383, + "learning_rate": 4.261413973011065e-05, + "loss": 0.263, + "num_input_tokens_seen": 25392552, + "step": 43780 + }, + { + "epoch": 6.521447721179625, + "grad_norm": 0.649904191493988, + "learning_rate": 4.261183366555032e-05, + "loss": 0.3644, + "num_input_tokens_seen": 25395400, + "step": 43785 + }, + { + "epoch": 6.522192433720583, + "grad_norm": 0.3994770050048828, + "learning_rate": 4.260952730345594e-05, + "loss": 0.0816, + "num_input_tokens_seen": 25398216, + "step": 43790 + }, + { + "epoch": 6.522937146261543, + "grad_norm": 13.830021858215332, + "learning_rate": 4.260722064386651e-05, + "loss": 0.3535, + "num_input_tokens_seen": 25401256, + "step": 43795 + }, + { + "epoch": 6.523681858802503, + "grad_norm": 2.9272069931030273, + "learning_rate": 4.2604913686820966e-05, + "loss": 0.2829, + "num_input_tokens_seen": 25404008, + "step": 43800 + }, + { + "epoch": 6.5244265713434615, + "grad_norm": 4.741625785827637, + "learning_rate": 4.260260643235831e-05, + "loss": 0.2315, + "num_input_tokens_seen": 25406824, + "step": 43805 + }, + { + "epoch": 6.52517128388442, + "grad_norm": 13.352535247802734, + "learning_rate": 4.260029888051751e-05, + "loss": 0.0792, + "num_input_tokens_seen": 25409960, + "step": 43810 + }, + { + "epoch": 6.52591599642538, + "grad_norm": 6.55194091796875, + "learning_rate": 4.259799103133754e-05, + "loss": 0.2612, + "num_input_tokens_seen": 25412520, + "step": 43815 + }, + { + "epoch": 6.526660708966339, + "grad_norm": 0.524082362651825, + "learning_rate": 4.25956828848574e-05, + "loss": 0.208, + "num_input_tokens_seen": 25415528, + "step": 43820 + }, + { + "epoch": 6.527405421507298, + "grad_norm": 8.589096069335938, + "learning_rate": 4.259337444111609e-05, + "loss": 0.2506, + "num_input_tokens_seen": 25418248, + "step": 43825 + }, + { + "epoch": 6.528150134048257, + "grad_norm": 2.8881354331970215, + "learning_rate": 4.259106570015259e-05, + "loss": 0.3695, + "num_input_tokens_seen": 25421256, + "step": 43830 + }, + { + "epoch": 6.528894846589217, + "grad_norm": 78.27947998046875, + "learning_rate": 4.2588756662005926e-05, + "loss": 0.1103, + "num_input_tokens_seen": 25423976, + "step": 43835 + }, + { + "epoch": 6.5296395591301755, + "grad_norm": 10.699806213378906, + "learning_rate": 4.258644732671508e-05, + "loss": 0.0716, + "num_input_tokens_seen": 25426568, + "step": 43840 + }, + { + "epoch": 6.530384271671135, + "grad_norm": 21.891796112060547, + "learning_rate": 4.258413769431908e-05, + "loss": 0.4243, + "num_input_tokens_seen": 25429416, + "step": 43845 + }, + { + "epoch": 6.531128984212094, + "grad_norm": 0.08761393278837204, + "learning_rate": 4.2581827764856955e-05, + "loss": 0.247, + "num_input_tokens_seen": 25432232, + "step": 43850 + }, + { + "epoch": 6.5318736967530535, + "grad_norm": 16.224834442138672, + "learning_rate": 4.257951753836772e-05, + "loss": 0.0151, + "num_input_tokens_seen": 25434888, + "step": 43855 + }, + { + "epoch": 6.532618409294012, + "grad_norm": 13.294926643371582, + "learning_rate": 4.2577207014890394e-05, + "loss": 0.2826, + "num_input_tokens_seen": 25437672, + "step": 43860 + }, + { + "epoch": 6.533363121834972, + "grad_norm": 0.19265076518058777, + "learning_rate": 4.2574896194464033e-05, + "loss": 0.2453, + "num_input_tokens_seen": 25440424, + "step": 43865 + }, + { + "epoch": 6.534107834375931, + "grad_norm": 15.574368476867676, + "learning_rate": 4.2572585077127654e-05, + "loss": 0.3469, + "num_input_tokens_seen": 25443752, + "step": 43870 + }, + { + "epoch": 6.53485254691689, + "grad_norm": 1.1983375549316406, + "learning_rate": 4.2570273662920315e-05, + "loss": 0.0807, + "num_input_tokens_seen": 25446504, + "step": 43875 + }, + { + "epoch": 6.535597259457849, + "grad_norm": 0.3533165156841278, + "learning_rate": 4.2567961951881053e-05, + "loss": 0.1879, + "num_input_tokens_seen": 25449448, + "step": 43880 + }, + { + "epoch": 6.536341971998809, + "grad_norm": 0.2261359691619873, + "learning_rate": 4.256564994404893e-05, + "loss": 0.4681, + "num_input_tokens_seen": 25452488, + "step": 43885 + }, + { + "epoch": 6.5370866845397675, + "grad_norm": 22.15509033203125, + "learning_rate": 4.2563337639463005e-05, + "loss": 0.1502, + "num_input_tokens_seen": 25455528, + "step": 43890 + }, + { + "epoch": 6.537831397080727, + "grad_norm": 73.31995391845703, + "learning_rate": 4.256102503816234e-05, + "loss": 0.1554, + "num_input_tokens_seen": 25458408, + "step": 43895 + }, + { + "epoch": 6.538576109621686, + "grad_norm": 3.1188390254974365, + "learning_rate": 4.255871214018601e-05, + "loss": 0.0157, + "num_input_tokens_seen": 25461064, + "step": 43900 + }, + { + "epoch": 6.5393208221626455, + "grad_norm": 0.4494670331478119, + "learning_rate": 4.255639894557309e-05, + "loss": 0.1928, + "num_input_tokens_seen": 25463944, + "step": 43905 + }, + { + "epoch": 6.540065534703604, + "grad_norm": 0.08760040998458862, + "learning_rate": 4.255408545436264e-05, + "loss": 0.1702, + "num_input_tokens_seen": 25466696, + "step": 43910 + }, + { + "epoch": 6.540810247244564, + "grad_norm": 0.6075748801231384, + "learning_rate": 4.255177166659376e-05, + "loss": 0.1346, + "num_input_tokens_seen": 25469576, + "step": 43915 + }, + { + "epoch": 6.541554959785523, + "grad_norm": 71.25157165527344, + "learning_rate": 4.254945758230554e-05, + "loss": 0.3165, + "num_input_tokens_seen": 25472648, + "step": 43920 + }, + { + "epoch": 6.542299672326482, + "grad_norm": 0.02185927890241146, + "learning_rate": 4.254714320153708e-05, + "loss": 0.0827, + "num_input_tokens_seen": 25475752, + "step": 43925 + }, + { + "epoch": 6.543044384867441, + "grad_norm": 51.24417495727539, + "learning_rate": 4.254482852432745e-05, + "loss": 0.372, + "num_input_tokens_seen": 25479016, + "step": 43930 + }, + { + "epoch": 6.5437890974084, + "grad_norm": 0.18987782299518585, + "learning_rate": 4.254251355071579e-05, + "loss": 0.0018, + "num_input_tokens_seen": 25482216, + "step": 43935 + }, + { + "epoch": 6.5445338099493595, + "grad_norm": 0.14445170760154724, + "learning_rate": 4.254019828074118e-05, + "loss": 0.5942, + "num_input_tokens_seen": 25484968, + "step": 43940 + }, + { + "epoch": 6.545278522490319, + "grad_norm": 0.6841854453086853, + "learning_rate": 4.2537882714442756e-05, + "loss": 0.202, + "num_input_tokens_seen": 25487880, + "step": 43945 + }, + { + "epoch": 6.546023235031278, + "grad_norm": 0.03599387779831886, + "learning_rate": 4.253556685185963e-05, + "loss": 0.4905, + "num_input_tokens_seen": 25490760, + "step": 43950 + }, + { + "epoch": 6.546767947572237, + "grad_norm": 33.59393310546875, + "learning_rate": 4.2533250693030924e-05, + "loss": 0.1877, + "num_input_tokens_seen": 25493512, + "step": 43955 + }, + { + "epoch": 6.547512660113196, + "grad_norm": 0.11696182936429977, + "learning_rate": 4.2530934237995756e-05, + "loss": 0.1151, + "num_input_tokens_seen": 25496680, + "step": 43960 + }, + { + "epoch": 6.548257372654156, + "grad_norm": 30.332763671875, + "learning_rate": 4.252861748679329e-05, + "loss": 0.2668, + "num_input_tokens_seen": 25499592, + "step": 43965 + }, + { + "epoch": 6.549002085195115, + "grad_norm": 41.4860725402832, + "learning_rate": 4.252630043946263e-05, + "loss": 0.5104, + "num_input_tokens_seen": 25502536, + "step": 43970 + }, + { + "epoch": 6.5497467977360735, + "grad_norm": 39.45355224609375, + "learning_rate": 4.252398309604294e-05, + "loss": 0.1816, + "num_input_tokens_seen": 25505544, + "step": 43975 + }, + { + "epoch": 6.550491510277033, + "grad_norm": 9.626856803894043, + "learning_rate": 4.252166545657337e-05, + "loss": 0.2482, + "num_input_tokens_seen": 25508648, + "step": 43980 + }, + { + "epoch": 6.551236222817992, + "grad_norm": 0.08772753179073334, + "learning_rate": 4.2519347521093077e-05, + "loss": 0.325, + "num_input_tokens_seen": 25511304, + "step": 43985 + }, + { + "epoch": 6.5519809353589515, + "grad_norm": 56.714717864990234, + "learning_rate": 4.251702928964121e-05, + "loss": 0.2745, + "num_input_tokens_seen": 25514248, + "step": 43990 + }, + { + "epoch": 6.55272564789991, + "grad_norm": 9.928499221801758, + "learning_rate": 4.2514710762256925e-05, + "loss": 0.2432, + "num_input_tokens_seen": 25517288, + "step": 43995 + }, + { + "epoch": 6.55347036044087, + "grad_norm": 54.9698486328125, + "learning_rate": 4.2512391938979416e-05, + "loss": 0.412, + "num_input_tokens_seen": 25520392, + "step": 44000 + }, + { + "epoch": 6.554215072981829, + "grad_norm": 2.4796857833862305, + "learning_rate": 4.251007281984783e-05, + "loss": 0.3365, + "num_input_tokens_seen": 25523208, + "step": 44005 + }, + { + "epoch": 6.554959785522788, + "grad_norm": 0.7827732563018799, + "learning_rate": 4.250775340490137e-05, + "loss": 0.2331, + "num_input_tokens_seen": 25525832, + "step": 44010 + }, + { + "epoch": 6.555704498063747, + "grad_norm": 8.364459037780762, + "learning_rate": 4.2505433694179216e-05, + "loss": 0.281, + "num_input_tokens_seen": 25528680, + "step": 44015 + }, + { + "epoch": 6.556449210604707, + "grad_norm": 0.1568787693977356, + "learning_rate": 4.250311368772054e-05, + "loss": 0.0402, + "num_input_tokens_seen": 25531720, + "step": 44020 + }, + { + "epoch": 6.5571939231456655, + "grad_norm": 5.48577880859375, + "learning_rate": 4.250079338556455e-05, + "loss": 0.3742, + "num_input_tokens_seen": 25534504, + "step": 44025 + }, + { + "epoch": 6.557938635686625, + "grad_norm": 0.19696760177612305, + "learning_rate": 4.2498472787750456e-05, + "loss": 0.1488, + "num_input_tokens_seen": 25537512, + "step": 44030 + }, + { + "epoch": 6.558683348227584, + "grad_norm": 103.52477264404297, + "learning_rate": 4.249615189431744e-05, + "loss": 0.1733, + "num_input_tokens_seen": 25540808, + "step": 44035 + }, + { + "epoch": 6.559428060768544, + "grad_norm": 36.372703552246094, + "learning_rate": 4.2493830705304716e-05, + "loss": 0.4929, + "num_input_tokens_seen": 25543880, + "step": 44040 + }, + { + "epoch": 6.560172773309502, + "grad_norm": 0.6414240598678589, + "learning_rate": 4.24915092207515e-05, + "loss": 0.1373, + "num_input_tokens_seen": 25546568, + "step": 44045 + }, + { + "epoch": 6.560917485850462, + "grad_norm": 1.4728683233261108, + "learning_rate": 4.248918744069702e-05, + "loss": 0.2094, + "num_input_tokens_seen": 25549384, + "step": 44050 + }, + { + "epoch": 6.561662198391421, + "grad_norm": 0.16121825575828552, + "learning_rate": 4.2486865365180494e-05, + "loss": 0.1142, + "num_input_tokens_seen": 25552296, + "step": 44055 + }, + { + "epoch": 6.56240691093238, + "grad_norm": 92.07735443115234, + "learning_rate": 4.2484542994241145e-05, + "loss": 0.3402, + "num_input_tokens_seen": 25555240, + "step": 44060 + }, + { + "epoch": 6.563151623473339, + "grad_norm": 0.05480875074863434, + "learning_rate": 4.2482220327918214e-05, + "loss": 0.1095, + "num_input_tokens_seen": 25558216, + "step": 44065 + }, + { + "epoch": 6.563896336014299, + "grad_norm": 2.5050623416900635, + "learning_rate": 4.2479897366250946e-05, + "loss": 0.3894, + "num_input_tokens_seen": 25561192, + "step": 44070 + }, + { + "epoch": 6.5646410485552575, + "grad_norm": 26.623403549194336, + "learning_rate": 4.247757410927857e-05, + "loss": 0.3883, + "num_input_tokens_seen": 25564296, + "step": 44075 + }, + { + "epoch": 6.565385761096217, + "grad_norm": 14.994283676147461, + "learning_rate": 4.247525055704034e-05, + "loss": 0.2571, + "num_input_tokens_seen": 25567304, + "step": 44080 + }, + { + "epoch": 6.566130473637176, + "grad_norm": 14.837858200073242, + "learning_rate": 4.247292670957552e-05, + "loss": 0.2539, + "num_input_tokens_seen": 25570248, + "step": 44085 + }, + { + "epoch": 6.566875186178136, + "grad_norm": 0.026379650458693504, + "learning_rate": 4.247060256692336e-05, + "loss": 0.5585, + "num_input_tokens_seen": 25573384, + "step": 44090 + }, + { + "epoch": 6.567619898719094, + "grad_norm": 0.11201290786266327, + "learning_rate": 4.246827812912313e-05, + "loss": 0.1876, + "num_input_tokens_seen": 25576232, + "step": 44095 + }, + { + "epoch": 6.568364611260054, + "grad_norm": 25.769132614135742, + "learning_rate": 4.246595339621409e-05, + "loss": 0.1467, + "num_input_tokens_seen": 25579112, + "step": 44100 + }, + { + "epoch": 6.569109323801013, + "grad_norm": 0.15027382969856262, + "learning_rate": 4.246362836823551e-05, + "loss": 0.2035, + "num_input_tokens_seen": 25581992, + "step": 44105 + }, + { + "epoch": 6.569854036341972, + "grad_norm": 36.209983825683594, + "learning_rate": 4.2461303045226695e-05, + "loss": 0.2238, + "num_input_tokens_seen": 25584776, + "step": 44110 + }, + { + "epoch": 6.570598748882931, + "grad_norm": 23.02669334411621, + "learning_rate": 4.24589774272269e-05, + "loss": 0.0666, + "num_input_tokens_seen": 25587720, + "step": 44115 + }, + { + "epoch": 6.57134346142389, + "grad_norm": 29.688213348388672, + "learning_rate": 4.245665151427544e-05, + "loss": 0.3398, + "num_input_tokens_seen": 25590600, + "step": 44120 + }, + { + "epoch": 6.57208817396485, + "grad_norm": 0.6954150199890137, + "learning_rate": 4.245432530641158e-05, + "loss": 0.241, + "num_input_tokens_seen": 25593608, + "step": 44125 + }, + { + "epoch": 6.572832886505809, + "grad_norm": 43.412601470947266, + "learning_rate": 4.245199880367464e-05, + "loss": 0.1518, + "num_input_tokens_seen": 25596488, + "step": 44130 + }, + { + "epoch": 6.573577599046768, + "grad_norm": 1.3384008407592773, + "learning_rate": 4.2449672006103914e-05, + "loss": 0.0076, + "num_input_tokens_seen": 25599464, + "step": 44135 + }, + { + "epoch": 6.574322311587727, + "grad_norm": 0.03304975852370262, + "learning_rate": 4.244734491373872e-05, + "loss": 0.2235, + "num_input_tokens_seen": 25602344, + "step": 44140 + }, + { + "epoch": 6.575067024128686, + "grad_norm": 21.17646026611328, + "learning_rate": 4.244501752661836e-05, + "loss": 0.3297, + "num_input_tokens_seen": 25605256, + "step": 44145 + }, + { + "epoch": 6.575811736669645, + "grad_norm": 15.28373908996582, + "learning_rate": 4.244268984478216e-05, + "loss": 0.1165, + "num_input_tokens_seen": 25608136, + "step": 44150 + }, + { + "epoch": 6.576556449210605, + "grad_norm": 27.552448272705078, + "learning_rate": 4.2440361868269453e-05, + "loss": 0.1117, + "num_input_tokens_seen": 25611016, + "step": 44155 + }, + { + "epoch": 6.5773011617515635, + "grad_norm": 6.496496200561523, + "learning_rate": 4.243803359711954e-05, + "loss": 0.3252, + "num_input_tokens_seen": 25613768, + "step": 44160 + }, + { + "epoch": 6.578045874292523, + "grad_norm": 0.30303215980529785, + "learning_rate": 4.243570503137179e-05, + "loss": 0.2224, + "num_input_tokens_seen": 25616616, + "step": 44165 + }, + { + "epoch": 6.578790586833482, + "grad_norm": 0.03430667147040367, + "learning_rate": 4.2433376171065514e-05, + "loss": 0.1417, + "num_input_tokens_seen": 25619336, + "step": 44170 + }, + { + "epoch": 6.579535299374442, + "grad_norm": 2.8656322956085205, + "learning_rate": 4.2431047016240064e-05, + "loss": 0.4103, + "num_input_tokens_seen": 25622312, + "step": 44175 + }, + { + "epoch": 6.5802800119154, + "grad_norm": 24.53981590270996, + "learning_rate": 4.242871756693481e-05, + "loss": 0.2057, + "num_input_tokens_seen": 25625192, + "step": 44180 + }, + { + "epoch": 6.58102472445636, + "grad_norm": 2.5967204570770264, + "learning_rate": 4.242638782318906e-05, + "loss": 0.0321, + "num_input_tokens_seen": 25627944, + "step": 44185 + }, + { + "epoch": 6.581769436997319, + "grad_norm": 23.607900619506836, + "learning_rate": 4.242405778504221e-05, + "loss": 0.136, + "num_input_tokens_seen": 25630696, + "step": 44190 + }, + { + "epoch": 6.582514149538278, + "grad_norm": 0.18132168054580688, + "learning_rate": 4.242172745253362e-05, + "loss": 0.0995, + "num_input_tokens_seen": 25633736, + "step": 44195 + }, + { + "epoch": 6.583258862079237, + "grad_norm": 22.376482009887695, + "learning_rate": 4.241939682570265e-05, + "loss": 0.1416, + "num_input_tokens_seen": 25636744, + "step": 44200 + }, + { + "epoch": 6.584003574620197, + "grad_norm": 71.64608001708984, + "learning_rate": 4.241706590458867e-05, + "loss": 0.3863, + "num_input_tokens_seen": 25639848, + "step": 44205 + }, + { + "epoch": 6.584748287161156, + "grad_norm": 26.380935668945312, + "learning_rate": 4.241473468923106e-05, + "loss": 0.4065, + "num_input_tokens_seen": 25642792, + "step": 44210 + }, + { + "epoch": 6.585492999702115, + "grad_norm": 0.6089507341384888, + "learning_rate": 4.2412403179669216e-05, + "loss": 0.4147, + "num_input_tokens_seen": 25646056, + "step": 44215 + }, + { + "epoch": 6.586237712243074, + "grad_norm": 69.90763092041016, + "learning_rate": 4.2410071375942505e-05, + "loss": 0.3458, + "num_input_tokens_seen": 25649128, + "step": 44220 + }, + { + "epoch": 6.586982424784034, + "grad_norm": 4.396578788757324, + "learning_rate": 4.240773927809034e-05, + "loss": 0.3483, + "num_input_tokens_seen": 25651944, + "step": 44225 + }, + { + "epoch": 6.587727137324992, + "grad_norm": 53.146644592285156, + "learning_rate": 4.240540688615212e-05, + "loss": 0.4201, + "num_input_tokens_seen": 25654568, + "step": 44230 + }, + { + "epoch": 6.588471849865952, + "grad_norm": 7.746089458465576, + "learning_rate": 4.240307420016724e-05, + "loss": 0.2557, + "num_input_tokens_seen": 25657576, + "step": 44235 + }, + { + "epoch": 6.589216562406911, + "grad_norm": 29.079971313476562, + "learning_rate": 4.24007412201751e-05, + "loss": 0.0764, + "num_input_tokens_seen": 25660904, + "step": 44240 + }, + { + "epoch": 6.58996127494787, + "grad_norm": 16.610336303710938, + "learning_rate": 4.239840794621512e-05, + "loss": 0.221, + "num_input_tokens_seen": 25663816, + "step": 44245 + }, + { + "epoch": 6.590705987488829, + "grad_norm": 1.8219619989395142, + "learning_rate": 4.2396074378326725e-05, + "loss": 0.038, + "num_input_tokens_seen": 25666664, + "step": 44250 + }, + { + "epoch": 6.591450700029789, + "grad_norm": 15.6638822555542, + "learning_rate": 4.239374051654934e-05, + "loss": 0.2084, + "num_input_tokens_seen": 25669992, + "step": 44255 + }, + { + "epoch": 6.592195412570748, + "grad_norm": 10.16663646697998, + "learning_rate": 4.239140636092238e-05, + "loss": 0.441, + "num_input_tokens_seen": 25672904, + "step": 44260 + }, + { + "epoch": 6.592940125111707, + "grad_norm": 5.3321943283081055, + "learning_rate": 4.238907191148528e-05, + "loss": 0.4606, + "num_input_tokens_seen": 25675720, + "step": 44265 + }, + { + "epoch": 6.593684837652666, + "grad_norm": 0.1637832522392273, + "learning_rate": 4.23867371682775e-05, + "loss": 0.2071, + "num_input_tokens_seen": 25678376, + "step": 44270 + }, + { + "epoch": 6.594429550193626, + "grad_norm": 0.31238678097724915, + "learning_rate": 4.2384402131338455e-05, + "loss": 0.2117, + "num_input_tokens_seen": 25681256, + "step": 44275 + }, + { + "epoch": 6.595174262734584, + "grad_norm": 25.41559410095215, + "learning_rate": 4.2382066800707606e-05, + "loss": 0.1512, + "num_input_tokens_seen": 25683944, + "step": 44280 + }, + { + "epoch": 6.595918975275543, + "grad_norm": 0.5098695158958435, + "learning_rate": 4.237973117642441e-05, + "loss": 0.2087, + "num_input_tokens_seen": 25686888, + "step": 44285 + }, + { + "epoch": 6.596663687816503, + "grad_norm": 0.3538585603237152, + "learning_rate": 4.237739525852831e-05, + "loss": 0.1677, + "num_input_tokens_seen": 25689704, + "step": 44290 + }, + { + "epoch": 6.5974084003574625, + "grad_norm": 3.4062039852142334, + "learning_rate": 4.237505904705879e-05, + "loss": 0.2071, + "num_input_tokens_seen": 25692424, + "step": 44295 + }, + { + "epoch": 6.598153112898421, + "grad_norm": 0.14635056257247925, + "learning_rate": 4.23727225420553e-05, + "loss": 0.2826, + "num_input_tokens_seen": 25695368, + "step": 44300 + }, + { + "epoch": 6.59889782543938, + "grad_norm": 0.5969852805137634, + "learning_rate": 4.237038574355732e-05, + "loss": 0.107, + "num_input_tokens_seen": 25698440, + "step": 44305 + }, + { + "epoch": 6.59964253798034, + "grad_norm": 22.42262840270996, + "learning_rate": 4.236804865160433e-05, + "loss": 0.033, + "num_input_tokens_seen": 25701160, + "step": 44310 + }, + { + "epoch": 6.600387250521299, + "grad_norm": 31.058650970458984, + "learning_rate": 4.236571126623581e-05, + "loss": 0.0295, + "num_input_tokens_seen": 25704200, + "step": 44315 + }, + { + "epoch": 6.601131963062258, + "grad_norm": 20.616127014160156, + "learning_rate": 4.236337358749124e-05, + "loss": 0.4013, + "num_input_tokens_seen": 25707016, + "step": 44320 + }, + { + "epoch": 6.601876675603217, + "grad_norm": 13.22312068939209, + "learning_rate": 4.2361035615410127e-05, + "loss": 0.5692, + "num_input_tokens_seen": 25709832, + "step": 44325 + }, + { + "epoch": 6.602621388144176, + "grad_norm": 0.13924524188041687, + "learning_rate": 4.2358697350031964e-05, + "loss": 0.3399, + "num_input_tokens_seen": 25712520, + "step": 44330 + }, + { + "epoch": 6.603366100685135, + "grad_norm": 29.288150787353516, + "learning_rate": 4.2356358791396244e-05, + "loss": 0.0594, + "num_input_tokens_seen": 25715144, + "step": 44335 + }, + { + "epoch": 6.604110813226095, + "grad_norm": 0.12381685525178909, + "learning_rate": 4.235401993954249e-05, + "loss": 0.2111, + "num_input_tokens_seen": 25717736, + "step": 44340 + }, + { + "epoch": 6.604855525767054, + "grad_norm": 0.2027769535779953, + "learning_rate": 4.2351680794510205e-05, + "loss": 0.2019, + "num_input_tokens_seen": 25720744, + "step": 44345 + }, + { + "epoch": 6.605600238308013, + "grad_norm": 0.8406639099121094, + "learning_rate": 4.234934135633891e-05, + "loss": 0.0901, + "num_input_tokens_seen": 25723560, + "step": 44350 + }, + { + "epoch": 6.606344950848972, + "grad_norm": 15.364089012145996, + "learning_rate": 4.234700162506813e-05, + "loss": 0.3262, + "num_input_tokens_seen": 25726120, + "step": 44355 + }, + { + "epoch": 6.607089663389932, + "grad_norm": 23.27134132385254, + "learning_rate": 4.234466160073738e-05, + "loss": 0.0861, + "num_input_tokens_seen": 25728936, + "step": 44360 + }, + { + "epoch": 6.60783437593089, + "grad_norm": 0.41470471024513245, + "learning_rate": 4.23423212833862e-05, + "loss": 0.2611, + "num_input_tokens_seen": 25731912, + "step": 44365 + }, + { + "epoch": 6.60857908847185, + "grad_norm": 0.1990438550710678, + "learning_rate": 4.233998067305413e-05, + "loss": 0.2202, + "num_input_tokens_seen": 25734600, + "step": 44370 + }, + { + "epoch": 6.609323801012809, + "grad_norm": 1.4662814140319824, + "learning_rate": 4.233763976978071e-05, + "loss": 0.0094, + "num_input_tokens_seen": 25737544, + "step": 44375 + }, + { + "epoch": 6.6100685135537685, + "grad_norm": 49.69521713256836, + "learning_rate": 4.233529857360549e-05, + "loss": 0.1954, + "num_input_tokens_seen": 25740328, + "step": 44380 + }, + { + "epoch": 6.610813226094727, + "grad_norm": 0.8794490694999695, + "learning_rate": 4.233295708456801e-05, + "loss": 0.4393, + "num_input_tokens_seen": 25743304, + "step": 44385 + }, + { + "epoch": 6.611557938635687, + "grad_norm": 41.32030487060547, + "learning_rate": 4.2330615302707856e-05, + "loss": 0.2503, + "num_input_tokens_seen": 25746088, + "step": 44390 + }, + { + "epoch": 6.612302651176646, + "grad_norm": 0.07275379449129105, + "learning_rate": 4.2328273228064555e-05, + "loss": 0.2275, + "num_input_tokens_seen": 25749032, + "step": 44395 + }, + { + "epoch": 6.613047363717605, + "grad_norm": 66.5950927734375, + "learning_rate": 4.2325930860677695e-05, + "loss": 0.1857, + "num_input_tokens_seen": 25752072, + "step": 44400 + }, + { + "epoch": 6.613792076258564, + "grad_norm": 1.2978569269180298, + "learning_rate": 4.232358820058684e-05, + "loss": 0.1011, + "num_input_tokens_seen": 25754664, + "step": 44405 + }, + { + "epoch": 6.614536788799524, + "grad_norm": 23.803821563720703, + "learning_rate": 4.232124524783157e-05, + "loss": 0.1531, + "num_input_tokens_seen": 25757384, + "step": 44410 + }, + { + "epoch": 6.615281501340482, + "grad_norm": 14.515617370605469, + "learning_rate": 4.231890200245147e-05, + "loss": 0.1038, + "num_input_tokens_seen": 25760072, + "step": 44415 + }, + { + "epoch": 6.616026213881442, + "grad_norm": 46.13024139404297, + "learning_rate": 4.231655846448611e-05, + "loss": 0.1372, + "num_input_tokens_seen": 25762728, + "step": 44420 + }, + { + "epoch": 6.616770926422401, + "grad_norm": 9.222668647766113, + "learning_rate": 4.2314214633975105e-05, + "loss": 0.2689, + "num_input_tokens_seen": 25765768, + "step": 44425 + }, + { + "epoch": 6.6175156389633605, + "grad_norm": 15.480338096618652, + "learning_rate": 4.231187051095804e-05, + "loss": 0.4505, + "num_input_tokens_seen": 25768424, + "step": 44430 + }, + { + "epoch": 6.618260351504319, + "grad_norm": 1.1593294143676758, + "learning_rate": 4.2309526095474514e-05, + "loss": 0.3729, + "num_input_tokens_seen": 25771304, + "step": 44435 + }, + { + "epoch": 6.619005064045279, + "grad_norm": 5.3286004066467285, + "learning_rate": 4.230718138756414e-05, + "loss": 0.286, + "num_input_tokens_seen": 25774056, + "step": 44440 + }, + { + "epoch": 6.619749776586238, + "grad_norm": 27.20146942138672, + "learning_rate": 4.2304836387266534e-05, + "loss": 0.2907, + "num_input_tokens_seen": 25777224, + "step": 44445 + }, + { + "epoch": 6.620494489127196, + "grad_norm": 0.9989823698997498, + "learning_rate": 4.230249109462129e-05, + "loss": 0.0724, + "num_input_tokens_seen": 25780072, + "step": 44450 + }, + { + "epoch": 6.621239201668156, + "grad_norm": 64.17188262939453, + "learning_rate": 4.2300145509668054e-05, + "loss": 0.0945, + "num_input_tokens_seen": 25783080, + "step": 44455 + }, + { + "epoch": 6.621983914209116, + "grad_norm": 3.096938133239746, + "learning_rate": 4.2297799632446444e-05, + "loss": 0.0134, + "num_input_tokens_seen": 25786120, + "step": 44460 + }, + { + "epoch": 6.6227286267500745, + "grad_norm": 0.014187654480338097, + "learning_rate": 4.229545346299609e-05, + "loss": 0.6927, + "num_input_tokens_seen": 25789096, + "step": 44465 + }, + { + "epoch": 6.623473339291033, + "grad_norm": 26.07942008972168, + "learning_rate": 4.2293107001356624e-05, + "loss": 0.3486, + "num_input_tokens_seen": 25792008, + "step": 44470 + }, + { + "epoch": 6.624218051831993, + "grad_norm": 0.02420390211045742, + "learning_rate": 4.2290760247567695e-05, + "loss": 0.0932, + "num_input_tokens_seen": 25794632, + "step": 44475 + }, + { + "epoch": 6.6249627643729525, + "grad_norm": 23.523963928222656, + "learning_rate": 4.2288413201668945e-05, + "loss": 0.3685, + "num_input_tokens_seen": 25797384, + "step": 44480 + }, + { + "epoch": 6.625707476913911, + "grad_norm": 1.4813408851623535, + "learning_rate": 4.228606586370002e-05, + "loss": 0.1606, + "num_input_tokens_seen": 25800200, + "step": 44485 + }, + { + "epoch": 6.62645218945487, + "grad_norm": 28.31475830078125, + "learning_rate": 4.228371823370058e-05, + "loss": 0.4193, + "num_input_tokens_seen": 25802760, + "step": 44490 + }, + { + "epoch": 6.62719690199583, + "grad_norm": 0.15139220654964447, + "learning_rate": 4.228137031171029e-05, + "loss": 0.1919, + "num_input_tokens_seen": 25805512, + "step": 44495 + }, + { + "epoch": 6.627941614536788, + "grad_norm": 0.281471312046051, + "learning_rate": 4.227902209776881e-05, + "loss": 0.1933, + "num_input_tokens_seen": 25808584, + "step": 44500 + }, + { + "epoch": 6.628686327077748, + "grad_norm": 0.5778990387916565, + "learning_rate": 4.227667359191582e-05, + "loss": 0.3031, + "num_input_tokens_seen": 25811592, + "step": 44505 + }, + { + "epoch": 6.629431039618707, + "grad_norm": 0.1335126757621765, + "learning_rate": 4.227432479419099e-05, + "loss": 0.3317, + "num_input_tokens_seen": 25814280, + "step": 44510 + }, + { + "epoch": 6.6301757521596665, + "grad_norm": 7.784054279327393, + "learning_rate": 4.227197570463399e-05, + "loss": 0.1905, + "num_input_tokens_seen": 25817352, + "step": 44515 + }, + { + "epoch": 6.630920464700625, + "grad_norm": 45.83279037475586, + "learning_rate": 4.2269626323284514e-05, + "loss": 0.4154, + "num_input_tokens_seen": 25820328, + "step": 44520 + }, + { + "epoch": 6.631665177241585, + "grad_norm": 20.844301223754883, + "learning_rate": 4.226727665018226e-05, + "loss": 0.0198, + "num_input_tokens_seen": 25823304, + "step": 44525 + }, + { + "epoch": 6.632409889782544, + "grad_norm": 16.68021583557129, + "learning_rate": 4.2264926685366925e-05, + "loss": 0.4134, + "num_input_tokens_seen": 25826440, + "step": 44530 + }, + { + "epoch": 6.633154602323503, + "grad_norm": 1.3196812868118286, + "learning_rate": 4.2262576428878184e-05, + "loss": 0.0101, + "num_input_tokens_seen": 25829096, + "step": 44535 + }, + { + "epoch": 6.633899314864462, + "grad_norm": 63.756492614746094, + "learning_rate": 4.226022588075577e-05, + "loss": 0.2916, + "num_input_tokens_seen": 25832040, + "step": 44540 + }, + { + "epoch": 6.634644027405422, + "grad_norm": 25.548118591308594, + "learning_rate": 4.2257875041039375e-05, + "loss": 0.1777, + "num_input_tokens_seen": 25835080, + "step": 44545 + }, + { + "epoch": 6.6353887399463805, + "grad_norm": 18.345897674560547, + "learning_rate": 4.225552390976873e-05, + "loss": 0.1024, + "num_input_tokens_seen": 25838408, + "step": 44550 + }, + { + "epoch": 6.63613345248734, + "grad_norm": 0.04248788207769394, + "learning_rate": 4.225317248698354e-05, + "loss": 0.1077, + "num_input_tokens_seen": 25841352, + "step": 44555 + }, + { + "epoch": 6.636878165028299, + "grad_norm": 0.03409520164132118, + "learning_rate": 4.225082077272354e-05, + "loss": 0.0777, + "num_input_tokens_seen": 25844232, + "step": 44560 + }, + { + "epoch": 6.6376228775692585, + "grad_norm": 0.30580151081085205, + "learning_rate": 4.224846876702845e-05, + "loss": 0.0507, + "num_input_tokens_seen": 25847112, + "step": 44565 + }, + { + "epoch": 6.638367590110217, + "grad_norm": 9.789386749267578, + "learning_rate": 4.224611646993801e-05, + "loss": 0.4777, + "num_input_tokens_seen": 25850184, + "step": 44570 + }, + { + "epoch": 6.639112302651177, + "grad_norm": 38.55558776855469, + "learning_rate": 4.224376388149197e-05, + "loss": 0.0539, + "num_input_tokens_seen": 25852968, + "step": 44575 + }, + { + "epoch": 6.639857015192136, + "grad_norm": 11.733792304992676, + "learning_rate": 4.2241411001730057e-05, + "loss": 0.1765, + "num_input_tokens_seen": 25856136, + "step": 44580 + }, + { + "epoch": 6.640601727733095, + "grad_norm": 18.495746612548828, + "learning_rate": 4.223905783069203e-05, + "loss": 0.2981, + "num_input_tokens_seen": 25859208, + "step": 44585 + }, + { + "epoch": 6.641346440274054, + "grad_norm": 11.666829109191895, + "learning_rate": 4.2236704368417644e-05, + "loss": 0.348, + "num_input_tokens_seen": 25862408, + "step": 44590 + }, + { + "epoch": 6.642091152815014, + "grad_norm": 0.4020112156867981, + "learning_rate": 4.223435061494666e-05, + "loss": 0.308, + "num_input_tokens_seen": 25865416, + "step": 44595 + }, + { + "epoch": 6.6428358653559725, + "grad_norm": 10.370949745178223, + "learning_rate": 4.223199657031883e-05, + "loss": 0.3346, + "num_input_tokens_seen": 25868456, + "step": 44600 + }, + { + "epoch": 6.643580577896932, + "grad_norm": 0.018098030239343643, + "learning_rate": 4.222964223457394e-05, + "loss": 0.0525, + "num_input_tokens_seen": 25871144, + "step": 44605 + }, + { + "epoch": 6.644325290437891, + "grad_norm": 19.849773406982422, + "learning_rate": 4.2227287607751756e-05, + "loss": 0.371, + "num_input_tokens_seen": 25874024, + "step": 44610 + }, + { + "epoch": 6.6450700029788505, + "grad_norm": 16.604991912841797, + "learning_rate": 4.222493268989205e-05, + "loss": 0.3087, + "num_input_tokens_seen": 25877000, + "step": 44615 + }, + { + "epoch": 6.645814715519809, + "grad_norm": 0.4184075891971588, + "learning_rate": 4.222257748103461e-05, + "loss": 0.0668, + "num_input_tokens_seen": 25879624, + "step": 44620 + }, + { + "epoch": 6.646559428060769, + "grad_norm": 0.2904303967952728, + "learning_rate": 4.222022198121923e-05, + "loss": 0.1436, + "num_input_tokens_seen": 25882376, + "step": 44625 + }, + { + "epoch": 6.647304140601728, + "grad_norm": 38.87764358520508, + "learning_rate": 4.221786619048571e-05, + "loss": 0.0966, + "num_input_tokens_seen": 25885128, + "step": 44630 + }, + { + "epoch": 6.6480488531426865, + "grad_norm": 6.899889945983887, + "learning_rate": 4.221551010887384e-05, + "loss": 0.3736, + "num_input_tokens_seen": 25888200, + "step": 44635 + }, + { + "epoch": 6.648793565683646, + "grad_norm": 0.20137760043144226, + "learning_rate": 4.2213153736423417e-05, + "loss": 0.1726, + "num_input_tokens_seen": 25890888, + "step": 44640 + }, + { + "epoch": 6.649538278224606, + "grad_norm": 54.54903793334961, + "learning_rate": 4.221079707317426e-05, + "loss": 0.2368, + "num_input_tokens_seen": 25893640, + "step": 44645 + }, + { + "epoch": 6.6502829907655645, + "grad_norm": 23.34520149230957, + "learning_rate": 4.220844011916617e-05, + "loss": 0.4161, + "num_input_tokens_seen": 25896392, + "step": 44650 + }, + { + "epoch": 6.651027703306523, + "grad_norm": 0.09537740796804428, + "learning_rate": 4.2206082874438976e-05, + "loss": 0.2689, + "num_input_tokens_seen": 25899400, + "step": 44655 + }, + { + "epoch": 6.651772415847483, + "grad_norm": 41.720237731933594, + "learning_rate": 4.2203725339032505e-05, + "loss": 0.3012, + "num_input_tokens_seen": 25902216, + "step": 44660 + }, + { + "epoch": 6.652517128388443, + "grad_norm": 2.606538772583008, + "learning_rate": 4.220136751298659e-05, + "loss": 0.3641, + "num_input_tokens_seen": 25905256, + "step": 44665 + }, + { + "epoch": 6.653261840929401, + "grad_norm": 0.26248571276664734, + "learning_rate": 4.219900939634103e-05, + "loss": 0.117, + "num_input_tokens_seen": 25908168, + "step": 44670 + }, + { + "epoch": 6.65400655347036, + "grad_norm": 0.8432390093803406, + "learning_rate": 4.2196650989135706e-05, + "loss": 0.0268, + "num_input_tokens_seen": 25911080, + "step": 44675 + }, + { + "epoch": 6.65475126601132, + "grad_norm": 0.9071159958839417, + "learning_rate": 4.219429229141043e-05, + "loss": 0.1006, + "num_input_tokens_seen": 25913928, + "step": 44680 + }, + { + "epoch": 6.6554959785522785, + "grad_norm": 1.2258083820343018, + "learning_rate": 4.219193330320507e-05, + "loss": 0.2573, + "num_input_tokens_seen": 25916712, + "step": 44685 + }, + { + "epoch": 6.656240691093238, + "grad_norm": 20.82688331604004, + "learning_rate": 4.2189574024559465e-05, + "loss": 0.2093, + "num_input_tokens_seen": 25919656, + "step": 44690 + }, + { + "epoch": 6.656985403634197, + "grad_norm": 0.022681554779410362, + "learning_rate": 4.218721445551348e-05, + "loss": 0.2698, + "num_input_tokens_seen": 25922664, + "step": 44695 + }, + { + "epoch": 6.6577301161751565, + "grad_norm": 0.09547652304172516, + "learning_rate": 4.218485459610697e-05, + "loss": 0.041, + "num_input_tokens_seen": 25925512, + "step": 44700 + }, + { + "epoch": 6.658474828716115, + "grad_norm": 95.34163665771484, + "learning_rate": 4.2182494446379805e-05, + "loss": 0.3509, + "num_input_tokens_seen": 25928712, + "step": 44705 + }, + { + "epoch": 6.659219541257075, + "grad_norm": 0.022112729027867317, + "learning_rate": 4.218013400637187e-05, + "loss": 0.1933, + "num_input_tokens_seen": 25931688, + "step": 44710 + }, + { + "epoch": 6.659964253798034, + "grad_norm": 32.08683395385742, + "learning_rate": 4.217777327612303e-05, + "loss": 0.1541, + "num_input_tokens_seen": 25934504, + "step": 44715 + }, + { + "epoch": 6.660708966338993, + "grad_norm": 0.1006433293223381, + "learning_rate": 4.2175412255673164e-05, + "loss": 0.0059, + "num_input_tokens_seen": 25937224, + "step": 44720 + }, + { + "epoch": 6.661453678879952, + "grad_norm": 8.665042877197266, + "learning_rate": 4.2173050945062165e-05, + "loss": 0.0613, + "num_input_tokens_seen": 25939976, + "step": 44725 + }, + { + "epoch": 6.662198391420912, + "grad_norm": 7.710312366485596, + "learning_rate": 4.217068934432993e-05, + "loss": 0.218, + "num_input_tokens_seen": 25943144, + "step": 44730 + }, + { + "epoch": 6.6629431039618705, + "grad_norm": 0.0329890176653862, + "learning_rate": 4.216832745351634e-05, + "loss": 0.1507, + "num_input_tokens_seen": 25946152, + "step": 44735 + }, + { + "epoch": 6.66368781650283, + "grad_norm": 1.900188684463501, + "learning_rate": 4.2165965272661315e-05, + "loss": 0.2306, + "num_input_tokens_seen": 25949000, + "step": 44740 + }, + { + "epoch": 6.664432529043789, + "grad_norm": 16.542705535888672, + "learning_rate": 4.2163602801804745e-05, + "loss": 0.0669, + "num_input_tokens_seen": 25951624, + "step": 44745 + }, + { + "epoch": 6.665177241584749, + "grad_norm": 166.1625213623047, + "learning_rate": 4.216124004098656e-05, + "loss": 0.0495, + "num_input_tokens_seen": 25954600, + "step": 44750 + }, + { + "epoch": 6.665921954125707, + "grad_norm": 144.81539916992188, + "learning_rate": 4.2158876990246664e-05, + "loss": 0.2756, + "num_input_tokens_seen": 25957544, + "step": 44755 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 28.163345336914062, + "learning_rate": 4.215651364962498e-05, + "loss": 0.2395, + "num_input_tokens_seen": 25960328, + "step": 44760 + }, + { + "epoch": 6.667411379207626, + "grad_norm": 0.5276590585708618, + "learning_rate": 4.215415001916144e-05, + "loss": 0.1928, + "num_input_tokens_seen": 25963272, + "step": 44765 + }, + { + "epoch": 6.668156091748585, + "grad_norm": 55.52066421508789, + "learning_rate": 4.215178609889596e-05, + "loss": 0.4124, + "num_input_tokens_seen": 25966216, + "step": 44770 + }, + { + "epoch": 6.668900804289544, + "grad_norm": 1.1550610065460205, + "learning_rate": 4.214942188886849e-05, + "loss": 0.3413, + "num_input_tokens_seen": 25968904, + "step": 44775 + }, + { + "epoch": 6.669645516830504, + "grad_norm": 0.006979090161621571, + "learning_rate": 4.2147057389118964e-05, + "loss": 0.0028, + "num_input_tokens_seen": 25971944, + "step": 44780 + }, + { + "epoch": 6.6703902293714625, + "grad_norm": 55.48585891723633, + "learning_rate": 4.2144692599687334e-05, + "loss": 0.3169, + "num_input_tokens_seen": 25974856, + "step": 44785 + }, + { + "epoch": 6.671134941912422, + "grad_norm": 15.680542945861816, + "learning_rate": 4.214232752061355e-05, + "loss": 0.1275, + "num_input_tokens_seen": 25977768, + "step": 44790 + }, + { + "epoch": 6.671879654453381, + "grad_norm": 2.486311912536621, + "learning_rate": 4.213996215193756e-05, + "loss": 0.3459, + "num_input_tokens_seen": 25980808, + "step": 44795 + }, + { + "epoch": 6.67262436699434, + "grad_norm": 0.3128950893878937, + "learning_rate": 4.213759649369934e-05, + "loss": 0.3065, + "num_input_tokens_seen": 25983624, + "step": 44800 + }, + { + "epoch": 6.673369079535299, + "grad_norm": 8.809734344482422, + "learning_rate": 4.2135230545938835e-05, + "loss": 0.1865, + "num_input_tokens_seen": 25986568, + "step": 44805 + }, + { + "epoch": 6.674113792076259, + "grad_norm": 45.855220794677734, + "learning_rate": 4.213286430869603e-05, + "loss": 0.1937, + "num_input_tokens_seen": 25989576, + "step": 44810 + }, + { + "epoch": 6.674858504617218, + "grad_norm": 99.2800521850586, + "learning_rate": 4.2130497782010894e-05, + "loss": 0.0647, + "num_input_tokens_seen": 25992040, + "step": 44815 + }, + { + "epoch": 6.6756032171581765, + "grad_norm": 0.0346045084297657, + "learning_rate": 4.212813096592341e-05, + "loss": 0.2725, + "num_input_tokens_seen": 25994824, + "step": 44820 + }, + { + "epoch": 6.676347929699136, + "grad_norm": 11.543270111083984, + "learning_rate": 4.212576386047356e-05, + "loss": 0.1075, + "num_input_tokens_seen": 25997800, + "step": 44825 + }, + { + "epoch": 6.677092642240096, + "grad_norm": 104.84130859375, + "learning_rate": 4.2123396465701336e-05, + "loss": 0.3353, + "num_input_tokens_seen": 26000904, + "step": 44830 + }, + { + "epoch": 6.677837354781055, + "grad_norm": 42.956783294677734, + "learning_rate": 4.212102878164673e-05, + "loss": 0.1377, + "num_input_tokens_seen": 26003720, + "step": 44835 + }, + { + "epoch": 6.678582067322013, + "grad_norm": 0.8380328416824341, + "learning_rate": 4.211866080834975e-05, + "loss": 0.1873, + "num_input_tokens_seen": 26006376, + "step": 44840 + }, + { + "epoch": 6.679326779862973, + "grad_norm": 16.59221839904785, + "learning_rate": 4.2116292545850386e-05, + "loss": 0.6799, + "num_input_tokens_seen": 26009224, + "step": 44845 + }, + { + "epoch": 6.680071492403932, + "grad_norm": 26.086606979370117, + "learning_rate": 4.2113923994188665e-05, + "loss": 0.1602, + "num_input_tokens_seen": 26012360, + "step": 44850 + }, + { + "epoch": 6.680816204944891, + "grad_norm": 17.218856811523438, + "learning_rate": 4.211155515340458e-05, + "loss": 0.2792, + "num_input_tokens_seen": 26015432, + "step": 44855 + }, + { + "epoch": 6.68156091748585, + "grad_norm": 26.021347045898438, + "learning_rate": 4.210918602353817e-05, + "loss": 0.1281, + "num_input_tokens_seen": 26018408, + "step": 44860 + }, + { + "epoch": 6.68230563002681, + "grad_norm": 0.0645536556839943, + "learning_rate": 4.210681660462945e-05, + "loss": 0.217, + "num_input_tokens_seen": 26021288, + "step": 44865 + }, + { + "epoch": 6.6830503425677685, + "grad_norm": 11.446521759033203, + "learning_rate": 4.210444689671845e-05, + "loss": 0.1898, + "num_input_tokens_seen": 26024680, + "step": 44870 + }, + { + "epoch": 6.683795055108728, + "grad_norm": 29.800987243652344, + "learning_rate": 4.2102076899845207e-05, + "loss": 0.0265, + "num_input_tokens_seen": 26027400, + "step": 44875 + }, + { + "epoch": 6.684539767649687, + "grad_norm": 1.763701319694519, + "learning_rate": 4.209970661404975e-05, + "loss": 0.1419, + "num_input_tokens_seen": 26030632, + "step": 44880 + }, + { + "epoch": 6.685284480190647, + "grad_norm": 7.267892837524414, + "learning_rate": 4.209733603937214e-05, + "loss": 0.4261, + "num_input_tokens_seen": 26033320, + "step": 44885 + }, + { + "epoch": 6.686029192731605, + "grad_norm": 0.11354398727416992, + "learning_rate": 4.2094965175852395e-05, + "loss": 0.1709, + "num_input_tokens_seen": 26036328, + "step": 44890 + }, + { + "epoch": 6.686773905272565, + "grad_norm": 10.516158103942871, + "learning_rate": 4.209259402353061e-05, + "loss": 0.1195, + "num_input_tokens_seen": 26039080, + "step": 44895 + }, + { + "epoch": 6.687518617813524, + "grad_norm": 7.111574649810791, + "learning_rate": 4.20902225824468e-05, + "loss": 0.1488, + "num_input_tokens_seen": 26042184, + "step": 44900 + }, + { + "epoch": 6.688263330354483, + "grad_norm": 0.8574759364128113, + "learning_rate": 4.208785085264106e-05, + "loss": 0.0177, + "num_input_tokens_seen": 26044872, + "step": 44905 + }, + { + "epoch": 6.689008042895442, + "grad_norm": 0.07377202063798904, + "learning_rate": 4.2085478834153454e-05, + "loss": 0.1111, + "num_input_tokens_seen": 26047880, + "step": 44910 + }, + { + "epoch": 6.689752755436402, + "grad_norm": 58.13645935058594, + "learning_rate": 4.208310652702404e-05, + "loss": 0.059, + "num_input_tokens_seen": 26050728, + "step": 44915 + }, + { + "epoch": 6.690497467977361, + "grad_norm": 8.536458015441895, + "learning_rate": 4.208073393129291e-05, + "loss": 0.1829, + "num_input_tokens_seen": 26053576, + "step": 44920 + }, + { + "epoch": 6.69124218051832, + "grad_norm": 20.97970199584961, + "learning_rate": 4.207836104700013e-05, + "loss": 0.2427, + "num_input_tokens_seen": 26056488, + "step": 44925 + }, + { + "epoch": 6.691986893059279, + "grad_norm": 0.01819164678454399, + "learning_rate": 4.207598787418581e-05, + "loss": 0.0141, + "num_input_tokens_seen": 26059816, + "step": 44930 + }, + { + "epoch": 6.692731605600239, + "grad_norm": 74.45785522460938, + "learning_rate": 4.207361441289002e-05, + "loss": 0.7545, + "num_input_tokens_seen": 26062920, + "step": 44935 + }, + { + "epoch": 6.693476318141197, + "grad_norm": 0.4421963691711426, + "learning_rate": 4.207124066315287e-05, + "loss": 0.146, + "num_input_tokens_seen": 26066056, + "step": 44940 + }, + { + "epoch": 6.694221030682157, + "grad_norm": 0.9614246487617493, + "learning_rate": 4.206886662501446e-05, + "loss": 0.26, + "num_input_tokens_seen": 26068776, + "step": 44945 + }, + { + "epoch": 6.694965743223116, + "grad_norm": 4.599990367889404, + "learning_rate": 4.2066492298514895e-05, + "loss": 0.3317, + "num_input_tokens_seen": 26071688, + "step": 44950 + }, + { + "epoch": 6.695710455764075, + "grad_norm": 36.839942932128906, + "learning_rate": 4.2064117683694294e-05, + "loss": 0.3146, + "num_input_tokens_seen": 26074536, + "step": 44955 + }, + { + "epoch": 6.696455168305034, + "grad_norm": 0.15091432631015778, + "learning_rate": 4.206174278059276e-05, + "loss": 0.192, + "num_input_tokens_seen": 26077384, + "step": 44960 + }, + { + "epoch": 6.697199880845994, + "grad_norm": 9.576753616333008, + "learning_rate": 4.205936758925043e-05, + "loss": 0.5788, + "num_input_tokens_seen": 26079944, + "step": 44965 + }, + { + "epoch": 6.697944593386953, + "grad_norm": 89.34677124023438, + "learning_rate": 4.2056992109707415e-05, + "loss": 0.1754, + "num_input_tokens_seen": 26082824, + "step": 44970 + }, + { + "epoch": 6.698689305927912, + "grad_norm": 0.9697924852371216, + "learning_rate": 4.205461634200386e-05, + "loss": 0.0056, + "num_input_tokens_seen": 26085416, + "step": 44975 + }, + { + "epoch": 6.699434018468871, + "grad_norm": 19.311416625976562, + "learning_rate": 4.2052240286179886e-05, + "loss": 0.2743, + "num_input_tokens_seen": 26088392, + "step": 44980 + }, + { + "epoch": 6.70017873100983, + "grad_norm": 0.16928492486476898, + "learning_rate": 4.204986394227566e-05, + "loss": 0.3314, + "num_input_tokens_seen": 26091496, + "step": 44985 + }, + { + "epoch": 6.700923443550789, + "grad_norm": 0.09578333795070648, + "learning_rate": 4.20474873103313e-05, + "loss": 0.0147, + "num_input_tokens_seen": 26094440, + "step": 44990 + }, + { + "epoch": 6.701668156091749, + "grad_norm": 12.713431358337402, + "learning_rate": 4.204511039038697e-05, + "loss": 0.3507, + "num_input_tokens_seen": 26097256, + "step": 44995 + }, + { + "epoch": 6.702412868632708, + "grad_norm": 6.285897731781006, + "learning_rate": 4.204273318248283e-05, + "loss": 0.0379, + "num_input_tokens_seen": 26100168, + "step": 45000 + }, + { + "epoch": 6.703157581173667, + "grad_norm": 19.696046829223633, + "learning_rate": 4.204035568665903e-05, + "loss": 0.2921, + "num_input_tokens_seen": 26102824, + "step": 45005 + }, + { + "epoch": 6.703902293714626, + "grad_norm": 26.516746520996094, + "learning_rate": 4.203797790295574e-05, + "loss": 0.4597, + "num_input_tokens_seen": 26105928, + "step": 45010 + }, + { + "epoch": 6.704647006255585, + "grad_norm": 0.1262643039226532, + "learning_rate": 4.203559983141312e-05, + "loss": 0.2723, + "num_input_tokens_seen": 26108776, + "step": 45015 + }, + { + "epoch": 6.705391718796545, + "grad_norm": 7.808053016662598, + "learning_rate": 4.2033221472071364e-05, + "loss": 0.3445, + "num_input_tokens_seen": 26111560, + "step": 45020 + }, + { + "epoch": 6.706136431337503, + "grad_norm": 0.2834709882736206, + "learning_rate": 4.2030842824970645e-05, + "loss": 0.2819, + "num_input_tokens_seen": 26114792, + "step": 45025 + }, + { + "epoch": 6.706881143878463, + "grad_norm": 0.032083962112665176, + "learning_rate": 4.2028463890151144e-05, + "loss": 0.1148, + "num_input_tokens_seen": 26117832, + "step": 45030 + }, + { + "epoch": 6.707625856419422, + "grad_norm": 33.793601989746094, + "learning_rate": 4.202608466765306e-05, + "loss": 0.5333, + "num_input_tokens_seen": 26120808, + "step": 45035 + }, + { + "epoch": 6.708370568960381, + "grad_norm": 0.14453022181987762, + "learning_rate": 4.202370515751657e-05, + "loss": 0.3721, + "num_input_tokens_seen": 26123656, + "step": 45040 + }, + { + "epoch": 6.70911528150134, + "grad_norm": 25.65962791442871, + "learning_rate": 4.2021325359781885e-05, + "loss": 0.4054, + "num_input_tokens_seen": 26126856, + "step": 45045 + }, + { + "epoch": 6.7098599940423, + "grad_norm": 0.16251704096794128, + "learning_rate": 4.201894527448921e-05, + "loss": 0.1902, + "num_input_tokens_seen": 26129512, + "step": 45050 + }, + { + "epoch": 6.710604706583259, + "grad_norm": 23.60011100769043, + "learning_rate": 4.2016564901678744e-05, + "loss": 0.341, + "num_input_tokens_seen": 26132360, + "step": 45055 + }, + { + "epoch": 6.711349419124218, + "grad_norm": 12.524815559387207, + "learning_rate": 4.201418424139072e-05, + "loss": 0.0729, + "num_input_tokens_seen": 26135400, + "step": 45060 + }, + { + "epoch": 6.712094131665177, + "grad_norm": 10.602381706237793, + "learning_rate": 4.201180329366534e-05, + "loss": 0.2609, + "num_input_tokens_seen": 26138536, + "step": 45065 + }, + { + "epoch": 6.712838844206137, + "grad_norm": 1.443804383277893, + "learning_rate": 4.200942205854282e-05, + "loss": 0.1924, + "num_input_tokens_seen": 26141544, + "step": 45070 + }, + { + "epoch": 6.713583556747095, + "grad_norm": 0.5358045697212219, + "learning_rate": 4.2007040536063424e-05, + "loss": 0.0182, + "num_input_tokens_seen": 26144616, + "step": 45075 + }, + { + "epoch": 6.714328269288055, + "grad_norm": 31.23859214782715, + "learning_rate": 4.200465872626736e-05, + "loss": 0.2381, + "num_input_tokens_seen": 26147496, + "step": 45080 + }, + { + "epoch": 6.715072981829014, + "grad_norm": 0.048436325043439865, + "learning_rate": 4.200227662919487e-05, + "loss": 0.1432, + "num_input_tokens_seen": 26150440, + "step": 45085 + }, + { + "epoch": 6.7158176943699734, + "grad_norm": 11.200385093688965, + "learning_rate": 4.1999894244886184e-05, + "loss": 0.0967, + "num_input_tokens_seen": 26153160, + "step": 45090 + }, + { + "epoch": 6.716562406910932, + "grad_norm": 0.5208195447921753, + "learning_rate": 4.1997511573381575e-05, + "loss": 0.0392, + "num_input_tokens_seen": 26155816, + "step": 45095 + }, + { + "epoch": 6.717307119451892, + "grad_norm": 7.1964850425720215, + "learning_rate": 4.199512861472128e-05, + "loss": 0.3907, + "num_input_tokens_seen": 26159016, + "step": 45100 + }, + { + "epoch": 6.718051831992851, + "grad_norm": 76.42053985595703, + "learning_rate": 4.1992745368945554e-05, + "loss": 0.2594, + "num_input_tokens_seen": 26161960, + "step": 45105 + }, + { + "epoch": 6.71879654453381, + "grad_norm": 0.4045164883136749, + "learning_rate": 4.199036183609467e-05, + "loss": 0.1938, + "num_input_tokens_seen": 26164936, + "step": 45110 + }, + { + "epoch": 6.719541257074769, + "grad_norm": 0.01933196745812893, + "learning_rate": 4.1987978016208895e-05, + "loss": 0.0319, + "num_input_tokens_seen": 26167688, + "step": 45115 + }, + { + "epoch": 6.720285969615729, + "grad_norm": 0.9992913007736206, + "learning_rate": 4.1985593909328494e-05, + "loss": 0.0897, + "num_input_tokens_seen": 26170248, + "step": 45120 + }, + { + "epoch": 6.721030682156687, + "grad_norm": 1.093078851699829, + "learning_rate": 4.198320951549375e-05, + "loss": 0.2109, + "num_input_tokens_seen": 26173032, + "step": 45125 + }, + { + "epoch": 6.721775394697647, + "grad_norm": 0.06468188017606735, + "learning_rate": 4.1980824834744934e-05, + "loss": 0.1854, + "num_input_tokens_seen": 26175752, + "step": 45130 + }, + { + "epoch": 6.722520107238606, + "grad_norm": 38.30695724487305, + "learning_rate": 4.1978439867122344e-05, + "loss": 0.1864, + "num_input_tokens_seen": 26178760, + "step": 45135 + }, + { + "epoch": 6.7232648197795655, + "grad_norm": 80.52189636230469, + "learning_rate": 4.197605461266627e-05, + "loss": 0.0654, + "num_input_tokens_seen": 26181736, + "step": 45140 + }, + { + "epoch": 6.724009532320524, + "grad_norm": 2.5814626216888428, + "learning_rate": 4.197366907141701e-05, + "loss": 0.3201, + "num_input_tokens_seen": 26184456, + "step": 45145 + }, + { + "epoch": 6.724754244861483, + "grad_norm": 5.1595988273620605, + "learning_rate": 4.197128324341486e-05, + "loss": 0.3609, + "num_input_tokens_seen": 26187144, + "step": 45150 + }, + { + "epoch": 6.725498957402443, + "grad_norm": 26.288585662841797, + "learning_rate": 4.196889712870013e-05, + "loss": 0.2888, + "num_input_tokens_seen": 26190056, + "step": 45155 + }, + { + "epoch": 6.726243669943402, + "grad_norm": 0.024164430797100067, + "learning_rate": 4.196651072731313e-05, + "loss": 0.1845, + "num_input_tokens_seen": 26192872, + "step": 45160 + }, + { + "epoch": 6.726988382484361, + "grad_norm": 6.107508182525635, + "learning_rate": 4.196412403929417e-05, + "loss": 0.4617, + "num_input_tokens_seen": 26195592, + "step": 45165 + }, + { + "epoch": 6.72773309502532, + "grad_norm": 3.3963911533355713, + "learning_rate": 4.196173706468358e-05, + "loss": 0.1891, + "num_input_tokens_seen": 26198344, + "step": 45170 + }, + { + "epoch": 6.7284778075662794, + "grad_norm": 68.24110412597656, + "learning_rate": 4.195934980352169e-05, + "loss": 0.2818, + "num_input_tokens_seen": 26201224, + "step": 45175 + }, + { + "epoch": 6.729222520107239, + "grad_norm": 0.13775838911533356, + "learning_rate": 4.195696225584881e-05, + "loss": 0.1407, + "num_input_tokens_seen": 26204104, + "step": 45180 + }, + { + "epoch": 6.729967232648198, + "grad_norm": 15.709890365600586, + "learning_rate": 4.195457442170528e-05, + "loss": 0.2526, + "num_input_tokens_seen": 26207240, + "step": 45185 + }, + { + "epoch": 6.730711945189157, + "grad_norm": 0.13911093771457672, + "learning_rate": 4.195218630113146e-05, + "loss": 0.0071, + "num_input_tokens_seen": 26210120, + "step": 45190 + }, + { + "epoch": 6.731456657730116, + "grad_norm": 172.6339111328125, + "learning_rate": 4.1949797894167676e-05, + "loss": 0.2745, + "num_input_tokens_seen": 26213160, + "step": 45195 + }, + { + "epoch": 6.732201370271075, + "grad_norm": 0.13333790004253387, + "learning_rate": 4.1947409200854296e-05, + "loss": 0.3235, + "num_input_tokens_seen": 26216232, + "step": 45200 + }, + { + "epoch": 6.732946082812035, + "grad_norm": 78.22933959960938, + "learning_rate": 4.1945020221231643e-05, + "loss": 0.1917, + "num_input_tokens_seen": 26219016, + "step": 45205 + }, + { + "epoch": 6.733690795352993, + "grad_norm": 0.126682847738266, + "learning_rate": 4.194263095534011e-05, + "loss": 0.0346, + "num_input_tokens_seen": 26221672, + "step": 45210 + }, + { + "epoch": 6.734435507893953, + "grad_norm": 0.1891261488199234, + "learning_rate": 4.194024140322004e-05, + "loss": 0.3285, + "num_input_tokens_seen": 26224424, + "step": 45215 + }, + { + "epoch": 6.735180220434912, + "grad_norm": 0.2369523048400879, + "learning_rate": 4.193785156491181e-05, + "loss": 0.2142, + "num_input_tokens_seen": 26227304, + "step": 45220 + }, + { + "epoch": 6.7359249329758715, + "grad_norm": 14.801584243774414, + "learning_rate": 4.193546144045579e-05, + "loss": 0.4754, + "num_input_tokens_seen": 26230408, + "step": 45225 + }, + { + "epoch": 6.73666964551683, + "grad_norm": 0.42727169394493103, + "learning_rate": 4.193307102989237e-05, + "loss": 0.4654, + "num_input_tokens_seen": 26233448, + "step": 45230 + }, + { + "epoch": 6.73741435805779, + "grad_norm": 15.033449172973633, + "learning_rate": 4.1930680333261915e-05, + "loss": 0.3317, + "num_input_tokens_seen": 26236456, + "step": 45235 + }, + { + "epoch": 6.738159070598749, + "grad_norm": 8.836956977844238, + "learning_rate": 4.1928289350604826e-05, + "loss": 0.2707, + "num_input_tokens_seen": 26239688, + "step": 45240 + }, + { + "epoch": 6.738903783139708, + "grad_norm": 12.779777526855469, + "learning_rate": 4.19258980819615e-05, + "loss": 0.2635, + "num_input_tokens_seen": 26242824, + "step": 45245 + }, + { + "epoch": 6.739648495680667, + "grad_norm": 0.44478556513786316, + "learning_rate": 4.192350652737232e-05, + "loss": 0.0033, + "num_input_tokens_seen": 26245672, + "step": 45250 + }, + { + "epoch": 6.740393208221627, + "grad_norm": 0.1645158976316452, + "learning_rate": 4.19211146868777e-05, + "loss": 0.2652, + "num_input_tokens_seen": 26248392, + "step": 45255 + }, + { + "epoch": 6.7411379207625854, + "grad_norm": 18.912681579589844, + "learning_rate": 4.1918722560518045e-05, + "loss": 0.398, + "num_input_tokens_seen": 26251336, + "step": 45260 + }, + { + "epoch": 6.741882633303545, + "grad_norm": 34.58257293701172, + "learning_rate": 4.191633014833377e-05, + "loss": 0.4214, + "num_input_tokens_seen": 26254472, + "step": 45265 + }, + { + "epoch": 6.742627345844504, + "grad_norm": 0.02226879820227623, + "learning_rate": 4.191393745036529e-05, + "loss": 0.2534, + "num_input_tokens_seen": 26257192, + "step": 45270 + }, + { + "epoch": 6.7433720583854635, + "grad_norm": 13.881172180175781, + "learning_rate": 4.191154446665303e-05, + "loss": 0.1715, + "num_input_tokens_seen": 26260008, + "step": 45275 + }, + { + "epoch": 6.744116770926422, + "grad_norm": 0.21597732603549957, + "learning_rate": 4.19091511972374e-05, + "loss": 0.1762, + "num_input_tokens_seen": 26262920, + "step": 45280 + }, + { + "epoch": 6.744861483467382, + "grad_norm": 10.143237113952637, + "learning_rate": 4.1906757642158865e-05, + "loss": 0.0228, + "num_input_tokens_seen": 26265736, + "step": 45285 + }, + { + "epoch": 6.745606196008341, + "grad_norm": 20.646129608154297, + "learning_rate": 4.1904363801457835e-05, + "loss": 0.4898, + "num_input_tokens_seen": 26268904, + "step": 45290 + }, + { + "epoch": 6.7463509085493, + "grad_norm": 26.927438735961914, + "learning_rate": 4.1901969675174755e-05, + "loss": 0.2077, + "num_input_tokens_seen": 26272104, + "step": 45295 + }, + { + "epoch": 6.747095621090259, + "grad_norm": 0.3348572552204132, + "learning_rate": 4.189957526335009e-05, + "loss": 0.203, + "num_input_tokens_seen": 26274984, + "step": 45300 + }, + { + "epoch": 6.747840333631219, + "grad_norm": 2.0249435901641846, + "learning_rate": 4.1897180566024266e-05, + "loss": 0.2873, + "num_input_tokens_seen": 26277512, + "step": 45305 + }, + { + "epoch": 6.7485850461721775, + "grad_norm": 0.5009628534317017, + "learning_rate": 4.189478558323775e-05, + "loss": 0.2736, + "num_input_tokens_seen": 26280328, + "step": 45310 + }, + { + "epoch": 6.749329758713137, + "grad_norm": 44.16956329345703, + "learning_rate": 4.1892390315031e-05, + "loss": 0.119, + "num_input_tokens_seen": 26283720, + "step": 45315 + }, + { + "epoch": 6.750074471254096, + "grad_norm": 0.2509201765060425, + "learning_rate": 4.188999476144449e-05, + "loss": 0.5559, + "num_input_tokens_seen": 26286632, + "step": 45320 + }, + { + "epoch": 6.7508191837950555, + "grad_norm": 5.7607197761535645, + "learning_rate": 4.188759892251868e-05, + "loss": 0.01, + "num_input_tokens_seen": 26289704, + "step": 45325 + }, + { + "epoch": 6.751563896336014, + "grad_norm": 0.23140446841716766, + "learning_rate": 4.188520279829406e-05, + "loss": 0.0354, + "num_input_tokens_seen": 26292808, + "step": 45330 + }, + { + "epoch": 6.752308608876973, + "grad_norm": 5.542747974395752, + "learning_rate": 4.188280638881109e-05, + "loss": 0.455, + "num_input_tokens_seen": 26295688, + "step": 45335 + }, + { + "epoch": 6.753053321417933, + "grad_norm": 0.023660734295845032, + "learning_rate": 4.188040969411027e-05, + "loss": 0.3795, + "num_input_tokens_seen": 26298600, + "step": 45340 + }, + { + "epoch": 6.753798033958892, + "grad_norm": 67.11153411865234, + "learning_rate": 4.187801271423207e-05, + "loss": 0.1385, + "num_input_tokens_seen": 26301800, + "step": 45345 + }, + { + "epoch": 6.754542746499851, + "grad_norm": 94.72962951660156, + "learning_rate": 4.187561544921702e-05, + "loss": 0.0324, + "num_input_tokens_seen": 26304680, + "step": 45350 + }, + { + "epoch": 6.75528745904081, + "grad_norm": 0.07622119039297104, + "learning_rate": 4.187321789910559e-05, + "loss": 0.059, + "num_input_tokens_seen": 26307400, + "step": 45355 + }, + { + "epoch": 6.7560321715817695, + "grad_norm": 10.014076232910156, + "learning_rate": 4.1870820063938296e-05, + "loss": 0.1362, + "num_input_tokens_seen": 26310376, + "step": 45360 + }, + { + "epoch": 6.756776884122728, + "grad_norm": 0.018515966832637787, + "learning_rate": 4.186842194375564e-05, + "loss": 0.0921, + "num_input_tokens_seen": 26313224, + "step": 45365 + }, + { + "epoch": 6.757521596663688, + "grad_norm": 95.636474609375, + "learning_rate": 4.1866023538598136e-05, + "loss": 0.2878, + "num_input_tokens_seen": 26316200, + "step": 45370 + }, + { + "epoch": 6.758266309204647, + "grad_norm": 125.66061401367188, + "learning_rate": 4.186362484850631e-05, + "loss": 0.3703, + "num_input_tokens_seen": 26319432, + "step": 45375 + }, + { + "epoch": 6.759011021745606, + "grad_norm": 55.21614074707031, + "learning_rate": 4.1861225873520684e-05, + "loss": 0.499, + "num_input_tokens_seen": 26322376, + "step": 45380 + }, + { + "epoch": 6.759755734286565, + "grad_norm": 34.961402893066406, + "learning_rate": 4.185882661368178e-05, + "loss": 0.3956, + "num_input_tokens_seen": 26325128, + "step": 45385 + }, + { + "epoch": 6.760500446827525, + "grad_norm": 0.4101192355155945, + "learning_rate": 4.185642706903014e-05, + "loss": 0.1378, + "num_input_tokens_seen": 26327848, + "step": 45390 + }, + { + "epoch": 6.7612451593684835, + "grad_norm": 21.728260040283203, + "learning_rate": 4.185402723960629e-05, + "loss": 0.235, + "num_input_tokens_seen": 26330824, + "step": 45395 + }, + { + "epoch": 6.761989871909443, + "grad_norm": 26.39389419555664, + "learning_rate": 4.185162712545079e-05, + "loss": 0.3424, + "num_input_tokens_seen": 26333736, + "step": 45400 + }, + { + "epoch": 6.762734584450402, + "grad_norm": 12.990144729614258, + "learning_rate": 4.1849226726604165e-05, + "loss": 0.2775, + "num_input_tokens_seen": 26336808, + "step": 45405 + }, + { + "epoch": 6.7634792969913615, + "grad_norm": 0.09604143351316452, + "learning_rate": 4.184682604310698e-05, + "loss": 0.0533, + "num_input_tokens_seen": 26339528, + "step": 45410 + }, + { + "epoch": 6.76422400953232, + "grad_norm": 0.904022216796875, + "learning_rate": 4.18444250749998e-05, + "loss": 0.5366, + "num_input_tokens_seen": 26342728, + "step": 45415 + }, + { + "epoch": 6.76496872207328, + "grad_norm": 26.87183952331543, + "learning_rate": 4.184202382232317e-05, + "loss": 0.295, + "num_input_tokens_seen": 26345800, + "step": 45420 + }, + { + "epoch": 6.765713434614239, + "grad_norm": 15.21982192993164, + "learning_rate": 4.183962228511767e-05, + "loss": 0.434, + "num_input_tokens_seen": 26348808, + "step": 45425 + }, + { + "epoch": 6.766458147155198, + "grad_norm": 0.6791869401931763, + "learning_rate": 4.183722046342386e-05, + "loss": 0.066, + "num_input_tokens_seen": 26351624, + "step": 45430 + }, + { + "epoch": 6.767202859696157, + "grad_norm": 23.96923065185547, + "learning_rate": 4.1834818357282336e-05, + "loss": 0.5026, + "num_input_tokens_seen": 26354472, + "step": 45435 + }, + { + "epoch": 6.767947572237117, + "grad_norm": 0.7311813235282898, + "learning_rate": 4.183241596673366e-05, + "loss": 0.1338, + "num_input_tokens_seen": 26357480, + "step": 45440 + }, + { + "epoch": 6.7686922847780755, + "grad_norm": 6.269680500030518, + "learning_rate": 4.183001329181843e-05, + "loss": 0.0762, + "num_input_tokens_seen": 26360360, + "step": 45445 + }, + { + "epoch": 6.769436997319035, + "grad_norm": 16.753786087036133, + "learning_rate": 4.1827610332577214e-05, + "loss": 0.4806, + "num_input_tokens_seen": 26363080, + "step": 45450 + }, + { + "epoch": 6.770181709859994, + "grad_norm": 47.00687026977539, + "learning_rate": 4.1825207089050634e-05, + "loss": 0.4941, + "num_input_tokens_seen": 26366024, + "step": 45455 + }, + { + "epoch": 6.7709264224009535, + "grad_norm": 0.08273439854383469, + "learning_rate": 4.182280356127928e-05, + "loss": 0.017, + "num_input_tokens_seen": 26369128, + "step": 45460 + }, + { + "epoch": 6.771671134941912, + "grad_norm": 28.27448272705078, + "learning_rate": 4.182039974930376e-05, + "loss": 0.1186, + "num_input_tokens_seen": 26372136, + "step": 45465 + }, + { + "epoch": 6.772415847482872, + "grad_norm": 0.03017289564013481, + "learning_rate": 4.1817995653164675e-05, + "loss": 0.1268, + "num_input_tokens_seen": 26375240, + "step": 45470 + }, + { + "epoch": 6.773160560023831, + "grad_norm": 22.949501037597656, + "learning_rate": 4.1815591272902654e-05, + "loss": 0.3528, + "num_input_tokens_seen": 26377992, + "step": 45475 + }, + { + "epoch": 6.77390527256479, + "grad_norm": 57.627418518066406, + "learning_rate": 4.1813186608558305e-05, + "loss": 0.3773, + "num_input_tokens_seen": 26381064, + "step": 45480 + }, + { + "epoch": 6.774649985105749, + "grad_norm": 0.01889774389564991, + "learning_rate": 4.181078166017226e-05, + "loss": 0.1395, + "num_input_tokens_seen": 26384040, + "step": 45485 + }, + { + "epoch": 6.775394697646709, + "grad_norm": 1.992582082748413, + "learning_rate": 4.180837642778513e-05, + "loss": 0.1127, + "num_input_tokens_seen": 26387112, + "step": 45490 + }, + { + "epoch": 6.7761394101876675, + "grad_norm": 23.822376251220703, + "learning_rate": 4.180597091143759e-05, + "loss": 0.3223, + "num_input_tokens_seen": 26389992, + "step": 45495 + }, + { + "epoch": 6.776884122728626, + "grad_norm": 0.017623331397771835, + "learning_rate": 4.1803565111170227e-05, + "loss": 0.0465, + "num_input_tokens_seen": 26392456, + "step": 45500 + }, + { + "epoch": 6.777628835269586, + "grad_norm": 9.837831497192383, + "learning_rate": 4.180115902702372e-05, + "loss": 0.3647, + "num_input_tokens_seen": 26395592, + "step": 45505 + }, + { + "epoch": 6.778373547810546, + "grad_norm": 19.96366310119629, + "learning_rate": 4.179875265903871e-05, + "loss": 0.2019, + "num_input_tokens_seen": 26398280, + "step": 45510 + }, + { + "epoch": 6.779118260351504, + "grad_norm": 0.062114086002111435, + "learning_rate": 4.1796346007255844e-05, + "loss": 0.1735, + "num_input_tokens_seen": 26401064, + "step": 45515 + }, + { + "epoch": 6.779862972892463, + "grad_norm": 0.5039588212966919, + "learning_rate": 4.1793939071715786e-05, + "loss": 0.2006, + "num_input_tokens_seen": 26403752, + "step": 45520 + }, + { + "epoch": 6.780607685433423, + "grad_norm": 1.1609628200531006, + "learning_rate": 4.1791531852459196e-05, + "loss": 0.408, + "num_input_tokens_seen": 26406600, + "step": 45525 + }, + { + "epoch": 6.781352397974382, + "grad_norm": 0.14611399173736572, + "learning_rate": 4.1789124349526745e-05, + "loss": 0.2189, + "num_input_tokens_seen": 26409512, + "step": 45530 + }, + { + "epoch": 6.782097110515341, + "grad_norm": 0.029236411675810814, + "learning_rate": 4.178671656295909e-05, + "loss": 0.3355, + "num_input_tokens_seen": 26412584, + "step": 45535 + }, + { + "epoch": 6.7828418230563, + "grad_norm": 32.81047439575195, + "learning_rate": 4.1784308492796926e-05, + "loss": 0.3035, + "num_input_tokens_seen": 26415528, + "step": 45540 + }, + { + "epoch": 6.7835865355972595, + "grad_norm": 0.05160272866487503, + "learning_rate": 4.1781900139080933e-05, + "loss": 0.2028, + "num_input_tokens_seen": 26418344, + "step": 45545 + }, + { + "epoch": 6.784331248138218, + "grad_norm": 11.790828704833984, + "learning_rate": 4.1779491501851786e-05, + "loss": 0.3918, + "num_input_tokens_seen": 26420872, + "step": 45550 + }, + { + "epoch": 6.785075960679178, + "grad_norm": 20.49103355407715, + "learning_rate": 4.177708258115019e-05, + "loss": 0.2829, + "num_input_tokens_seen": 26423752, + "step": 45555 + }, + { + "epoch": 6.785820673220137, + "grad_norm": 70.82440948486328, + "learning_rate": 4.177467337701683e-05, + "loss": 0.3112, + "num_input_tokens_seen": 26426568, + "step": 45560 + }, + { + "epoch": 6.786565385761096, + "grad_norm": 21.675678253173828, + "learning_rate": 4.177226388949241e-05, + "loss": 0.3258, + "num_input_tokens_seen": 26429608, + "step": 45565 + }, + { + "epoch": 6.787310098302055, + "grad_norm": 29.51146125793457, + "learning_rate": 4.176985411861765e-05, + "loss": 0.0371, + "num_input_tokens_seen": 26432776, + "step": 45570 + }, + { + "epoch": 6.788054810843015, + "grad_norm": 0.07597760111093521, + "learning_rate": 4.1767444064433244e-05, + "loss": 0.1709, + "num_input_tokens_seen": 26435784, + "step": 45575 + }, + { + "epoch": 6.7887995233839735, + "grad_norm": 2.178027868270874, + "learning_rate": 4.1765033726979906e-05, + "loss": 0.1735, + "num_input_tokens_seen": 26438408, + "step": 45580 + }, + { + "epoch": 6.789544235924933, + "grad_norm": 20.105379104614258, + "learning_rate": 4.176262310629837e-05, + "loss": 0.5168, + "num_input_tokens_seen": 26441544, + "step": 45585 + }, + { + "epoch": 6.790288948465892, + "grad_norm": 16.03126335144043, + "learning_rate": 4.176021220242935e-05, + "loss": 0.2676, + "num_input_tokens_seen": 26444136, + "step": 45590 + }, + { + "epoch": 6.791033661006852, + "grad_norm": 33.723392486572266, + "learning_rate": 4.175780101541358e-05, + "loss": 0.1098, + "num_input_tokens_seen": 26446952, + "step": 45595 + }, + { + "epoch": 6.79177837354781, + "grad_norm": 12.474100112915039, + "learning_rate": 4.175538954529179e-05, + "loss": 0.2646, + "num_input_tokens_seen": 26449800, + "step": 45600 + }, + { + "epoch": 6.79252308608877, + "grad_norm": 2.706312656402588, + "learning_rate": 4.175297779210473e-05, + "loss": 0.2079, + "num_input_tokens_seen": 26452776, + "step": 45605 + }, + { + "epoch": 6.793267798629729, + "grad_norm": 0.7253775596618652, + "learning_rate": 4.1750565755893134e-05, + "loss": 0.1437, + "num_input_tokens_seen": 26455528, + "step": 45610 + }, + { + "epoch": 6.794012511170688, + "grad_norm": 0.5257652997970581, + "learning_rate": 4.174815343669775e-05, + "loss": 0.4034, + "num_input_tokens_seen": 26458632, + "step": 45615 + }, + { + "epoch": 6.794757223711647, + "grad_norm": 0.12302277237176895, + "learning_rate": 4.1745740834559335e-05, + "loss": 0.2211, + "num_input_tokens_seen": 26461768, + "step": 45620 + }, + { + "epoch": 6.795501936252607, + "grad_norm": 0.10216782987117767, + "learning_rate": 4.174332794951866e-05, + "loss": 0.3318, + "num_input_tokens_seen": 26464552, + "step": 45625 + }, + { + "epoch": 6.7962466487935655, + "grad_norm": 0.40444710850715637, + "learning_rate": 4.174091478161646e-05, + "loss": 0.2668, + "num_input_tokens_seen": 26467496, + "step": 45630 + }, + { + "epoch": 6.796991361334525, + "grad_norm": 9.37839412689209, + "learning_rate": 4.173850133089353e-05, + "loss": 0.2776, + "num_input_tokens_seen": 26470024, + "step": 45635 + }, + { + "epoch": 6.797736073875484, + "grad_norm": 40.43608093261719, + "learning_rate": 4.173608759739063e-05, + "loss": 0.4714, + "num_input_tokens_seen": 26473096, + "step": 45640 + }, + { + "epoch": 6.798480786416444, + "grad_norm": 0.1084446832537651, + "learning_rate": 4.173367358114855e-05, + "loss": 0.2331, + "num_input_tokens_seen": 26476424, + "step": 45645 + }, + { + "epoch": 6.799225498957402, + "grad_norm": 25.41679573059082, + "learning_rate": 4.1731259282208047e-05, + "loss": 0.191, + "num_input_tokens_seen": 26479496, + "step": 45650 + }, + { + "epoch": 6.799970211498362, + "grad_norm": 0.3004581332206726, + "learning_rate": 4.1728844700609926e-05, + "loss": 0.2289, + "num_input_tokens_seen": 26482184, + "step": 45655 + }, + { + "epoch": 6.800714924039321, + "grad_norm": 3.1998116970062256, + "learning_rate": 4.172642983639498e-05, + "loss": 0.0177, + "num_input_tokens_seen": 26485128, + "step": 45660 + }, + { + "epoch": 6.8014596365802795, + "grad_norm": 23.874000549316406, + "learning_rate": 4.1724014689604e-05, + "loss": 0.0467, + "num_input_tokens_seen": 26487880, + "step": 45665 + }, + { + "epoch": 6.802204349121239, + "grad_norm": 0.8482291102409363, + "learning_rate": 4.1721599260277796e-05, + "loss": 0.03, + "num_input_tokens_seen": 26490632, + "step": 45670 + }, + { + "epoch": 6.802949061662199, + "grad_norm": 0.7917176485061646, + "learning_rate": 4.171918354845716e-05, + "loss": 0.2502, + "num_input_tokens_seen": 26493416, + "step": 45675 + }, + { + "epoch": 6.803693774203158, + "grad_norm": 7.983882904052734, + "learning_rate": 4.171676755418291e-05, + "loss": 0.0887, + "num_input_tokens_seen": 26496424, + "step": 45680 + }, + { + "epoch": 6.804438486744116, + "grad_norm": 15.6112060546875, + "learning_rate": 4.171435127749587e-05, + "loss": 0.1379, + "num_input_tokens_seen": 26499496, + "step": 45685 + }, + { + "epoch": 6.805183199285076, + "grad_norm": 1.0294189453125, + "learning_rate": 4.171193471843685e-05, + "loss": 0.2107, + "num_input_tokens_seen": 26502408, + "step": 45690 + }, + { + "epoch": 6.805927911826036, + "grad_norm": 5.353392124176025, + "learning_rate": 4.170951787704667e-05, + "loss": 0.1365, + "num_input_tokens_seen": 26505288, + "step": 45695 + }, + { + "epoch": 6.806672624366994, + "grad_norm": 6.102403163909912, + "learning_rate": 4.170710075336617e-05, + "loss": 0.1664, + "num_input_tokens_seen": 26508104, + "step": 45700 + }, + { + "epoch": 6.807417336907953, + "grad_norm": 27.401525497436523, + "learning_rate": 4.170468334743619e-05, + "loss": 0.3722, + "num_input_tokens_seen": 26511080, + "step": 45705 + }, + { + "epoch": 6.808162049448913, + "grad_norm": 0.1669459044933319, + "learning_rate": 4.1702265659297554e-05, + "loss": 0.3487, + "num_input_tokens_seen": 26514216, + "step": 45710 + }, + { + "epoch": 6.8089067619898715, + "grad_norm": 11.618437767028809, + "learning_rate": 4.169984768899112e-05, + "loss": 0.4917, + "num_input_tokens_seen": 26517384, + "step": 45715 + }, + { + "epoch": 6.809651474530831, + "grad_norm": 0.0697910338640213, + "learning_rate": 4.169742943655774e-05, + "loss": 0.2625, + "num_input_tokens_seen": 26520456, + "step": 45720 + }, + { + "epoch": 6.81039618707179, + "grad_norm": 19.350149154663086, + "learning_rate": 4.169501090203826e-05, + "loss": 0.2809, + "num_input_tokens_seen": 26523624, + "step": 45725 + }, + { + "epoch": 6.81114089961275, + "grad_norm": 9.025778770446777, + "learning_rate": 4.1692592085473525e-05, + "loss": 0.1021, + "num_input_tokens_seen": 26526504, + "step": 45730 + }, + { + "epoch": 6.811885612153708, + "grad_norm": 37.130367279052734, + "learning_rate": 4.169017298690442e-05, + "loss": 0.173, + "num_input_tokens_seen": 26529192, + "step": 45735 + }, + { + "epoch": 6.812630324694668, + "grad_norm": 15.387197494506836, + "learning_rate": 4.168775360637181e-05, + "loss": 0.2416, + "num_input_tokens_seen": 26532168, + "step": 45740 + }, + { + "epoch": 6.813375037235627, + "grad_norm": 15.188352584838867, + "learning_rate": 4.168533394391656e-05, + "loss": 0.1409, + "num_input_tokens_seen": 26534888, + "step": 45745 + }, + { + "epoch": 6.814119749776586, + "grad_norm": 5.927433013916016, + "learning_rate": 4.1682913999579545e-05, + "loss": 0.3728, + "num_input_tokens_seen": 26537800, + "step": 45750 + }, + { + "epoch": 6.814864462317545, + "grad_norm": 72.01142883300781, + "learning_rate": 4.1680493773401657e-05, + "loss": 0.2713, + "num_input_tokens_seen": 26540680, + "step": 45755 + }, + { + "epoch": 6.815609174858505, + "grad_norm": 18.294275283813477, + "learning_rate": 4.167807326542379e-05, + "loss": 0.4532, + "num_input_tokens_seen": 26543432, + "step": 45760 + }, + { + "epoch": 6.816353887399464, + "grad_norm": 6.418880462646484, + "learning_rate": 4.167565247568681e-05, + "loss": 0.3623, + "num_input_tokens_seen": 26546120, + "step": 45765 + }, + { + "epoch": 6.817098599940423, + "grad_norm": 2.7378342151641846, + "learning_rate": 4.167323140423164e-05, + "loss": 0.2351, + "num_input_tokens_seen": 26548936, + "step": 45770 + }, + { + "epoch": 6.817843312481382, + "grad_norm": 0.3516349792480469, + "learning_rate": 4.167081005109917e-05, + "loss": 0.076, + "num_input_tokens_seen": 26551816, + "step": 45775 + }, + { + "epoch": 6.818588025022342, + "grad_norm": 3.131828546524048, + "learning_rate": 4.16683884163303e-05, + "loss": 0.2537, + "num_input_tokens_seen": 26554632, + "step": 45780 + }, + { + "epoch": 6.8193327375633, + "grad_norm": 27.244108200073242, + "learning_rate": 4.166596649996596e-05, + "loss": 0.2762, + "num_input_tokens_seen": 26557576, + "step": 45785 + }, + { + "epoch": 6.82007745010426, + "grad_norm": 14.037951469421387, + "learning_rate": 4.166354430204705e-05, + "loss": 0.1295, + "num_input_tokens_seen": 26560264, + "step": 45790 + }, + { + "epoch": 6.820822162645219, + "grad_norm": 79.73348999023438, + "learning_rate": 4.166112182261449e-05, + "loss": 0.3545, + "num_input_tokens_seen": 26563048, + "step": 45795 + }, + { + "epoch": 6.821566875186178, + "grad_norm": 14.810155868530273, + "learning_rate": 4.1658699061709215e-05, + "loss": 0.3679, + "num_input_tokens_seen": 26565960, + "step": 45800 + }, + { + "epoch": 6.822311587727137, + "grad_norm": 0.2998676300048828, + "learning_rate": 4.1656276019372156e-05, + "loss": 0.144, + "num_input_tokens_seen": 26568872, + "step": 45805 + }, + { + "epoch": 6.823056300268097, + "grad_norm": 17.096681594848633, + "learning_rate": 4.165385269564423e-05, + "loss": 0.3912, + "num_input_tokens_seen": 26571624, + "step": 45810 + }, + { + "epoch": 6.823801012809056, + "grad_norm": 86.29662322998047, + "learning_rate": 4.16514290905664e-05, + "loss": 0.2688, + "num_input_tokens_seen": 26574440, + "step": 45815 + }, + { + "epoch": 6.824545725350015, + "grad_norm": 0.19547082483768463, + "learning_rate": 4.164900520417959e-05, + "loss": 0.0796, + "num_input_tokens_seen": 26577256, + "step": 45820 + }, + { + "epoch": 6.825290437890974, + "grad_norm": 9.605208396911621, + "learning_rate": 4.164658103652477e-05, + "loss": 0.1811, + "num_input_tokens_seen": 26580328, + "step": 45825 + }, + { + "epoch": 6.826035150431934, + "grad_norm": 1.311478614807129, + "learning_rate": 4.164415658764287e-05, + "loss": 0.1907, + "num_input_tokens_seen": 26583048, + "step": 45830 + }, + { + "epoch": 6.826779862972892, + "grad_norm": 5.0539703369140625, + "learning_rate": 4.164173185757487e-05, + "loss": 0.158, + "num_input_tokens_seen": 26585864, + "step": 45835 + }, + { + "epoch": 6.827524575513852, + "grad_norm": 6.297667980194092, + "learning_rate": 4.163930684636173e-05, + "loss": 0.1889, + "num_input_tokens_seen": 26588808, + "step": 45840 + }, + { + "epoch": 6.828269288054811, + "grad_norm": 0.2905386686325073, + "learning_rate": 4.16368815540444e-05, + "loss": 0.2928, + "num_input_tokens_seen": 26591528, + "step": 45845 + }, + { + "epoch": 6.82901400059577, + "grad_norm": 89.95918273925781, + "learning_rate": 4.1634455980663866e-05, + "loss": 0.1906, + "num_input_tokens_seen": 26594312, + "step": 45850 + }, + { + "epoch": 6.829758713136729, + "grad_norm": 3.747208833694458, + "learning_rate": 4.163203012626111e-05, + "loss": 0.3554, + "num_input_tokens_seen": 26597224, + "step": 45855 + }, + { + "epoch": 6.830503425677689, + "grad_norm": 0.13516157865524292, + "learning_rate": 4.16296039908771e-05, + "loss": 0.0191, + "num_input_tokens_seen": 26600168, + "step": 45860 + }, + { + "epoch": 6.831248138218648, + "grad_norm": 0.32044631242752075, + "learning_rate": 4.162717757455284e-05, + "loss": 0.0569, + "num_input_tokens_seen": 26603016, + "step": 45865 + }, + { + "epoch": 6.831992850759606, + "grad_norm": 12.664250373840332, + "learning_rate": 4.162475087732931e-05, + "loss": 0.3691, + "num_input_tokens_seen": 26606088, + "step": 45870 + }, + { + "epoch": 6.832737563300566, + "grad_norm": 35.30840301513672, + "learning_rate": 4.162232389924751e-05, + "loss": 0.1982, + "num_input_tokens_seen": 26609064, + "step": 45875 + }, + { + "epoch": 6.833482275841525, + "grad_norm": 17.97186279296875, + "learning_rate": 4.1619896640348445e-05, + "loss": 0.1524, + "num_input_tokens_seen": 26611912, + "step": 45880 + }, + { + "epoch": 6.834226988382484, + "grad_norm": 1.0953278541564941, + "learning_rate": 4.1617469100673126e-05, + "loss": 0.1466, + "num_input_tokens_seen": 26614728, + "step": 45885 + }, + { + "epoch": 6.834971700923443, + "grad_norm": 37.474517822265625, + "learning_rate": 4.161504128026255e-05, + "loss": 0.3171, + "num_input_tokens_seen": 26617896, + "step": 45890 + }, + { + "epoch": 6.835716413464403, + "grad_norm": 22.287649154663086, + "learning_rate": 4.1612613179157725e-05, + "loss": 0.1444, + "num_input_tokens_seen": 26620936, + "step": 45895 + }, + { + "epoch": 6.836461126005362, + "grad_norm": 11.353158950805664, + "learning_rate": 4.1610184797399696e-05, + "loss": 0.1894, + "num_input_tokens_seen": 26624104, + "step": 45900 + }, + { + "epoch": 6.837205838546321, + "grad_norm": 3.0813350677490234, + "learning_rate": 4.160775613502948e-05, + "loss": 0.4, + "num_input_tokens_seen": 26626760, + "step": 45905 + }, + { + "epoch": 6.83795055108728, + "grad_norm": 24.306865692138672, + "learning_rate": 4.160532719208809e-05, + "loss": 0.2871, + "num_input_tokens_seen": 26629736, + "step": 45910 + }, + { + "epoch": 6.83869526362824, + "grad_norm": 6.107914447784424, + "learning_rate": 4.160289796861659e-05, + "loss": 0.0136, + "num_input_tokens_seen": 26632680, + "step": 45915 + }, + { + "epoch": 6.839439976169198, + "grad_norm": 4.8830695152282715, + "learning_rate": 4.1600468464656e-05, + "loss": 0.18, + "num_input_tokens_seen": 26635560, + "step": 45920 + }, + { + "epoch": 6.840184688710158, + "grad_norm": 0.36273080110549927, + "learning_rate": 4.1598038680247363e-05, + "loss": 0.0275, + "num_input_tokens_seen": 26638568, + "step": 45925 + }, + { + "epoch": 6.840929401251117, + "grad_norm": 0.037570539861917496, + "learning_rate": 4.159560861543174e-05, + "loss": 0.1711, + "num_input_tokens_seen": 26641448, + "step": 45930 + }, + { + "epoch": 6.8416741137920765, + "grad_norm": 18.88705062866211, + "learning_rate": 4.159317827025016e-05, + "loss": 0.2724, + "num_input_tokens_seen": 26644520, + "step": 45935 + }, + { + "epoch": 6.842418826333035, + "grad_norm": 30.999887466430664, + "learning_rate": 4.159074764474371e-05, + "loss": 0.1714, + "num_input_tokens_seen": 26647208, + "step": 45940 + }, + { + "epoch": 6.843163538873995, + "grad_norm": 0.038857460021972656, + "learning_rate": 4.1588316738953434e-05, + "loss": 0.0195, + "num_input_tokens_seen": 26650312, + "step": 45945 + }, + { + "epoch": 6.843908251414954, + "grad_norm": 0.008813275955617428, + "learning_rate": 4.158588555292041e-05, + "loss": 0.2348, + "num_input_tokens_seen": 26653160, + "step": 45950 + }, + { + "epoch": 6.844652963955913, + "grad_norm": 18.571468353271484, + "learning_rate": 4.158345408668571e-05, + "loss": 0.6831, + "num_input_tokens_seen": 26656456, + "step": 45955 + }, + { + "epoch": 6.845397676496872, + "grad_norm": 44.36162567138672, + "learning_rate": 4.15810223402904e-05, + "loss": 0.1102, + "num_input_tokens_seen": 26659400, + "step": 45960 + }, + { + "epoch": 6.846142389037832, + "grad_norm": 27.212661743164062, + "learning_rate": 4.157859031377558e-05, + "loss": 0.2711, + "num_input_tokens_seen": 26662248, + "step": 45965 + }, + { + "epoch": 6.84688710157879, + "grad_norm": 0.0898815244436264, + "learning_rate": 4.157615800718232e-05, + "loss": 0.3129, + "num_input_tokens_seen": 26664904, + "step": 45970 + }, + { + "epoch": 6.84763181411975, + "grad_norm": 0.06674085557460785, + "learning_rate": 4.1573725420551716e-05, + "loss": 0.072, + "num_input_tokens_seen": 26667848, + "step": 45975 + }, + { + "epoch": 6.848376526660709, + "grad_norm": 6.332658290863037, + "learning_rate": 4.157129255392487e-05, + "loss": 0.4228, + "num_input_tokens_seen": 26670952, + "step": 45980 + }, + { + "epoch": 6.8491212392016685, + "grad_norm": 18.49519157409668, + "learning_rate": 4.1568859407342876e-05, + "loss": 0.2739, + "num_input_tokens_seen": 26673384, + "step": 45985 + }, + { + "epoch": 6.849865951742627, + "grad_norm": 8.540017127990723, + "learning_rate": 4.1566425980846844e-05, + "loss": 0.1919, + "num_input_tokens_seen": 26676552, + "step": 45990 + }, + { + "epoch": 6.850610664283587, + "grad_norm": 33.53857421875, + "learning_rate": 4.156399227447788e-05, + "loss": 0.2129, + "num_input_tokens_seen": 26679304, + "step": 45995 + }, + { + "epoch": 6.851355376824546, + "grad_norm": 16.705520629882812, + "learning_rate": 4.15615582882771e-05, + "loss": 0.3317, + "num_input_tokens_seen": 26682088, + "step": 46000 + }, + { + "epoch": 6.852100089365505, + "grad_norm": 16.765422821044922, + "learning_rate": 4.155912402228563e-05, + "loss": 0.1647, + "num_input_tokens_seen": 26684904, + "step": 46005 + }, + { + "epoch": 6.852844801906464, + "grad_norm": 38.46474838256836, + "learning_rate": 4.155668947654458e-05, + "loss": 0.1812, + "num_input_tokens_seen": 26687976, + "step": 46010 + }, + { + "epoch": 6.853589514447423, + "grad_norm": 12.1238431930542, + "learning_rate": 4.15542546510951e-05, + "loss": 0.1771, + "num_input_tokens_seen": 26690792, + "step": 46015 + }, + { + "epoch": 6.8543342269883825, + "grad_norm": 16.0915584564209, + "learning_rate": 4.155181954597832e-05, + "loss": 0.2887, + "num_input_tokens_seen": 26693704, + "step": 46020 + }, + { + "epoch": 6.855078939529342, + "grad_norm": 8.466179847717285, + "learning_rate": 4.154938416123535e-05, + "loss": 0.2197, + "num_input_tokens_seen": 26696552, + "step": 46025 + }, + { + "epoch": 6.855823652070301, + "grad_norm": 1.5028384923934937, + "learning_rate": 4.154694849690737e-05, + "loss": 0.1094, + "num_input_tokens_seen": 26699784, + "step": 46030 + }, + { + "epoch": 6.85656836461126, + "grad_norm": 0.7650251388549805, + "learning_rate": 4.154451255303551e-05, + "loss": 0.1961, + "num_input_tokens_seen": 26702856, + "step": 46035 + }, + { + "epoch": 6.857313077152219, + "grad_norm": 5.847568035125732, + "learning_rate": 4.154207632966092e-05, + "loss": 0.4453, + "num_input_tokens_seen": 26705832, + "step": 46040 + }, + { + "epoch": 6.858057789693179, + "grad_norm": 4.229816436767578, + "learning_rate": 4.1539639826824765e-05, + "loss": 0.1938, + "num_input_tokens_seen": 26708616, + "step": 46045 + }, + { + "epoch": 6.858802502234138, + "grad_norm": 6.146960258483887, + "learning_rate": 4.1537203044568205e-05, + "loss": 0.0884, + "num_input_tokens_seen": 26711080, + "step": 46050 + }, + { + "epoch": 6.859547214775096, + "grad_norm": 0.7375008463859558, + "learning_rate": 4.153476598293241e-05, + "loss": 0.0414, + "num_input_tokens_seen": 26713768, + "step": 46055 + }, + { + "epoch": 6.860291927316056, + "grad_norm": 24.976627349853516, + "learning_rate": 4.153232864195855e-05, + "loss": 0.2104, + "num_input_tokens_seen": 26716680, + "step": 46060 + }, + { + "epoch": 6.861036639857015, + "grad_norm": 45.18174743652344, + "learning_rate": 4.1529891021687796e-05, + "loss": 0.4789, + "num_input_tokens_seen": 26719624, + "step": 46065 + }, + { + "epoch": 6.8617813523979745, + "grad_norm": 0.0640542283654213, + "learning_rate": 4.152745312216134e-05, + "loss": 0.0325, + "num_input_tokens_seen": 26722728, + "step": 46070 + }, + { + "epoch": 6.862526064938933, + "grad_norm": 0.6513854265213013, + "learning_rate": 4.152501494342035e-05, + "loss": 0.5079, + "num_input_tokens_seen": 26725896, + "step": 46075 + }, + { + "epoch": 6.863270777479893, + "grad_norm": 13.761378288269043, + "learning_rate": 4.152257648550604e-05, + "loss": 0.2209, + "num_input_tokens_seen": 26729128, + "step": 46080 + }, + { + "epoch": 6.864015490020852, + "grad_norm": 12.759798049926758, + "learning_rate": 4.1520137748459587e-05, + "loss": 0.2559, + "num_input_tokens_seen": 26732008, + "step": 46085 + }, + { + "epoch": 6.864760202561811, + "grad_norm": 11.915783882141113, + "learning_rate": 4.1517698732322194e-05, + "loss": 0.1331, + "num_input_tokens_seen": 26734824, + "step": 46090 + }, + { + "epoch": 6.86550491510277, + "grad_norm": 14.88746166229248, + "learning_rate": 4.1515259437135076e-05, + "loss": 0.4126, + "num_input_tokens_seen": 26737896, + "step": 46095 + }, + { + "epoch": 6.86624962764373, + "grad_norm": 0.9607973098754883, + "learning_rate": 4.1512819862939425e-05, + "loss": 0.2336, + "num_input_tokens_seen": 26740488, + "step": 46100 + }, + { + "epoch": 6.8669943401846885, + "grad_norm": 0.1483457237482071, + "learning_rate": 4.1510380009776475e-05, + "loss": 0.2933, + "num_input_tokens_seen": 26743336, + "step": 46105 + }, + { + "epoch": 6.867739052725648, + "grad_norm": 10.81159496307373, + "learning_rate": 4.150793987768743e-05, + "loss": 0.465, + "num_input_tokens_seen": 26746088, + "step": 46110 + }, + { + "epoch": 6.868483765266607, + "grad_norm": 0.1450534164905548, + "learning_rate": 4.1505499466713516e-05, + "loss": 0.0532, + "num_input_tokens_seen": 26749032, + "step": 46115 + }, + { + "epoch": 6.8692284778075665, + "grad_norm": 0.01521777082234621, + "learning_rate": 4.1503058776895974e-05, + "loss": 0.2053, + "num_input_tokens_seen": 26752072, + "step": 46120 + }, + { + "epoch": 6.869973190348525, + "grad_norm": 0.7336267828941345, + "learning_rate": 4.150061780827602e-05, + "loss": 0.0905, + "num_input_tokens_seen": 26754856, + "step": 46125 + }, + { + "epoch": 6.870717902889485, + "grad_norm": 0.630050539970398, + "learning_rate": 4.14981765608949e-05, + "loss": 0.4652, + "num_input_tokens_seen": 26757864, + "step": 46130 + }, + { + "epoch": 6.871462615430444, + "grad_norm": 0.14847996830940247, + "learning_rate": 4.1495735034793856e-05, + "loss": 0.2237, + "num_input_tokens_seen": 26760872, + "step": 46135 + }, + { + "epoch": 6.872207327971403, + "grad_norm": 0.11785834282636642, + "learning_rate": 4.149329323001413e-05, + "loss": 0.2909, + "num_input_tokens_seen": 26763688, + "step": 46140 + }, + { + "epoch": 6.872952040512362, + "grad_norm": 19.522138595581055, + "learning_rate": 4.149085114659699e-05, + "loss": 0.5441, + "num_input_tokens_seen": 26766472, + "step": 46145 + }, + { + "epoch": 6.873696753053322, + "grad_norm": 0.7065876722335815, + "learning_rate": 4.1488408784583664e-05, + "loss": 0.1967, + "num_input_tokens_seen": 26769608, + "step": 46150 + }, + { + "epoch": 6.8744414655942805, + "grad_norm": 11.318220138549805, + "learning_rate": 4.148596614401544e-05, + "loss": 0.2211, + "num_input_tokens_seen": 26772520, + "step": 46155 + }, + { + "epoch": 6.87518617813524, + "grad_norm": 22.75167465209961, + "learning_rate": 4.148352322493357e-05, + "loss": 0.3942, + "num_input_tokens_seen": 26775496, + "step": 46160 + }, + { + "epoch": 6.875930890676199, + "grad_norm": 0.05121331661939621, + "learning_rate": 4.148108002737933e-05, + "loss": 0.3173, + "num_input_tokens_seen": 26778568, + "step": 46165 + }, + { + "epoch": 6.8766756032171585, + "grad_norm": 0.05325683578848839, + "learning_rate": 4.147863655139399e-05, + "loss": 0.1303, + "num_input_tokens_seen": 26781320, + "step": 46170 + }, + { + "epoch": 6.877420315758117, + "grad_norm": 0.26960289478302, + "learning_rate": 4.1476192797018836e-05, + "loss": 0.058, + "num_input_tokens_seen": 26784008, + "step": 46175 + }, + { + "epoch": 6.878165028299077, + "grad_norm": 0.16217224299907684, + "learning_rate": 4.147374876429515e-05, + "loss": 0.1305, + "num_input_tokens_seen": 26786824, + "step": 46180 + }, + { + "epoch": 6.878909740840036, + "grad_norm": 36.11410903930664, + "learning_rate": 4.1471304453264225e-05, + "loss": 0.2931, + "num_input_tokens_seen": 26789800, + "step": 46185 + }, + { + "epoch": 6.879654453380995, + "grad_norm": 4.206353664398193, + "learning_rate": 4.1468859863967345e-05, + "loss": 0.2355, + "num_input_tokens_seen": 26792712, + "step": 46190 + }, + { + "epoch": 6.880399165921954, + "grad_norm": 21.386817932128906, + "learning_rate": 4.1466414996445824e-05, + "loss": 0.4495, + "num_input_tokens_seen": 26795592, + "step": 46195 + }, + { + "epoch": 6.881143878462913, + "grad_norm": 0.7718372941017151, + "learning_rate": 4.146396985074095e-05, + "loss": 0.0434, + "num_input_tokens_seen": 26798440, + "step": 46200 + }, + { + "epoch": 6.8818885910038725, + "grad_norm": 10.227683067321777, + "learning_rate": 4.146152442689405e-05, + "loss": 0.1451, + "num_input_tokens_seen": 26801224, + "step": 46205 + }, + { + "epoch": 6.882633303544832, + "grad_norm": 38.48842239379883, + "learning_rate": 4.1459078724946406e-05, + "loss": 0.2463, + "num_input_tokens_seen": 26803976, + "step": 46210 + }, + { + "epoch": 6.883378016085791, + "grad_norm": 0.07351404428482056, + "learning_rate": 4.1456632744939375e-05, + "loss": 0.1643, + "num_input_tokens_seen": 26807112, + "step": 46215 + }, + { + "epoch": 6.88412272862675, + "grad_norm": 23.24532699584961, + "learning_rate": 4.145418648691425e-05, + "loss": 0.2672, + "num_input_tokens_seen": 26809864, + "step": 46220 + }, + { + "epoch": 6.884867441167709, + "grad_norm": 0.0446920171380043, + "learning_rate": 4.1451739950912365e-05, + "loss": 0.0977, + "num_input_tokens_seen": 26812552, + "step": 46225 + }, + { + "epoch": 6.885612153708668, + "grad_norm": 0.21772290766239166, + "learning_rate": 4.144929313697506e-05, + "loss": 0.1634, + "num_input_tokens_seen": 26815304, + "step": 46230 + }, + { + "epoch": 6.886356866249628, + "grad_norm": 61.88821792602539, + "learning_rate": 4.144684604514366e-05, + "loss": 0.2875, + "num_input_tokens_seen": 26818184, + "step": 46235 + }, + { + "epoch": 6.8871015787905865, + "grad_norm": 0.10271623730659485, + "learning_rate": 4.144439867545952e-05, + "loss": 0.0116, + "num_input_tokens_seen": 26820808, + "step": 46240 + }, + { + "epoch": 6.887846291331546, + "grad_norm": 10.808184623718262, + "learning_rate": 4.144195102796398e-05, + "loss": 0.3003, + "num_input_tokens_seen": 26823784, + "step": 46245 + }, + { + "epoch": 6.888591003872505, + "grad_norm": 20.865039825439453, + "learning_rate": 4.143950310269837e-05, + "loss": 0.1384, + "num_input_tokens_seen": 26826632, + "step": 46250 + }, + { + "epoch": 6.8893357164134645, + "grad_norm": 0.07442519813776016, + "learning_rate": 4.143705489970408e-05, + "loss": 0.2329, + "num_input_tokens_seen": 26829768, + "step": 46255 + }, + { + "epoch": 6.890080428954423, + "grad_norm": 3.1136672496795654, + "learning_rate": 4.143460641902245e-05, + "loss": 0.1385, + "num_input_tokens_seen": 26832744, + "step": 46260 + }, + { + "epoch": 6.890825141495383, + "grad_norm": 4.817009925842285, + "learning_rate": 4.143215766069484e-05, + "loss": 0.2441, + "num_input_tokens_seen": 26836040, + "step": 46265 + }, + { + "epoch": 6.891569854036342, + "grad_norm": 8.142407417297363, + "learning_rate": 4.142970862476264e-05, + "loss": 0.501, + "num_input_tokens_seen": 26838952, + "step": 46270 + }, + { + "epoch": 6.892314566577301, + "grad_norm": 0.2731039226055145, + "learning_rate": 4.14272593112672e-05, + "loss": 0.253, + "num_input_tokens_seen": 26841928, + "step": 46275 + }, + { + "epoch": 6.89305927911826, + "grad_norm": 0.4695386588573456, + "learning_rate": 4.142480972024991e-05, + "loss": 0.1874, + "num_input_tokens_seen": 26844776, + "step": 46280 + }, + { + "epoch": 6.89380399165922, + "grad_norm": 8.935503959655762, + "learning_rate": 4.142235985175217e-05, + "loss": 0.2138, + "num_input_tokens_seen": 26847720, + "step": 46285 + }, + { + "epoch": 6.8945487042001785, + "grad_norm": 8.056787490844727, + "learning_rate": 4.141990970581534e-05, + "loss": 0.0999, + "num_input_tokens_seen": 26850408, + "step": 46290 + }, + { + "epoch": 6.895293416741138, + "grad_norm": 0.858002781867981, + "learning_rate": 4.141745928248082e-05, + "loss": 0.166, + "num_input_tokens_seen": 26853256, + "step": 46295 + }, + { + "epoch": 6.896038129282097, + "grad_norm": 26.206899642944336, + "learning_rate": 4.141500858179002e-05, + "loss": 0.228, + "num_input_tokens_seen": 26856296, + "step": 46300 + }, + { + "epoch": 6.896782841823057, + "grad_norm": 0.870672881603241, + "learning_rate": 4.141255760378432e-05, + "loss": 0.4114, + "num_input_tokens_seen": 26859016, + "step": 46305 + }, + { + "epoch": 6.897527554364015, + "grad_norm": 79.66545867919922, + "learning_rate": 4.141010634850515e-05, + "loss": 0.5851, + "num_input_tokens_seen": 26861992, + "step": 46310 + }, + { + "epoch": 6.898272266904975, + "grad_norm": 22.049116134643555, + "learning_rate": 4.140765481599391e-05, + "loss": 0.2726, + "num_input_tokens_seen": 26864808, + "step": 46315 + }, + { + "epoch": 6.899016979445934, + "grad_norm": 8.359123229980469, + "learning_rate": 4.1405203006292014e-05, + "loss": 0.0803, + "num_input_tokens_seen": 26867592, + "step": 46320 + }, + { + "epoch": 6.899761691986893, + "grad_norm": 19.16071891784668, + "learning_rate": 4.1402750919440894e-05, + "loss": 0.4963, + "num_input_tokens_seen": 26870568, + "step": 46325 + }, + { + "epoch": 6.900506404527852, + "grad_norm": 1.097401738166809, + "learning_rate": 4.140029855548196e-05, + "loss": 0.1641, + "num_input_tokens_seen": 26873416, + "step": 46330 + }, + { + "epoch": 6.901251117068812, + "grad_norm": 21.184621810913086, + "learning_rate": 4.1397845914456656e-05, + "loss": 0.1718, + "num_input_tokens_seen": 26876168, + "step": 46335 + }, + { + "epoch": 6.9019958296097705, + "grad_norm": 12.990138053894043, + "learning_rate": 4.139539299640641e-05, + "loss": 0.3089, + "num_input_tokens_seen": 26879336, + "step": 46340 + }, + { + "epoch": 6.90274054215073, + "grad_norm": 21.138702392578125, + "learning_rate": 4.139293980137267e-05, + "loss": 0.221, + "num_input_tokens_seen": 26882728, + "step": 46345 + }, + { + "epoch": 6.903485254691689, + "grad_norm": 16.447582244873047, + "learning_rate": 4.139048632939686e-05, + "loss": 0.4732, + "num_input_tokens_seen": 26886088, + "step": 46350 + }, + { + "epoch": 6.904229967232649, + "grad_norm": 14.961132049560547, + "learning_rate": 4.138803258052045e-05, + "loss": 0.2452, + "num_input_tokens_seen": 26889064, + "step": 46355 + }, + { + "epoch": 6.904974679773607, + "grad_norm": 28.79184913635254, + "learning_rate": 4.138557855478489e-05, + "loss": 0.2586, + "num_input_tokens_seen": 26891976, + "step": 46360 + }, + { + "epoch": 6.905719392314566, + "grad_norm": 4.5292158126831055, + "learning_rate": 4.1383124252231625e-05, + "loss": 0.3709, + "num_input_tokens_seen": 26894984, + "step": 46365 + }, + { + "epoch": 6.906464104855526, + "grad_norm": 0.16603627800941467, + "learning_rate": 4.138066967290213e-05, + "loss": 0.2534, + "num_input_tokens_seen": 26897768, + "step": 46370 + }, + { + "epoch": 6.907208817396485, + "grad_norm": 15.059267044067383, + "learning_rate": 4.137821481683787e-05, + "loss": 0.251, + "num_input_tokens_seen": 26900776, + "step": 46375 + }, + { + "epoch": 6.907953529937444, + "grad_norm": 13.686347007751465, + "learning_rate": 4.1375759684080314e-05, + "loss": 0.1334, + "num_input_tokens_seen": 26903624, + "step": 46380 + }, + { + "epoch": 6.908698242478403, + "grad_norm": 0.04674597084522247, + "learning_rate": 4.137330427467094e-05, + "loss": 0.3385, + "num_input_tokens_seen": 26906472, + "step": 46385 + }, + { + "epoch": 6.909442955019363, + "grad_norm": 0.28926143050193787, + "learning_rate": 4.137084858865124e-05, + "loss": 0.0533, + "num_input_tokens_seen": 26909448, + "step": 46390 + }, + { + "epoch": 6.910187667560322, + "grad_norm": 1.868955373764038, + "learning_rate": 4.1368392626062685e-05, + "loss": 0.2768, + "num_input_tokens_seen": 26912328, + "step": 46395 + }, + { + "epoch": 6.910932380101281, + "grad_norm": 5.701356410980225, + "learning_rate": 4.1365936386946776e-05, + "loss": 0.1206, + "num_input_tokens_seen": 26915016, + "step": 46400 + }, + { + "epoch": 6.91167709264224, + "grad_norm": 0.46395739912986755, + "learning_rate": 4.1363479871345e-05, + "loss": 0.1285, + "num_input_tokens_seen": 26918440, + "step": 46405 + }, + { + "epoch": 6.912421805183199, + "grad_norm": 14.542814254760742, + "learning_rate": 4.1361023079298874e-05, + "loss": 0.0296, + "num_input_tokens_seen": 26921064, + "step": 46410 + }, + { + "epoch": 6.913166517724158, + "grad_norm": 8.281011581420898, + "learning_rate": 4.135856601084988e-05, + "loss": 0.0254, + "num_input_tokens_seen": 26923752, + "step": 46415 + }, + { + "epoch": 6.913911230265118, + "grad_norm": 142.80213928222656, + "learning_rate": 4.135610866603955e-05, + "loss": 0.0547, + "num_input_tokens_seen": 26926600, + "step": 46420 + }, + { + "epoch": 6.9146559428060765, + "grad_norm": 28.826068878173828, + "learning_rate": 4.135365104490938e-05, + "loss": 0.6309, + "num_input_tokens_seen": 26929064, + "step": 46425 + }, + { + "epoch": 6.915400655347036, + "grad_norm": 96.74844360351562, + "learning_rate": 4.1351193147500887e-05, + "loss": 0.714, + "num_input_tokens_seen": 26931592, + "step": 46430 + }, + { + "epoch": 6.916145367887995, + "grad_norm": 0.07492105662822723, + "learning_rate": 4.134873497385562e-05, + "loss": 0.1302, + "num_input_tokens_seen": 26934472, + "step": 46435 + }, + { + "epoch": 6.916890080428955, + "grad_norm": 0.032570309937000275, + "learning_rate": 4.1346276524015085e-05, + "loss": 0.4508, + "num_input_tokens_seen": 26937512, + "step": 46440 + }, + { + "epoch": 6.917634792969913, + "grad_norm": 0.06296688318252563, + "learning_rate": 4.1343817798020824e-05, + "loss": 0.229, + "num_input_tokens_seen": 26941064, + "step": 46445 + }, + { + "epoch": 6.918379505510873, + "grad_norm": 0.8056623935699463, + "learning_rate": 4.1341358795914375e-05, + "loss": 0.0874, + "num_input_tokens_seen": 26943976, + "step": 46450 + }, + { + "epoch": 6.919124218051832, + "grad_norm": 0.15428540110588074, + "learning_rate": 4.133889951773727e-05, + "loss": 0.2613, + "num_input_tokens_seen": 26947048, + "step": 46455 + }, + { + "epoch": 6.919868930592791, + "grad_norm": 34.87504959106445, + "learning_rate": 4.133643996353107e-05, + "loss": 0.3583, + "num_input_tokens_seen": 26950088, + "step": 46460 + }, + { + "epoch": 6.92061364313375, + "grad_norm": 0.1833622306585312, + "learning_rate": 4.1333980133337324e-05, + "loss": 0.1931, + "num_input_tokens_seen": 26952648, + "step": 46465 + }, + { + "epoch": 6.92135835567471, + "grad_norm": 0.038631096482276917, + "learning_rate": 4.133152002719758e-05, + "loss": 0.0147, + "num_input_tokens_seen": 26955432, + "step": 46470 + }, + { + "epoch": 6.922103068215669, + "grad_norm": 10.027575492858887, + "learning_rate": 4.13290596451534e-05, + "loss": 0.4037, + "num_input_tokens_seen": 26958408, + "step": 46475 + }, + { + "epoch": 6.922847780756628, + "grad_norm": 6.000871181488037, + "learning_rate": 4.1326598987246356e-05, + "loss": 0.2035, + "num_input_tokens_seen": 26961448, + "step": 46480 + }, + { + "epoch": 6.923592493297587, + "grad_norm": 0.36833295226097107, + "learning_rate": 4.132413805351802e-05, + "loss": 0.1345, + "num_input_tokens_seen": 26964616, + "step": 46485 + }, + { + "epoch": 6.924337205838547, + "grad_norm": 0.31191691756248474, + "learning_rate": 4.1321676844009957e-05, + "loss": 0.1309, + "num_input_tokens_seen": 26967944, + "step": 46490 + }, + { + "epoch": 6.925081918379505, + "grad_norm": 31.745281219482422, + "learning_rate": 4.1319215358763756e-05, + "loss": 0.2525, + "num_input_tokens_seen": 26970920, + "step": 46495 + }, + { + "epoch": 6.925826630920465, + "grad_norm": 13.372968673706055, + "learning_rate": 4.1316753597821e-05, + "loss": 0.054, + "num_input_tokens_seen": 26973704, + "step": 46500 + }, + { + "epoch": 6.926571343461424, + "grad_norm": 54.43116760253906, + "learning_rate": 4.1314291561223276e-05, + "loss": 0.2518, + "num_input_tokens_seen": 26976520, + "step": 46505 + }, + { + "epoch": 6.927316056002383, + "grad_norm": 0.3063221275806427, + "learning_rate": 4.131182924901217e-05, + "loss": 0.3789, + "num_input_tokens_seen": 26979464, + "step": 46510 + }, + { + "epoch": 6.928060768543342, + "grad_norm": 1.0822504758834839, + "learning_rate": 4.1309366661229286e-05, + "loss": 0.1066, + "num_input_tokens_seen": 26982376, + "step": 46515 + }, + { + "epoch": 6.928805481084302, + "grad_norm": 20.036043167114258, + "learning_rate": 4.130690379791623e-05, + "loss": 0.0176, + "num_input_tokens_seen": 26984968, + "step": 46520 + }, + { + "epoch": 6.929550193625261, + "grad_norm": 25.837873458862305, + "learning_rate": 4.1304440659114615e-05, + "loss": 0.4864, + "num_input_tokens_seen": 26987976, + "step": 46525 + }, + { + "epoch": 6.930294906166219, + "grad_norm": 13.788301467895508, + "learning_rate": 4.130197724486604e-05, + "loss": 0.384, + "num_input_tokens_seen": 26990984, + "step": 46530 + }, + { + "epoch": 6.931039618707179, + "grad_norm": 2.3940839767456055, + "learning_rate": 4.1299513555212135e-05, + "loss": 0.0099, + "num_input_tokens_seen": 26993608, + "step": 46535 + }, + { + "epoch": 6.931784331248139, + "grad_norm": 0.23827213048934937, + "learning_rate": 4.129704959019451e-05, + "loss": 0.2407, + "num_input_tokens_seen": 26996392, + "step": 46540 + }, + { + "epoch": 6.932529043789097, + "grad_norm": 0.047173574566841125, + "learning_rate": 4.129458534985479e-05, + "loss": 0.62, + "num_input_tokens_seen": 26999368, + "step": 46545 + }, + { + "epoch": 6.933273756330056, + "grad_norm": 16.66333770751953, + "learning_rate": 4.1292120834234624e-05, + "loss": 0.2488, + "num_input_tokens_seen": 27002216, + "step": 46550 + }, + { + "epoch": 6.934018468871016, + "grad_norm": 7.938971519470215, + "learning_rate": 4.128965604337563e-05, + "loss": 0.2165, + "num_input_tokens_seen": 27004936, + "step": 46555 + }, + { + "epoch": 6.9347631814119755, + "grad_norm": 15.629137992858887, + "learning_rate": 4.128719097731945e-05, + "loss": 0.0828, + "num_input_tokens_seen": 27007784, + "step": 46560 + }, + { + "epoch": 6.935507893952934, + "grad_norm": 45.39065933227539, + "learning_rate": 4.1284725636107726e-05, + "loss": 0.3615, + "num_input_tokens_seen": 27010696, + "step": 46565 + }, + { + "epoch": 6.936252606493893, + "grad_norm": 8.39726448059082, + "learning_rate": 4.128226001978213e-05, + "loss": 0.1853, + "num_input_tokens_seen": 27013608, + "step": 46570 + }, + { + "epoch": 6.936997319034853, + "grad_norm": 72.89219665527344, + "learning_rate": 4.127979412838428e-05, + "loss": 0.4613, + "num_input_tokens_seen": 27016264, + "step": 46575 + }, + { + "epoch": 6.937742031575811, + "grad_norm": 15.69894790649414, + "learning_rate": 4.127732796195587e-05, + "loss": 0.269, + "num_input_tokens_seen": 27019080, + "step": 46580 + }, + { + "epoch": 6.938486744116771, + "grad_norm": 3.2658276557922363, + "learning_rate": 4.127486152053854e-05, + "loss": 0.0151, + "num_input_tokens_seen": 27022024, + "step": 46585 + }, + { + "epoch": 6.93923145665773, + "grad_norm": 0.6391395330429077, + "learning_rate": 4.1272394804173966e-05, + "loss": 0.0741, + "num_input_tokens_seen": 27025000, + "step": 46590 + }, + { + "epoch": 6.939976169198689, + "grad_norm": 17.891864776611328, + "learning_rate": 4.126992781290382e-05, + "loss": 0.0581, + "num_input_tokens_seen": 27027720, + "step": 46595 + }, + { + "epoch": 6.940720881739648, + "grad_norm": 0.02643602341413498, + "learning_rate": 4.1267460546769784e-05, + "loss": 0.1884, + "num_input_tokens_seen": 27030888, + "step": 46600 + }, + { + "epoch": 6.941465594280608, + "grad_norm": 0.49205276370048523, + "learning_rate": 4.126499300581353e-05, + "loss": 0.0077, + "num_input_tokens_seen": 27033928, + "step": 46605 + }, + { + "epoch": 6.942210306821567, + "grad_norm": 0.06181507930159569, + "learning_rate": 4.1262525190076763e-05, + "loss": 0.1889, + "num_input_tokens_seen": 27036456, + "step": 46610 + }, + { + "epoch": 6.942955019362526, + "grad_norm": 39.742313385009766, + "learning_rate": 4.1260057099601145e-05, + "loss": 0.2291, + "num_input_tokens_seen": 27039624, + "step": 46615 + }, + { + "epoch": 6.943699731903485, + "grad_norm": 102.6181640625, + "learning_rate": 4.12575887344284e-05, + "loss": 0.1778, + "num_input_tokens_seen": 27042472, + "step": 46620 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 0.02247866429388523, + "learning_rate": 4.125512009460021e-05, + "loss": 0.0979, + "num_input_tokens_seen": 27045672, + "step": 46625 + }, + { + "epoch": 6.945189156985403, + "grad_norm": 0.09025146067142487, + "learning_rate": 4.125265118015829e-05, + "loss": 0.2609, + "num_input_tokens_seen": 27048776, + "step": 46630 + }, + { + "epoch": 6.945933869526363, + "grad_norm": 83.1128158569336, + "learning_rate": 4.125018199114434e-05, + "loss": 0.1243, + "num_input_tokens_seen": 27051816, + "step": 46635 + }, + { + "epoch": 6.946678582067322, + "grad_norm": 0.10807306319475174, + "learning_rate": 4.124771252760009e-05, + "loss": 0.2717, + "num_input_tokens_seen": 27054824, + "step": 46640 + }, + { + "epoch": 6.9474232946082815, + "grad_norm": 1.4701906442642212, + "learning_rate": 4.124524278956725e-05, + "loss": 0.0988, + "num_input_tokens_seen": 27057800, + "step": 46645 + }, + { + "epoch": 6.94816800714924, + "grad_norm": 16.145862579345703, + "learning_rate": 4.1242772777087536e-05, + "loss": 0.0144, + "num_input_tokens_seen": 27060520, + "step": 46650 + }, + { + "epoch": 6.9489127196902, + "grad_norm": 10.790995597839355, + "learning_rate": 4.124030249020269e-05, + "loss": 0.3901, + "num_input_tokens_seen": 27063144, + "step": 46655 + }, + { + "epoch": 6.949657432231159, + "grad_norm": 0.5220706462860107, + "learning_rate": 4.123783192895444e-05, + "loss": 0.1851, + "num_input_tokens_seen": 27066120, + "step": 46660 + }, + { + "epoch": 6.950402144772118, + "grad_norm": 0.014210587367415428, + "learning_rate": 4.1235361093384523e-05, + "loss": 0.0237, + "num_input_tokens_seen": 27068904, + "step": 46665 + }, + { + "epoch": 6.951146857313077, + "grad_norm": 27.365280151367188, + "learning_rate": 4.123288998353468e-05, + "loss": 0.3536, + "num_input_tokens_seen": 27071688, + "step": 46670 + }, + { + "epoch": 6.951891569854037, + "grad_norm": 0.0023320422042161226, + "learning_rate": 4.123041859944666e-05, + "loss": 0.1742, + "num_input_tokens_seen": 27074664, + "step": 46675 + }, + { + "epoch": 6.952636282394995, + "grad_norm": 115.8763198852539, + "learning_rate": 4.122794694116221e-05, + "loss": 0.2891, + "num_input_tokens_seen": 27077480, + "step": 46680 + }, + { + "epoch": 6.953380994935955, + "grad_norm": 16.485166549682617, + "learning_rate": 4.122547500872309e-05, + "loss": 0.3078, + "num_input_tokens_seen": 27080488, + "step": 46685 + }, + { + "epoch": 6.954125707476914, + "grad_norm": 34.191932678222656, + "learning_rate": 4.122300280217107e-05, + "loss": 0.3843, + "num_input_tokens_seen": 27083176, + "step": 46690 + }, + { + "epoch": 6.9548704200178735, + "grad_norm": 47.69921875, + "learning_rate": 4.1220530321547894e-05, + "loss": 0.5108, + "num_input_tokens_seen": 27085896, + "step": 46695 + }, + { + "epoch": 6.955615132558832, + "grad_norm": 0.14375928044319153, + "learning_rate": 4.121805756689535e-05, + "loss": 0.2175, + "num_input_tokens_seen": 27088936, + "step": 46700 + }, + { + "epoch": 6.956359845099792, + "grad_norm": 0.12225618958473206, + "learning_rate": 4.1215584538255206e-05, + "loss": 0.123, + "num_input_tokens_seen": 27091752, + "step": 46705 + }, + { + "epoch": 6.957104557640751, + "grad_norm": 34.9764289855957, + "learning_rate": 4.121311123566924e-05, + "loss": 0.3214, + "num_input_tokens_seen": 27095048, + "step": 46710 + }, + { + "epoch": 6.957849270181709, + "grad_norm": 0.06325870007276535, + "learning_rate": 4.121063765917924e-05, + "loss": 0.079, + "num_input_tokens_seen": 27097800, + "step": 46715 + }, + { + "epoch": 6.958593982722669, + "grad_norm": 37.443565368652344, + "learning_rate": 4.120816380882699e-05, + "loss": 0.1641, + "num_input_tokens_seen": 27100680, + "step": 46720 + }, + { + "epoch": 6.959338695263629, + "grad_norm": 36.55305480957031, + "learning_rate": 4.120568968465429e-05, + "loss": 0.178, + "num_input_tokens_seen": 27103400, + "step": 46725 + }, + { + "epoch": 6.9600834078045875, + "grad_norm": 14.727839469909668, + "learning_rate": 4.120321528670293e-05, + "loss": 0.1996, + "num_input_tokens_seen": 27106312, + "step": 46730 + }, + { + "epoch": 6.960828120345546, + "grad_norm": 0.5096633434295654, + "learning_rate": 4.120074061501472e-05, + "loss": 0.0782, + "num_input_tokens_seen": 27109160, + "step": 46735 + }, + { + "epoch": 6.961572832886506, + "grad_norm": 31.734912872314453, + "learning_rate": 4.1198265669631464e-05, + "loss": 0.2951, + "num_input_tokens_seen": 27112040, + "step": 46740 + }, + { + "epoch": 6.962317545427465, + "grad_norm": 60.08123016357422, + "learning_rate": 4.119579045059496e-05, + "loss": 0.4763, + "num_input_tokens_seen": 27115144, + "step": 46745 + }, + { + "epoch": 6.963062257968424, + "grad_norm": 1.6836870908737183, + "learning_rate": 4.119331495794705e-05, + "loss": 0.102, + "num_input_tokens_seen": 27118440, + "step": 46750 + }, + { + "epoch": 6.963806970509383, + "grad_norm": 35.486839294433594, + "learning_rate": 4.119083919172954e-05, + "loss": 0.344, + "num_input_tokens_seen": 27121224, + "step": 46755 + }, + { + "epoch": 6.964551683050343, + "grad_norm": 8.595817565917969, + "learning_rate": 4.118836315198425e-05, + "loss": 0.0984, + "num_input_tokens_seen": 27123880, + "step": 46760 + }, + { + "epoch": 6.965296395591301, + "grad_norm": 15.000800132751465, + "learning_rate": 4.118588683875303e-05, + "loss": 0.3014, + "num_input_tokens_seen": 27126728, + "step": 46765 + }, + { + "epoch": 6.966041108132261, + "grad_norm": 30.26706314086914, + "learning_rate": 4.11834102520777e-05, + "loss": 0.0632, + "num_input_tokens_seen": 27129832, + "step": 46770 + }, + { + "epoch": 6.96678582067322, + "grad_norm": 21.89060401916504, + "learning_rate": 4.118093339200009e-05, + "loss": 0.3341, + "num_input_tokens_seen": 27133320, + "step": 46775 + }, + { + "epoch": 6.9675305332141795, + "grad_norm": 124.61042785644531, + "learning_rate": 4.1178456258562064e-05, + "loss": 0.34, + "num_input_tokens_seen": 27137224, + "step": 46780 + }, + { + "epoch": 6.968275245755138, + "grad_norm": 39.04377365112305, + "learning_rate": 4.117597885180546e-05, + "loss": 0.0341, + "num_input_tokens_seen": 27140360, + "step": 46785 + }, + { + "epoch": 6.969019958296098, + "grad_norm": 0.1183716356754303, + "learning_rate": 4.117350117177214e-05, + "loss": 0.081, + "num_input_tokens_seen": 27143048, + "step": 46790 + }, + { + "epoch": 6.969764670837057, + "grad_norm": 0.008653845638036728, + "learning_rate": 4.1171023218503945e-05, + "loss": 0.0136, + "num_input_tokens_seen": 27145672, + "step": 46795 + }, + { + "epoch": 6.970509383378016, + "grad_norm": 1.910613775253296, + "learning_rate": 4.1168544992042756e-05, + "loss": 0.1789, + "num_input_tokens_seen": 27148584, + "step": 46800 + }, + { + "epoch": 6.971254095918975, + "grad_norm": 35.80718994140625, + "learning_rate": 4.116606649243043e-05, + "loss": 0.0738, + "num_input_tokens_seen": 27151400, + "step": 46805 + }, + { + "epoch": 6.971998808459935, + "grad_norm": 0.7274067997932434, + "learning_rate": 4.116358771970885e-05, + "loss": 0.1933, + "num_input_tokens_seen": 27154088, + "step": 46810 + }, + { + "epoch": 6.9727435210008935, + "grad_norm": 0.0687704086303711, + "learning_rate": 4.1161108673919874e-05, + "loss": 0.1383, + "num_input_tokens_seen": 27156968, + "step": 46815 + }, + { + "epoch": 6.973488233541853, + "grad_norm": 19.582670211791992, + "learning_rate": 4.115862935510539e-05, + "loss": 0.2761, + "num_input_tokens_seen": 27159912, + "step": 46820 + }, + { + "epoch": 6.974232946082812, + "grad_norm": 7.020047187805176, + "learning_rate": 4.1156149763307296e-05, + "loss": 0.0892, + "num_input_tokens_seen": 27162760, + "step": 46825 + }, + { + "epoch": 6.9749776586237715, + "grad_norm": 0.2671598792076111, + "learning_rate": 4.115366989856746e-05, + "loss": 0.0705, + "num_input_tokens_seen": 27166152, + "step": 46830 + }, + { + "epoch": 6.97572237116473, + "grad_norm": 13.649698257446289, + "learning_rate": 4.11511897609278e-05, + "loss": 0.276, + "num_input_tokens_seen": 27169000, + "step": 46835 + }, + { + "epoch": 6.97646708370569, + "grad_norm": 0.9110006093978882, + "learning_rate": 4.1148709350430194e-05, + "loss": 0.0819, + "num_input_tokens_seen": 27172104, + "step": 46840 + }, + { + "epoch": 6.977211796246649, + "grad_norm": 7.368323802947998, + "learning_rate": 4.114622866711657e-05, + "loss": 0.201, + "num_input_tokens_seen": 27175144, + "step": 46845 + }, + { + "epoch": 6.977956508787608, + "grad_norm": 36.198421478271484, + "learning_rate": 4.114374771102881e-05, + "loss": 0.3267, + "num_input_tokens_seen": 27177992, + "step": 46850 + }, + { + "epoch": 6.978701221328567, + "grad_norm": 5.033214569091797, + "learning_rate": 4.114126648220884e-05, + "loss": 0.4002, + "num_input_tokens_seen": 27180840, + "step": 46855 + }, + { + "epoch": 6.979445933869527, + "grad_norm": 0.7612327337265015, + "learning_rate": 4.1138784980698585e-05, + "loss": 0.5808, + "num_input_tokens_seen": 27183624, + "step": 46860 + }, + { + "epoch": 6.9801906464104855, + "grad_norm": 43.64994430541992, + "learning_rate": 4.113630320653996e-05, + "loss": 0.5526, + "num_input_tokens_seen": 27186408, + "step": 46865 + }, + { + "epoch": 6.980935358951445, + "grad_norm": 0.3370106518268585, + "learning_rate": 4.113382115977489e-05, + "loss": 0.125, + "num_input_tokens_seen": 27189352, + "step": 46870 + }, + { + "epoch": 6.981680071492404, + "grad_norm": 92.65375518798828, + "learning_rate": 4.113133884044531e-05, + "loss": 0.4333, + "num_input_tokens_seen": 27192136, + "step": 46875 + }, + { + "epoch": 6.982424784033363, + "grad_norm": 133.07749938964844, + "learning_rate": 4.112885624859316e-05, + "loss": 0.5523, + "num_input_tokens_seen": 27195304, + "step": 46880 + }, + { + "epoch": 6.983169496574322, + "grad_norm": 0.024714531376957893, + "learning_rate": 4.1126373384260365e-05, + "loss": 0.1109, + "num_input_tokens_seen": 27198344, + "step": 46885 + }, + { + "epoch": 6.983914209115282, + "grad_norm": 45.35355758666992, + "learning_rate": 4.112389024748889e-05, + "loss": 0.1111, + "num_input_tokens_seen": 27201096, + "step": 46890 + }, + { + "epoch": 6.984658921656241, + "grad_norm": 40.689056396484375, + "learning_rate": 4.112140683832068e-05, + "loss": 0.2937, + "num_input_tokens_seen": 27204136, + "step": 46895 + }, + { + "epoch": 6.9854036341971995, + "grad_norm": 0.6057962775230408, + "learning_rate": 4.1118923156797684e-05, + "loss": 0.12, + "num_input_tokens_seen": 27206792, + "step": 46900 + }, + { + "epoch": 6.986148346738159, + "grad_norm": 29.794858932495117, + "learning_rate": 4.111643920296185e-05, + "loss": 0.1404, + "num_input_tokens_seen": 27209512, + "step": 46905 + }, + { + "epoch": 6.986893059279119, + "grad_norm": 27.14517593383789, + "learning_rate": 4.1113954976855174e-05, + "loss": 0.1689, + "num_input_tokens_seen": 27212680, + "step": 46910 + }, + { + "epoch": 6.9876377718200775, + "grad_norm": 6.37407922744751, + "learning_rate": 4.11114704785196e-05, + "loss": 0.1945, + "num_input_tokens_seen": 27215432, + "step": 46915 + }, + { + "epoch": 6.988382484361036, + "grad_norm": 0.975013792514801, + "learning_rate": 4.1108985707997105e-05, + "loss": 0.3344, + "num_input_tokens_seen": 27218216, + "step": 46920 + }, + { + "epoch": 6.989127196901996, + "grad_norm": 1.0391170978546143, + "learning_rate": 4.110650066532967e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27221288, + "step": 46925 + }, + { + "epoch": 6.989871909442955, + "grad_norm": 0.6961522698402405, + "learning_rate": 4.1104015350559275e-05, + "loss": 0.1154, + "num_input_tokens_seen": 27224200, + "step": 46930 + }, + { + "epoch": 6.990616621983914, + "grad_norm": 0.9980616569519043, + "learning_rate": 4.110152976372791e-05, + "loss": 0.3288, + "num_input_tokens_seen": 27227400, + "step": 46935 + }, + { + "epoch": 6.991361334524873, + "grad_norm": 12.998634338378906, + "learning_rate": 4.1099043904877564e-05, + "loss": 0.3829, + "num_input_tokens_seen": 27230344, + "step": 46940 + }, + { + "epoch": 6.992106047065833, + "grad_norm": 20.949583053588867, + "learning_rate": 4.1096557774050235e-05, + "loss": 0.6706, + "num_input_tokens_seen": 27233384, + "step": 46945 + }, + { + "epoch": 6.9928507596067915, + "grad_norm": 25.016633987426758, + "learning_rate": 4.1094071371287925e-05, + "loss": 0.5888, + "num_input_tokens_seen": 27236296, + "step": 46950 + }, + { + "epoch": 6.993595472147751, + "grad_norm": 15.326269149780273, + "learning_rate": 4.109158469663263e-05, + "loss": 0.2004, + "num_input_tokens_seen": 27239048, + "step": 46955 + }, + { + "epoch": 6.99434018468871, + "grad_norm": 5.071754455566406, + "learning_rate": 4.108909775012637e-05, + "loss": 0.0503, + "num_input_tokens_seen": 27241768, + "step": 46960 + }, + { + "epoch": 6.9950848972296695, + "grad_norm": 16.99060821533203, + "learning_rate": 4.1086610531811155e-05, + "loss": 0.0861, + "num_input_tokens_seen": 27244680, + "step": 46965 + }, + { + "epoch": 6.995829609770628, + "grad_norm": 18.604162216186523, + "learning_rate": 4.108412304172901e-05, + "loss": 0.09, + "num_input_tokens_seen": 27247528, + "step": 46970 + }, + { + "epoch": 6.996574322311588, + "grad_norm": 1.392711877822876, + "learning_rate": 4.1081635279921945e-05, + "loss": 0.1154, + "num_input_tokens_seen": 27250600, + "step": 46975 + }, + { + "epoch": 6.997319034852547, + "grad_norm": 22.200448989868164, + "learning_rate": 4.107914724643199e-05, + "loss": 0.3425, + "num_input_tokens_seen": 27253544, + "step": 46980 + }, + { + "epoch": 6.998063747393506, + "grad_norm": 0.03244045749306679, + "learning_rate": 4.107665894130121e-05, + "loss": 0.0082, + "num_input_tokens_seen": 27256360, + "step": 46985 + }, + { + "epoch": 6.998808459934465, + "grad_norm": 7.0737810134887695, + "learning_rate": 4.107417036457159e-05, + "loss": 0.3619, + "num_input_tokens_seen": 27259592, + "step": 46990 + }, + { + "epoch": 6.999553172475425, + "grad_norm": 0.05649425834417343, + "learning_rate": 4.107168151628521e-05, + "loss": 0.1597, + "num_input_tokens_seen": 27262472, + "step": 46995 + }, + { + "epoch": 7.0, + "eval_loss": 1.323397159576416, + "eval_runtime": 51.2868, + "eval_samples_per_second": 58.183, + "eval_steps_per_second": 14.546, + "num_input_tokens_seen": 27263880, + "step": 46998 + }, + { + "epoch": 7.0002978850163835, + "grad_norm": 29.551734924316406, + "learning_rate": 4.10691923964841e-05, + "loss": 0.3582, + "num_input_tokens_seen": 27265128, + "step": 47000 + }, + { + "epoch": 7.001042597557343, + "grad_norm": 2.5166709423065186, + "learning_rate": 4.106670300521033e-05, + "loss": 0.0472, + "num_input_tokens_seen": 27267944, + "step": 47005 + }, + { + "epoch": 7.001787310098302, + "grad_norm": 1.3690448999404907, + "learning_rate": 4.106421334250593e-05, + "loss": 0.1326, + "num_input_tokens_seen": 27270632, + "step": 47010 + }, + { + "epoch": 7.0025320226392616, + "grad_norm": 0.2741718292236328, + "learning_rate": 4.106172340841298e-05, + "loss": 0.0625, + "num_input_tokens_seen": 27273832, + "step": 47015 + }, + { + "epoch": 7.00327673518022, + "grad_norm": 0.025532154366374016, + "learning_rate": 4.105923320297353e-05, + "loss": 0.0845, + "num_input_tokens_seen": 27276616, + "step": 47020 + }, + { + "epoch": 7.00402144772118, + "grad_norm": 0.01766917295753956, + "learning_rate": 4.1056742726229655e-05, + "loss": 0.0826, + "num_input_tokens_seen": 27279464, + "step": 47025 + }, + { + "epoch": 7.004766160262139, + "grad_norm": 113.88995361328125, + "learning_rate": 4.105425197822344e-05, + "loss": 0.3838, + "num_input_tokens_seen": 27282344, + "step": 47030 + }, + { + "epoch": 7.005510872803098, + "grad_norm": 0.04989129677414894, + "learning_rate": 4.105176095899696e-05, + "loss": 0.0007, + "num_input_tokens_seen": 27285352, + "step": 47035 + }, + { + "epoch": 7.006255585344057, + "grad_norm": 0.10176362097263336, + "learning_rate": 4.104926966859227e-05, + "loss": 0.195, + "num_input_tokens_seen": 27288392, + "step": 47040 + }, + { + "epoch": 7.007000297885017, + "grad_norm": 0.7963666319847107, + "learning_rate": 4.1046778107051495e-05, + "loss": 0.314, + "num_input_tokens_seen": 27291144, + "step": 47045 + }, + { + "epoch": 7.0077450104259755, + "grad_norm": 32.83477020263672, + "learning_rate": 4.104428627441672e-05, + "loss": 0.1821, + "num_input_tokens_seen": 27294024, + "step": 47050 + }, + { + "epoch": 7.008489722966935, + "grad_norm": 32.3482666015625, + "learning_rate": 4.104179417073002e-05, + "loss": 0.1974, + "num_input_tokens_seen": 27296808, + "step": 47055 + }, + { + "epoch": 7.009234435507894, + "grad_norm": 0.033086929470300674, + "learning_rate": 4.103930179603352e-05, + "loss": 0.0746, + "num_input_tokens_seen": 27300264, + "step": 47060 + }, + { + "epoch": 7.009979148048854, + "grad_norm": 18.636150360107422, + "learning_rate": 4.103680915036932e-05, + "loss": 0.1178, + "num_input_tokens_seen": 27303432, + "step": 47065 + }, + { + "epoch": 7.010723860589812, + "grad_norm": 0.2710210680961609, + "learning_rate": 4.1034316233779526e-05, + "loss": 0.1462, + "num_input_tokens_seen": 27306408, + "step": 47070 + }, + { + "epoch": 7.011468573130771, + "grad_norm": 0.047357093542814255, + "learning_rate": 4.103182304630625e-05, + "loss": 0.3322, + "num_input_tokens_seen": 27309096, + "step": 47075 + }, + { + "epoch": 7.012213285671731, + "grad_norm": 0.14321289956569672, + "learning_rate": 4.102932958799163e-05, + "loss": 0.1231, + "num_input_tokens_seen": 27311880, + "step": 47080 + }, + { + "epoch": 7.0129579982126895, + "grad_norm": 65.6889419555664, + "learning_rate": 4.102683585887777e-05, + "loss": 0.214, + "num_input_tokens_seen": 27314440, + "step": 47085 + }, + { + "epoch": 7.013702710753649, + "grad_norm": 0.10924582183361053, + "learning_rate": 4.102434185900681e-05, + "loss": 0.2889, + "num_input_tokens_seen": 27317192, + "step": 47090 + }, + { + "epoch": 7.014447423294608, + "grad_norm": 1.5912264585494995, + "learning_rate": 4.1021847588420876e-05, + "loss": 0.1926, + "num_input_tokens_seen": 27320264, + "step": 47095 + }, + { + "epoch": 7.0151921358355676, + "grad_norm": 0.07004391402006149, + "learning_rate": 4.101935304716211e-05, + "loss": 0.205, + "num_input_tokens_seen": 27322952, + "step": 47100 + }, + { + "epoch": 7.015936848376526, + "grad_norm": 57.49339294433594, + "learning_rate": 4.101685823527266e-05, + "loss": 0.2155, + "num_input_tokens_seen": 27325672, + "step": 47105 + }, + { + "epoch": 7.016681560917486, + "grad_norm": 14.991350173950195, + "learning_rate": 4.1014363152794664e-05, + "loss": 0.0975, + "num_input_tokens_seen": 27328616, + "step": 47110 + }, + { + "epoch": 7.017426273458445, + "grad_norm": 0.06501106172800064, + "learning_rate": 4.101186779977029e-05, + "loss": 0.1527, + "num_input_tokens_seen": 27331656, + "step": 47115 + }, + { + "epoch": 7.018170985999404, + "grad_norm": 8.184375762939453, + "learning_rate": 4.1009372176241675e-05, + "loss": 0.0838, + "num_input_tokens_seen": 27334600, + "step": 47120 + }, + { + "epoch": 7.018915698540363, + "grad_norm": 14.603534698486328, + "learning_rate": 4.100687628225099e-05, + "loss": 0.23, + "num_input_tokens_seen": 27337384, + "step": 47125 + }, + { + "epoch": 7.019660411081323, + "grad_norm": 0.9689487218856812, + "learning_rate": 4.10043801178404e-05, + "loss": 0.2031, + "num_input_tokens_seen": 27340328, + "step": 47130 + }, + { + "epoch": 7.0204051236222815, + "grad_norm": 0.06935387849807739, + "learning_rate": 4.100188368305207e-05, + "loss": 0.0078, + "num_input_tokens_seen": 27343208, + "step": 47135 + }, + { + "epoch": 7.021149836163241, + "grad_norm": 23.887067794799805, + "learning_rate": 4.099938697792818e-05, + "loss": 0.2054, + "num_input_tokens_seen": 27345992, + "step": 47140 + }, + { + "epoch": 7.0218945487042, + "grad_norm": 0.004680742044001818, + "learning_rate": 4.099689000251091e-05, + "loss": 0.0631, + "num_input_tokens_seen": 27348680, + "step": 47145 + }, + { + "epoch": 7.02263926124516, + "grad_norm": 0.04116229712963104, + "learning_rate": 4.0994392756842444e-05, + "loss": 0.2529, + "num_input_tokens_seen": 27351560, + "step": 47150 + }, + { + "epoch": 7.023383973786118, + "grad_norm": 95.57456970214844, + "learning_rate": 4.099189524096496e-05, + "loss": 0.1908, + "num_input_tokens_seen": 27354824, + "step": 47155 + }, + { + "epoch": 7.024128686327078, + "grad_norm": 0.1303243488073349, + "learning_rate": 4.098939745492066e-05, + "loss": 0.0011, + "num_input_tokens_seen": 27357704, + "step": 47160 + }, + { + "epoch": 7.024873398868037, + "grad_norm": 1.2413945198059082, + "learning_rate": 4.0986899398751754e-05, + "loss": 0.248, + "num_input_tokens_seen": 27360616, + "step": 47165 + }, + { + "epoch": 7.025618111408996, + "grad_norm": 12.322970390319824, + "learning_rate": 4.098440107250042e-05, + "loss": 0.022, + "num_input_tokens_seen": 27363688, + "step": 47170 + }, + { + "epoch": 7.026362823949955, + "grad_norm": 25.53248405456543, + "learning_rate": 4.098190247620888e-05, + "loss": 0.0977, + "num_input_tokens_seen": 27366504, + "step": 47175 + }, + { + "epoch": 7.027107536490915, + "grad_norm": 0.07211656123399734, + "learning_rate": 4.097940360991934e-05, + "loss": 0.0781, + "num_input_tokens_seen": 27369128, + "step": 47180 + }, + { + "epoch": 7.0278522490318736, + "grad_norm": 28.649917602539062, + "learning_rate": 4.097690447367402e-05, + "loss": 0.2249, + "num_input_tokens_seen": 27372072, + "step": 47185 + }, + { + "epoch": 7.028596961572833, + "grad_norm": 63.243595123291016, + "learning_rate": 4.097440506751513e-05, + "loss": 0.0517, + "num_input_tokens_seen": 27374984, + "step": 47190 + }, + { + "epoch": 7.029341674113792, + "grad_norm": 0.02129884622991085, + "learning_rate": 4.097190539148491e-05, + "loss": 0.1992, + "num_input_tokens_seen": 27378152, + "step": 47195 + }, + { + "epoch": 7.030086386654752, + "grad_norm": 71.8897705078125, + "learning_rate": 4.096940544562557e-05, + "loss": 0.0526, + "num_input_tokens_seen": 27382760, + "step": 47200 + }, + { + "epoch": 7.03083109919571, + "grad_norm": 0.004462086129933596, + "learning_rate": 4.096690522997936e-05, + "loss": 0.0335, + "num_input_tokens_seen": 27385672, + "step": 47205 + }, + { + "epoch": 7.03157581173667, + "grad_norm": 3.993476390838623, + "learning_rate": 4.096440474458852e-05, + "loss": 0.0039, + "num_input_tokens_seen": 27388520, + "step": 47210 + }, + { + "epoch": 7.032320524277629, + "grad_norm": 144.07498168945312, + "learning_rate": 4.096190398949529e-05, + "loss": 0.4026, + "num_input_tokens_seen": 27391368, + "step": 47215 + }, + { + "epoch": 7.033065236818588, + "grad_norm": 122.8138656616211, + "learning_rate": 4.09594029647419e-05, + "loss": 0.2602, + "num_input_tokens_seen": 27394248, + "step": 47220 + }, + { + "epoch": 7.033809949359547, + "grad_norm": 0.6562849283218384, + "learning_rate": 4.095690167037063e-05, + "loss": 0.0018, + "num_input_tokens_seen": 27397480, + "step": 47225 + }, + { + "epoch": 7.034554661900507, + "grad_norm": 0.05242868885397911, + "learning_rate": 4.095440010642372e-05, + "loss": 0.1341, + "num_input_tokens_seen": 27400360, + "step": 47230 + }, + { + "epoch": 7.035299374441466, + "grad_norm": 0.006865806877613068, + "learning_rate": 4.0951898272943436e-05, + "loss": 0.0102, + "num_input_tokens_seen": 27403240, + "step": 47235 + }, + { + "epoch": 7.036044086982425, + "grad_norm": 0.10502069443464279, + "learning_rate": 4.094939616997204e-05, + "loss": 0.1842, + "num_input_tokens_seen": 27406344, + "step": 47240 + }, + { + "epoch": 7.036788799523384, + "grad_norm": 65.12823486328125, + "learning_rate": 4.094689379755181e-05, + "loss": 0.2545, + "num_input_tokens_seen": 27409224, + "step": 47245 + }, + { + "epoch": 7.037533512064343, + "grad_norm": 26.933591842651367, + "learning_rate": 4.094439115572502e-05, + "loss": 0.1245, + "num_input_tokens_seen": 27412008, + "step": 47250 + }, + { + "epoch": 7.038278224605302, + "grad_norm": 0.021582825109362602, + "learning_rate": 4.094188824453394e-05, + "loss": 0.1968, + "num_input_tokens_seen": 27414856, + "step": 47255 + }, + { + "epoch": 7.039022937146261, + "grad_norm": 0.00852080900222063, + "learning_rate": 4.0939385064020866e-05, + "loss": 0.3616, + "num_input_tokens_seen": 27417800, + "step": 47260 + }, + { + "epoch": 7.039767649687221, + "grad_norm": 12.924233436584473, + "learning_rate": 4.093688161422808e-05, + "loss": 0.112, + "num_input_tokens_seen": 27420712, + "step": 47265 + }, + { + "epoch": 7.0405123622281796, + "grad_norm": 54.49153518676758, + "learning_rate": 4.093437789519787e-05, + "loss": 0.0466, + "num_input_tokens_seen": 27423688, + "step": 47270 + }, + { + "epoch": 7.041257074769139, + "grad_norm": 13.783525466918945, + "learning_rate": 4.093187390697255e-05, + "loss": 0.4552, + "num_input_tokens_seen": 27426952, + "step": 47275 + }, + { + "epoch": 7.042001787310098, + "grad_norm": 5.175258159637451, + "learning_rate": 4.0929369649594416e-05, + "loss": 0.1091, + "num_input_tokens_seen": 27429544, + "step": 47280 + }, + { + "epoch": 7.042746499851058, + "grad_norm": 0.1524088978767395, + "learning_rate": 4.092686512310576e-05, + "loss": 0.132, + "num_input_tokens_seen": 27432456, + "step": 47285 + }, + { + "epoch": 7.043491212392016, + "grad_norm": 1.1797083616256714, + "learning_rate": 4.09243603275489e-05, + "loss": 0.0017, + "num_input_tokens_seen": 27435336, + "step": 47290 + }, + { + "epoch": 7.044235924932976, + "grad_norm": 0.02781287208199501, + "learning_rate": 4.092185526296618e-05, + "loss": 0.1328, + "num_input_tokens_seen": 27438760, + "step": 47295 + }, + { + "epoch": 7.044980637473935, + "grad_norm": 0.22878916561603546, + "learning_rate": 4.091934992939989e-05, + "loss": 0.1468, + "num_input_tokens_seen": 27441608, + "step": 47300 + }, + { + "epoch": 7.045725350014894, + "grad_norm": 0.0065803686156868935, + "learning_rate": 4.0916844326892344e-05, + "loss": 0.0998, + "num_input_tokens_seen": 27444168, + "step": 47305 + }, + { + "epoch": 7.046470062555853, + "grad_norm": 154.4503173828125, + "learning_rate": 4.091433845548591e-05, + "loss": 0.3978, + "num_input_tokens_seen": 27447080, + "step": 47310 + }, + { + "epoch": 7.047214775096813, + "grad_norm": 19.200885772705078, + "learning_rate": 4.0911832315222896e-05, + "loss": 0.2301, + "num_input_tokens_seen": 27450248, + "step": 47315 + }, + { + "epoch": 7.047959487637772, + "grad_norm": 41.512760162353516, + "learning_rate": 4.090932590614565e-05, + "loss": 0.4202, + "num_input_tokens_seen": 27453512, + "step": 47320 + }, + { + "epoch": 7.048704200178731, + "grad_norm": 13.440164566040039, + "learning_rate": 4.09068192282965e-05, + "loss": 0.2611, + "num_input_tokens_seen": 27456904, + "step": 47325 + }, + { + "epoch": 7.04944891271969, + "grad_norm": 0.010292630642652512, + "learning_rate": 4.090431228171782e-05, + "loss": 0.2827, + "num_input_tokens_seen": 27459624, + "step": 47330 + }, + { + "epoch": 7.05019362526065, + "grad_norm": 0.7360701560974121, + "learning_rate": 4.0901805066451946e-05, + "loss": 0.1265, + "num_input_tokens_seen": 27462472, + "step": 47335 + }, + { + "epoch": 7.050938337801608, + "grad_norm": 76.5500259399414, + "learning_rate": 4.089929758254123e-05, + "loss": 0.2092, + "num_input_tokens_seen": 27465224, + "step": 47340 + }, + { + "epoch": 7.051683050342568, + "grad_norm": 125.8465576171875, + "learning_rate": 4.089678983002805e-05, + "loss": 0.0443, + "num_input_tokens_seen": 27468008, + "step": 47345 + }, + { + "epoch": 7.052427762883527, + "grad_norm": 16.077354431152344, + "learning_rate": 4.089428180895476e-05, + "loss": 0.1824, + "num_input_tokens_seen": 27470824, + "step": 47350 + }, + { + "epoch": 7.053172475424486, + "grad_norm": 0.01824994944036007, + "learning_rate": 4.089177351936373e-05, + "loss": 0.2601, + "num_input_tokens_seen": 27473800, + "step": 47355 + }, + { + "epoch": 7.053917187965445, + "grad_norm": 0.025390632450580597, + "learning_rate": 4.0889264961297336e-05, + "loss": 0.0589, + "num_input_tokens_seen": 27477000, + "step": 47360 + }, + { + "epoch": 7.054661900506405, + "grad_norm": 18.227815628051758, + "learning_rate": 4.0886756134797964e-05, + "loss": 0.2057, + "num_input_tokens_seen": 27479688, + "step": 47365 + }, + { + "epoch": 7.055406613047364, + "grad_norm": 0.014269270934164524, + "learning_rate": 4.0884247039907984e-05, + "loss": 0.0008, + "num_input_tokens_seen": 27482632, + "step": 47370 + }, + { + "epoch": 7.056151325588323, + "grad_norm": 0.6687119603157043, + "learning_rate": 4.0881737676669813e-05, + "loss": 0.0079, + "num_input_tokens_seen": 27485512, + "step": 47375 + }, + { + "epoch": 7.056896038129282, + "grad_norm": 32.54817199707031, + "learning_rate": 4.087922804512582e-05, + "loss": 0.0574, + "num_input_tokens_seen": 27488360, + "step": 47380 + }, + { + "epoch": 7.057640750670242, + "grad_norm": 18.65304183959961, + "learning_rate": 4.087671814531839e-05, + "loss": 0.0292, + "num_input_tokens_seen": 27491656, + "step": 47385 + }, + { + "epoch": 7.0583854632112, + "grad_norm": 32.84172439575195, + "learning_rate": 4.087420797728996e-05, + "loss": 0.283, + "num_input_tokens_seen": 27494728, + "step": 47390 + }, + { + "epoch": 7.05913017575216, + "grad_norm": 0.031376905739307404, + "learning_rate": 4.087169754108292e-05, + "loss": 0.0008, + "num_input_tokens_seen": 27497768, + "step": 47395 + }, + { + "epoch": 7.059874888293119, + "grad_norm": 0.007220784202218056, + "learning_rate": 4.0869186836739674e-05, + "loss": 0.0135, + "num_input_tokens_seen": 27500648, + "step": 47400 + }, + { + "epoch": 7.0606196008340785, + "grad_norm": 46.560367584228516, + "learning_rate": 4.086667586430265e-05, + "loss": 0.2579, + "num_input_tokens_seen": 27503624, + "step": 47405 + }, + { + "epoch": 7.061364313375037, + "grad_norm": 4.3777289390563965, + "learning_rate": 4.086416462381426e-05, + "loss": 0.0142, + "num_input_tokens_seen": 27506504, + "step": 47410 + }, + { + "epoch": 7.062109025915996, + "grad_norm": 0.5235101580619812, + "learning_rate": 4.086165311531694e-05, + "loss": 0.298, + "num_input_tokens_seen": 27509224, + "step": 47415 + }, + { + "epoch": 7.062853738456956, + "grad_norm": 12.068378448486328, + "learning_rate": 4.085914133885311e-05, + "loss": 0.0866, + "num_input_tokens_seen": 27512040, + "step": 47420 + }, + { + "epoch": 7.063598450997914, + "grad_norm": 0.28944990038871765, + "learning_rate": 4.08566292944652e-05, + "loss": 0.0045, + "num_input_tokens_seen": 27515272, + "step": 47425 + }, + { + "epoch": 7.064343163538874, + "grad_norm": 0.0076327575370669365, + "learning_rate": 4.085411698219566e-05, + "loss": 0.0536, + "num_input_tokens_seen": 27518376, + "step": 47430 + }, + { + "epoch": 7.065087876079833, + "grad_norm": 9.561519622802734, + "learning_rate": 4.085160440208692e-05, + "loss": 0.0115, + "num_input_tokens_seen": 27521192, + "step": 47435 + }, + { + "epoch": 7.065832588620792, + "grad_norm": 0.015947677195072174, + "learning_rate": 4.084909155418143e-05, + "loss": 0.1093, + "num_input_tokens_seen": 27524104, + "step": 47440 + }, + { + "epoch": 7.066577301161751, + "grad_norm": 0.4742138981819153, + "learning_rate": 4.084657843852166e-05, + "loss": 0.0954, + "num_input_tokens_seen": 27526984, + "step": 47445 + }, + { + "epoch": 7.067322013702711, + "grad_norm": 0.03988231346011162, + "learning_rate": 4.0844065055150046e-05, + "loss": 0.1846, + "num_input_tokens_seen": 27529960, + "step": 47450 + }, + { + "epoch": 7.06806672624367, + "grad_norm": 19.18065071105957, + "learning_rate": 4.0841551404109056e-05, + "loss": 0.0064, + "num_input_tokens_seen": 27533032, + "step": 47455 + }, + { + "epoch": 7.068811438784629, + "grad_norm": 0.19325228035449982, + "learning_rate": 4.083903748544116e-05, + "loss": 0.0132, + "num_input_tokens_seen": 27535848, + "step": 47460 + }, + { + "epoch": 7.069556151325588, + "grad_norm": 108.17124938964844, + "learning_rate": 4.0836523299188826e-05, + "loss": 0.0639, + "num_input_tokens_seen": 27538760, + "step": 47465 + }, + { + "epoch": 7.070300863866548, + "grad_norm": 0.042579520493745804, + "learning_rate": 4.083400884539452e-05, + "loss": 0.1909, + "num_input_tokens_seen": 27541544, + "step": 47470 + }, + { + "epoch": 7.071045576407506, + "grad_norm": 18.4677677154541, + "learning_rate": 4.083149412410072e-05, + "loss": 0.5091, + "num_input_tokens_seen": 27544616, + "step": 47475 + }, + { + "epoch": 7.071790288948466, + "grad_norm": 11.05893325805664, + "learning_rate": 4.082897913534993e-05, + "loss": 0.0319, + "num_input_tokens_seen": 27547432, + "step": 47480 + }, + { + "epoch": 7.072535001489425, + "grad_norm": 0.985521674156189, + "learning_rate": 4.0826463879184615e-05, + "loss": 0.1157, + "num_input_tokens_seen": 27550216, + "step": 47485 + }, + { + "epoch": 7.0732797140303845, + "grad_norm": 0.17268680036067963, + "learning_rate": 4.082394835564729e-05, + "loss": 0.1757, + "num_input_tokens_seen": 27553544, + "step": 47490 + }, + { + "epoch": 7.074024426571343, + "grad_norm": 0.036665577441453934, + "learning_rate": 4.082143256478044e-05, + "loss": 0.0017, + "num_input_tokens_seen": 27556168, + "step": 47495 + }, + { + "epoch": 7.074769139112303, + "grad_norm": 0.00805983878672123, + "learning_rate": 4.081891650662656e-05, + "loss": 0.0497, + "num_input_tokens_seen": 27559272, + "step": 47500 + }, + { + "epoch": 7.075513851653262, + "grad_norm": 0.008248048834502697, + "learning_rate": 4.0816400181228165e-05, + "loss": 0.1382, + "num_input_tokens_seen": 27562024, + "step": 47505 + }, + { + "epoch": 7.076258564194221, + "grad_norm": 0.04483509436249733, + "learning_rate": 4.081388358862776e-05, + "loss": 0.2057, + "num_input_tokens_seen": 27564904, + "step": 47510 + }, + { + "epoch": 7.07700327673518, + "grad_norm": 0.021829620003700256, + "learning_rate": 4.0811366728867874e-05, + "loss": 0.1566, + "num_input_tokens_seen": 27567688, + "step": 47515 + }, + { + "epoch": 7.07774798927614, + "grad_norm": 14.745342254638672, + "learning_rate": 4.080884960199101e-05, + "loss": 0.2757, + "num_input_tokens_seen": 27570664, + "step": 47520 + }, + { + "epoch": 7.078492701817098, + "grad_norm": 0.34985870122909546, + "learning_rate": 4.08063322080397e-05, + "loss": 0.1603, + "num_input_tokens_seen": 27573864, + "step": 47525 + }, + { + "epoch": 7.079237414358058, + "grad_norm": 0.044973477721214294, + "learning_rate": 4.080381454705647e-05, + "loss": 0.0931, + "num_input_tokens_seen": 27576680, + "step": 47530 + }, + { + "epoch": 7.079982126899017, + "grad_norm": 0.06396611779928207, + "learning_rate": 4.080129661908386e-05, + "loss": 0.1229, + "num_input_tokens_seen": 27579688, + "step": 47535 + }, + { + "epoch": 7.0807268394399765, + "grad_norm": 23.08064079284668, + "learning_rate": 4.07987784241644e-05, + "loss": 0.5301, + "num_input_tokens_seen": 27582472, + "step": 47540 + }, + { + "epoch": 7.081471551980935, + "grad_norm": 0.038064006716012955, + "learning_rate": 4.0796259962340636e-05, + "loss": 0.0207, + "num_input_tokens_seen": 27585160, + "step": 47545 + }, + { + "epoch": 7.082216264521895, + "grad_norm": 0.031680770218372345, + "learning_rate": 4.079374123365512e-05, + "loss": 0.136, + "num_input_tokens_seen": 27588072, + "step": 47550 + }, + { + "epoch": 7.082960977062854, + "grad_norm": 14.129857063293457, + "learning_rate": 4.079122223815039e-05, + "loss": 0.3514, + "num_input_tokens_seen": 27590952, + "step": 47555 + }, + { + "epoch": 7.083705689603813, + "grad_norm": 39.96210861206055, + "learning_rate": 4.0788702975869013e-05, + "loss": 0.2709, + "num_input_tokens_seen": 27594120, + "step": 47560 + }, + { + "epoch": 7.084450402144772, + "grad_norm": 0.20311430096626282, + "learning_rate": 4.0786183446853545e-05, + "loss": 0.1276, + "num_input_tokens_seen": 27597128, + "step": 47565 + }, + { + "epoch": 7.085195114685732, + "grad_norm": 0.025911057367920876, + "learning_rate": 4.0783663651146555e-05, + "loss": 0.145, + "num_input_tokens_seen": 27600264, + "step": 47570 + }, + { + "epoch": 7.0859398272266905, + "grad_norm": 0.12296182662248611, + "learning_rate": 4.078114358879061e-05, + "loss": 0.1546, + "num_input_tokens_seen": 27603464, + "step": 47575 + }, + { + "epoch": 7.08668453976765, + "grad_norm": 20.0693416595459, + "learning_rate": 4.077862325982828e-05, + "loss": 0.2077, + "num_input_tokens_seen": 27606152, + "step": 47580 + }, + { + "epoch": 7.087429252308609, + "grad_norm": 0.23862101137638092, + "learning_rate": 4.077610266430215e-05, + "loss": 0.0023, + "num_input_tokens_seen": 27608776, + "step": 47585 + }, + { + "epoch": 7.088173964849568, + "grad_norm": 0.040631070733070374, + "learning_rate": 4.0773581802254795e-05, + "loss": 0.0098, + "num_input_tokens_seen": 27611496, + "step": 47590 + }, + { + "epoch": 7.088918677390527, + "grad_norm": 32.69941329956055, + "learning_rate": 4.077106067372881e-05, + "loss": 0.3715, + "num_input_tokens_seen": 27614472, + "step": 47595 + }, + { + "epoch": 7.089663389931486, + "grad_norm": 0.14726299047470093, + "learning_rate": 4.0768539278766784e-05, + "loss": 0.0178, + "num_input_tokens_seen": 27617448, + "step": 47600 + }, + { + "epoch": 7.090408102472446, + "grad_norm": 187.2246551513672, + "learning_rate": 4.076601761741131e-05, + "loss": 0.6103, + "num_input_tokens_seen": 27620456, + "step": 47605 + }, + { + "epoch": 7.091152815013404, + "grad_norm": 1.0720465183258057, + "learning_rate": 4.0763495689705004e-05, + "loss": 0.0053, + "num_input_tokens_seen": 27623368, + "step": 47610 + }, + { + "epoch": 7.091897527554364, + "grad_norm": 0.5550590753555298, + "learning_rate": 4.076097349569044e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27626184, + "step": 47615 + }, + { + "epoch": 7.092642240095323, + "grad_norm": 123.6817398071289, + "learning_rate": 4.075845103541026e-05, + "loss": 0.2733, + "num_input_tokens_seen": 27629064, + "step": 47620 + }, + { + "epoch": 7.0933869526362825, + "grad_norm": 0.011682127602398396, + "learning_rate": 4.0755928308907065e-05, + "loss": 0.0855, + "num_input_tokens_seen": 27631752, + "step": 47625 + }, + { + "epoch": 7.094131665177241, + "grad_norm": 0.06535408645868301, + "learning_rate": 4.0753405316223476e-05, + "loss": 0.1276, + "num_input_tokens_seen": 27634664, + "step": 47630 + }, + { + "epoch": 7.094876377718201, + "grad_norm": 24.68415069580078, + "learning_rate": 4.0750882057402116e-05, + "loss": 0.1443, + "num_input_tokens_seen": 27637640, + "step": 47635 + }, + { + "epoch": 7.09562109025916, + "grad_norm": 14.38442611694336, + "learning_rate": 4.074835853248561e-05, + "loss": 0.1457, + "num_input_tokens_seen": 27640488, + "step": 47640 + }, + { + "epoch": 7.096365802800119, + "grad_norm": 125.35257720947266, + "learning_rate": 4.074583474151659e-05, + "loss": 0.2877, + "num_input_tokens_seen": 27643368, + "step": 47645 + }, + { + "epoch": 7.097110515341078, + "grad_norm": 12.407931327819824, + "learning_rate": 4.074331068453769e-05, + "loss": 0.2026, + "num_input_tokens_seen": 27646472, + "step": 47650 + }, + { + "epoch": 7.097855227882038, + "grad_norm": 50.046417236328125, + "learning_rate": 4.0740786361591565e-05, + "loss": 0.1963, + "num_input_tokens_seen": 27649288, + "step": 47655 + }, + { + "epoch": 7.0985999404229965, + "grad_norm": 0.006548416335135698, + "learning_rate": 4.073826177272085e-05, + "loss": 0.0677, + "num_input_tokens_seen": 27652360, + "step": 47660 + }, + { + "epoch": 7.099344652963956, + "grad_norm": 0.03942638263106346, + "learning_rate": 4.0735736917968205e-05, + "loss": 0.0761, + "num_input_tokens_seen": 27655304, + "step": 47665 + }, + { + "epoch": 7.100089365504915, + "grad_norm": 0.5891758799552917, + "learning_rate": 4.073321179737627e-05, + "loss": 0.0041, + "num_input_tokens_seen": 27658184, + "step": 47670 + }, + { + "epoch": 7.1008340780458745, + "grad_norm": 0.46899375319480896, + "learning_rate": 4.073068641098772e-05, + "loss": 0.443, + "num_input_tokens_seen": 27661064, + "step": 47675 + }, + { + "epoch": 7.101578790586833, + "grad_norm": 0.08594334870576859, + "learning_rate": 4.07281607588452e-05, + "loss": 0.1862, + "num_input_tokens_seen": 27663816, + "step": 47680 + }, + { + "epoch": 7.102323503127793, + "grad_norm": 4.320290565490723, + "learning_rate": 4.07256348409914e-05, + "loss": 0.1275, + "num_input_tokens_seen": 27666760, + "step": 47685 + }, + { + "epoch": 7.103068215668752, + "grad_norm": 48.974342346191406, + "learning_rate": 4.072310865746898e-05, + "loss": 0.3249, + "num_input_tokens_seen": 27669576, + "step": 47690 + }, + { + "epoch": 7.103812928209711, + "grad_norm": 29.72541046142578, + "learning_rate": 4.072058220832061e-05, + "loss": 0.214, + "num_input_tokens_seen": 27672360, + "step": 47695 + }, + { + "epoch": 7.10455764075067, + "grad_norm": 0.12180253863334656, + "learning_rate": 4.071805549358899e-05, + "loss": 0.0073, + "num_input_tokens_seen": 27675048, + "step": 47700 + }, + { + "epoch": 7.10530235329163, + "grad_norm": 0.00523665314540267, + "learning_rate": 4.0715528513316796e-05, + "loss": 0.2073, + "num_input_tokens_seen": 27678024, + "step": 47705 + }, + { + "epoch": 7.1060470658325885, + "grad_norm": 0.04081132635474205, + "learning_rate": 4.0713001267546724e-05, + "loss": 0.2411, + "num_input_tokens_seen": 27681192, + "step": 47710 + }, + { + "epoch": 7.106791778373548, + "grad_norm": 1.0836416482925415, + "learning_rate": 4.0710473756321453e-05, + "loss": 0.0778, + "num_input_tokens_seen": 27684072, + "step": 47715 + }, + { + "epoch": 7.107536490914507, + "grad_norm": 0.04374474287033081, + "learning_rate": 4.07079459796837e-05, + "loss": 0.2939, + "num_input_tokens_seen": 27687080, + "step": 47720 + }, + { + "epoch": 7.1082812034554665, + "grad_norm": 42.10198211669922, + "learning_rate": 4.070541793767618e-05, + "loss": 0.3848, + "num_input_tokens_seen": 27689800, + "step": 47725 + }, + { + "epoch": 7.109025915996425, + "grad_norm": 0.10417788475751877, + "learning_rate": 4.0702889630341566e-05, + "loss": 0.0021, + "num_input_tokens_seen": 27692776, + "step": 47730 + }, + { + "epoch": 7.109770628537385, + "grad_norm": 2.4791154861450195, + "learning_rate": 4.07003610577226e-05, + "loss": 0.0373, + "num_input_tokens_seen": 27695592, + "step": 47735 + }, + { + "epoch": 7.110515341078344, + "grad_norm": 2.503801107406616, + "learning_rate": 4.0697832219862e-05, + "loss": 0.0017, + "num_input_tokens_seen": 27698312, + "step": 47740 + }, + { + "epoch": 7.111260053619303, + "grad_norm": 0.030890559777617455, + "learning_rate": 4.069530311680247e-05, + "loss": 0.0682, + "num_input_tokens_seen": 27701480, + "step": 47745 + }, + { + "epoch": 7.112004766160262, + "grad_norm": 0.019355973228812218, + "learning_rate": 4.0692773748586743e-05, + "loss": 0.0228, + "num_input_tokens_seen": 27704424, + "step": 47750 + }, + { + "epoch": 7.112749478701222, + "grad_norm": 0.008443296886980534, + "learning_rate": 4.069024411525756e-05, + "loss": 0.0004, + "num_input_tokens_seen": 27707464, + "step": 47755 + }, + { + "epoch": 7.1134941912421805, + "grad_norm": 4.235771179199219, + "learning_rate": 4.0687714216857645e-05, + "loss": 0.1948, + "num_input_tokens_seen": 27710152, + "step": 47760 + }, + { + "epoch": 7.114238903783139, + "grad_norm": 13.597216606140137, + "learning_rate": 4.068518405342974e-05, + "loss": 0.1727, + "num_input_tokens_seen": 27713000, + "step": 47765 + }, + { + "epoch": 7.114983616324099, + "grad_norm": 0.028482740744948387, + "learning_rate": 4.068265362501659e-05, + "loss": 0.1251, + "num_input_tokens_seen": 27715944, + "step": 47770 + }, + { + "epoch": 7.115728328865058, + "grad_norm": 0.03137160837650299, + "learning_rate": 4.0680122931660955e-05, + "loss": 0.0007, + "num_input_tokens_seen": 27718824, + "step": 47775 + }, + { + "epoch": 7.116473041406017, + "grad_norm": 0.03408890217542648, + "learning_rate": 4.067759197340558e-05, + "loss": 0.1217, + "num_input_tokens_seen": 27721640, + "step": 47780 + }, + { + "epoch": 7.117217753946976, + "grad_norm": 0.03722771629691124, + "learning_rate": 4.0675060750293216e-05, + "loss": 0.2007, + "num_input_tokens_seen": 27724456, + "step": 47785 + }, + { + "epoch": 7.117962466487936, + "grad_norm": 32.49979782104492, + "learning_rate": 4.067252926236663e-05, + "loss": 0.6149, + "num_input_tokens_seen": 27727144, + "step": 47790 + }, + { + "epoch": 7.1187071790288945, + "grad_norm": 31.20987319946289, + "learning_rate": 4.06699975096686e-05, + "loss": 0.1455, + "num_input_tokens_seen": 27729864, + "step": 47795 + }, + { + "epoch": 7.119451891569854, + "grad_norm": 0.059561263769865036, + "learning_rate": 4.066746549224189e-05, + "loss": 0.023, + "num_input_tokens_seen": 27732936, + "step": 47800 + }, + { + "epoch": 7.120196604110813, + "grad_norm": 44.76969909667969, + "learning_rate": 4.0664933210129265e-05, + "loss": 0.203, + "num_input_tokens_seen": 27735848, + "step": 47805 + }, + { + "epoch": 7.1209413166517725, + "grad_norm": 16.638792037963867, + "learning_rate": 4.066240066337351e-05, + "loss": 0.1718, + "num_input_tokens_seen": 27738792, + "step": 47810 + }, + { + "epoch": 7.121686029192731, + "grad_norm": 1.2935079336166382, + "learning_rate": 4.065986785201743e-05, + "loss": 0.2013, + "num_input_tokens_seen": 27741736, + "step": 47815 + }, + { + "epoch": 7.122430741733691, + "grad_norm": 3.389005661010742, + "learning_rate": 4.065733477610379e-05, + "loss": 0.0977, + "num_input_tokens_seen": 27744648, + "step": 47820 + }, + { + "epoch": 7.12317545427465, + "grad_norm": 1.7251063585281372, + "learning_rate": 4.065480143567539e-05, + "loss": 0.0032, + "num_input_tokens_seen": 27747560, + "step": 47825 + }, + { + "epoch": 7.123920166815609, + "grad_norm": 17.052701950073242, + "learning_rate": 4.065226783077504e-05, + "loss": 0.2211, + "num_input_tokens_seen": 27750408, + "step": 47830 + }, + { + "epoch": 7.124664879356568, + "grad_norm": 17.176013946533203, + "learning_rate": 4.0649733961445525e-05, + "loss": 0.1092, + "num_input_tokens_seen": 27753384, + "step": 47835 + }, + { + "epoch": 7.125409591897528, + "grad_norm": 11.356547355651855, + "learning_rate": 4.064719982772965e-05, + "loss": 0.351, + "num_input_tokens_seen": 27756136, + "step": 47840 + }, + { + "epoch": 7.1261543044384865, + "grad_norm": 52.62064743041992, + "learning_rate": 4.064466542967026e-05, + "loss": 0.2247, + "num_input_tokens_seen": 27759080, + "step": 47845 + }, + { + "epoch": 7.126899016979446, + "grad_norm": 0.00901476014405489, + "learning_rate": 4.0642130767310136e-05, + "loss": 0.0283, + "num_input_tokens_seen": 27761864, + "step": 47850 + }, + { + "epoch": 7.127643729520405, + "grad_norm": 1.415727138519287, + "learning_rate": 4.0639595840692116e-05, + "loss": 0.3043, + "num_input_tokens_seen": 27764776, + "step": 47855 + }, + { + "epoch": 7.128388442061365, + "grad_norm": 144.32608032226562, + "learning_rate": 4.063706064985901e-05, + "loss": 0.2902, + "num_input_tokens_seen": 27767560, + "step": 47860 + }, + { + "epoch": 7.129133154602323, + "grad_norm": 2.6670923233032227, + "learning_rate": 4.063452519485367e-05, + "loss": 0.0027, + "num_input_tokens_seen": 27770728, + "step": 47865 + }, + { + "epoch": 7.129877867143283, + "grad_norm": 185.76986694335938, + "learning_rate": 4.06319894757189e-05, + "loss": 0.5614, + "num_input_tokens_seen": 27773480, + "step": 47870 + }, + { + "epoch": 7.130622579684242, + "grad_norm": 5.231051445007324, + "learning_rate": 4.062945349249757e-05, + "loss": 0.011, + "num_input_tokens_seen": 27776424, + "step": 47875 + }, + { + "epoch": 7.131367292225201, + "grad_norm": 0.027607720345258713, + "learning_rate": 4.06269172452325e-05, + "loss": 0.0499, + "num_input_tokens_seen": 27779560, + "step": 47880 + }, + { + "epoch": 7.13211200476616, + "grad_norm": 27.150615692138672, + "learning_rate": 4.0624380733966546e-05, + "loss": 0.2631, + "num_input_tokens_seen": 27782472, + "step": 47885 + }, + { + "epoch": 7.13285671730712, + "grad_norm": 0.039271339774131775, + "learning_rate": 4.062184395874257e-05, + "loss": 0.0017, + "num_input_tokens_seen": 27785064, + "step": 47890 + }, + { + "epoch": 7.1336014298480785, + "grad_norm": 0.08218482881784439, + "learning_rate": 4.0619306919603405e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27787880, + "step": 47895 + }, + { + "epoch": 7.134346142389038, + "grad_norm": 0.0110985292121768, + "learning_rate": 4.061676961659193e-05, + "loss": 0.0008, + "num_input_tokens_seen": 27790920, + "step": 47900 + }, + { + "epoch": 7.135090854929997, + "grad_norm": 62.186370849609375, + "learning_rate": 4.061423204975101e-05, + "loss": 0.1804, + "num_input_tokens_seen": 27793896, + "step": 47905 + }, + { + "epoch": 7.135835567470957, + "grad_norm": 87.61914825439453, + "learning_rate": 4.06116942191235e-05, + "loss": 0.064, + "num_input_tokens_seen": 27797064, + "step": 47910 + }, + { + "epoch": 7.136580280011915, + "grad_norm": 0.0374133475124836, + "learning_rate": 4.060915612475229e-05, + "loss": 0.1456, + "num_input_tokens_seen": 27799752, + "step": 47915 + }, + { + "epoch": 7.137324992552875, + "grad_norm": 96.02776336669922, + "learning_rate": 4.060661776668024e-05, + "loss": 0.0919, + "num_input_tokens_seen": 27802472, + "step": 47920 + }, + { + "epoch": 7.138069705093834, + "grad_norm": 0.006370666436851025, + "learning_rate": 4.060407914495026e-05, + "loss": 0.0024, + "num_input_tokens_seen": 27805288, + "step": 47925 + }, + { + "epoch": 7.1388144176347925, + "grad_norm": 1.1886520385742188, + "learning_rate": 4.060154025960521e-05, + "loss": 0.0573, + "num_input_tokens_seen": 27808040, + "step": 47930 + }, + { + "epoch": 7.139559130175752, + "grad_norm": 53.65839385986328, + "learning_rate": 4.0599001110688e-05, + "loss": 0.0529, + "num_input_tokens_seen": 27811080, + "step": 47935 + }, + { + "epoch": 7.140303842716711, + "grad_norm": 5.630482196807861, + "learning_rate": 4.0596461698241524e-05, + "loss": 0.1877, + "num_input_tokens_seen": 27813640, + "step": 47940 + }, + { + "epoch": 7.141048555257671, + "grad_norm": 0.02058854140341282, + "learning_rate": 4.059392202230867e-05, + "loss": 0.0855, + "num_input_tokens_seen": 27816392, + "step": 47945 + }, + { + "epoch": 7.141793267798629, + "grad_norm": 0.010472571477293968, + "learning_rate": 4.059138208293236e-05, + "loss": 0.3457, + "num_input_tokens_seen": 27819272, + "step": 47950 + }, + { + "epoch": 7.142537980339589, + "grad_norm": 0.24827983975410461, + "learning_rate": 4.058884188015549e-05, + "loss": 0.2868, + "num_input_tokens_seen": 27822184, + "step": 47955 + }, + { + "epoch": 7.143282692880548, + "grad_norm": 38.99718475341797, + "learning_rate": 4.058630141402099e-05, + "loss": 0.329, + "num_input_tokens_seen": 27824936, + "step": 47960 + }, + { + "epoch": 7.144027405421507, + "grad_norm": 0.05138818919658661, + "learning_rate": 4.058376068457176e-05, + "loss": 0.1055, + "num_input_tokens_seen": 27827720, + "step": 47965 + }, + { + "epoch": 7.144772117962466, + "grad_norm": 3.7525954246520996, + "learning_rate": 4.058121969185073e-05, + "loss": 0.1582, + "num_input_tokens_seen": 27830824, + "step": 47970 + }, + { + "epoch": 7.145516830503426, + "grad_norm": 0.016809383407235146, + "learning_rate": 4.057867843590083e-05, + "loss": 0.0017, + "num_input_tokens_seen": 27833512, + "step": 47975 + }, + { + "epoch": 7.1462615430443845, + "grad_norm": 0.016901206225156784, + "learning_rate": 4.0576136916765e-05, + "loss": 0.0009, + "num_input_tokens_seen": 27836552, + "step": 47980 + }, + { + "epoch": 7.147006255585344, + "grad_norm": 0.024648718535900116, + "learning_rate": 4.0573595134486166e-05, + "loss": 0.2575, + "num_input_tokens_seen": 27839400, + "step": 47985 + }, + { + "epoch": 7.147750968126303, + "grad_norm": 0.06044595688581467, + "learning_rate": 4.0571053089107256e-05, + "loss": 0.367, + "num_input_tokens_seen": 27842472, + "step": 47990 + }, + { + "epoch": 7.148495680667263, + "grad_norm": 0.2727493941783905, + "learning_rate": 4.056851078067124e-05, + "loss": 0.2034, + "num_input_tokens_seen": 27845288, + "step": 47995 + }, + { + "epoch": 7.149240393208221, + "grad_norm": 0.00617943936958909, + "learning_rate": 4.056596820922106e-05, + "loss": 0.1904, + "num_input_tokens_seen": 27848520, + "step": 48000 + }, + { + "epoch": 7.149985105749181, + "grad_norm": 0.01825210452079773, + "learning_rate": 4.0563425374799665e-05, + "loss": 0.1156, + "num_input_tokens_seen": 27851496, + "step": 48005 + }, + { + "epoch": 7.15072981829014, + "grad_norm": 12.724492073059082, + "learning_rate": 4.0560882277450017e-05, + "loss": 0.0097, + "num_input_tokens_seen": 27854248, + "step": 48010 + }, + { + "epoch": 7.151474530831099, + "grad_norm": 0.009147404693067074, + "learning_rate": 4.055833891721508e-05, + "loss": 0.3446, + "num_input_tokens_seen": 27857000, + "step": 48015 + }, + { + "epoch": 7.152219243372058, + "grad_norm": 0.0230014119297266, + "learning_rate": 4.0555795294137824e-05, + "loss": 0.2738, + "num_input_tokens_seen": 27859848, + "step": 48020 + }, + { + "epoch": 7.152963955913018, + "grad_norm": 15.791687965393066, + "learning_rate": 4.05532514082612e-05, + "loss": 0.4815, + "num_input_tokens_seen": 27862888, + "step": 48025 + }, + { + "epoch": 7.153708668453977, + "grad_norm": 0.009888424538075924, + "learning_rate": 4.055070725962822e-05, + "loss": 0.001, + "num_input_tokens_seen": 27865992, + "step": 48030 + }, + { + "epoch": 7.154453380994936, + "grad_norm": 66.85346221923828, + "learning_rate": 4.0548162848281835e-05, + "loss": 0.1476, + "num_input_tokens_seen": 27868936, + "step": 48035 + }, + { + "epoch": 7.155198093535895, + "grad_norm": 40.386653900146484, + "learning_rate": 4.0545618174265045e-05, + "loss": 0.122, + "num_input_tokens_seen": 27871816, + "step": 48040 + }, + { + "epoch": 7.155942806076855, + "grad_norm": 0.03613187372684479, + "learning_rate": 4.054307323762083e-05, + "loss": 0.2029, + "num_input_tokens_seen": 27874504, + "step": 48045 + }, + { + "epoch": 7.156687518617813, + "grad_norm": 0.08116739243268967, + "learning_rate": 4.05405280383922e-05, + "loss": 0.2017, + "num_input_tokens_seen": 27877672, + "step": 48050 + }, + { + "epoch": 7.157432231158773, + "grad_norm": 1.6402355432510376, + "learning_rate": 4.053798257662213e-05, + "loss": 0.01, + "num_input_tokens_seen": 27880392, + "step": 48055 + }, + { + "epoch": 7.158176943699732, + "grad_norm": 0.1690255105495453, + "learning_rate": 4.053543685235365e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27883016, + "step": 48060 + }, + { + "epoch": 7.158921656240691, + "grad_norm": 17.43071174621582, + "learning_rate": 4.0532890865629744e-05, + "loss": 0.1989, + "num_input_tokens_seen": 27885832, + "step": 48065 + }, + { + "epoch": 7.15966636878165, + "grad_norm": 0.06574725359678268, + "learning_rate": 4.053034461649344e-05, + "loss": 0.0048, + "num_input_tokens_seen": 27888520, + "step": 48070 + }, + { + "epoch": 7.16041108132261, + "grad_norm": 68.6132583618164, + "learning_rate": 4.0527798104987745e-05, + "loss": 0.0264, + "num_input_tokens_seen": 27891400, + "step": 48075 + }, + { + "epoch": 7.161155793863569, + "grad_norm": 0.13580621778964996, + "learning_rate": 4.052525133115569e-05, + "loss": 0.0074, + "num_input_tokens_seen": 27894440, + "step": 48080 + }, + { + "epoch": 7.161900506404528, + "grad_norm": 0.023456072434782982, + "learning_rate": 4.052270429504028e-05, + "loss": 0.0013, + "num_input_tokens_seen": 27897480, + "step": 48085 + }, + { + "epoch": 7.162645218945487, + "grad_norm": 0.19830723106861115, + "learning_rate": 4.0520156996684565e-05, + "loss": 0.0026, + "num_input_tokens_seen": 27900648, + "step": 48090 + }, + { + "epoch": 7.163389931486447, + "grad_norm": 0.3247873783111572, + "learning_rate": 4.051760943613158e-05, + "loss": 0.0274, + "num_input_tokens_seen": 27903720, + "step": 48095 + }, + { + "epoch": 7.164134644027405, + "grad_norm": 21.198062896728516, + "learning_rate": 4.0515061613424345e-05, + "loss": 0.4097, + "num_input_tokens_seen": 27906472, + "step": 48100 + }, + { + "epoch": 7.164879356568365, + "grad_norm": 0.005560097750276327, + "learning_rate": 4.051251352860591e-05, + "loss": 0.2583, + "num_input_tokens_seen": 27909384, + "step": 48105 + }, + { + "epoch": 7.165624069109324, + "grad_norm": 3.777017593383789, + "learning_rate": 4.0509965181719326e-05, + "loss": 0.3772, + "num_input_tokens_seen": 27912584, + "step": 48110 + }, + { + "epoch": 7.166368781650283, + "grad_norm": 0.02037322148680687, + "learning_rate": 4.050741657280765e-05, + "loss": 0.595, + "num_input_tokens_seen": 27915720, + "step": 48115 + }, + { + "epoch": 7.167113494191242, + "grad_norm": 0.47083738446235657, + "learning_rate": 4.050486770191393e-05, + "loss": 0.2554, + "num_input_tokens_seen": 27918344, + "step": 48120 + }, + { + "epoch": 7.167858206732201, + "grad_norm": 0.0111991036683321, + "learning_rate": 4.050231856908122e-05, + "loss": 0.2734, + "num_input_tokens_seen": 27921320, + "step": 48125 + }, + { + "epoch": 7.168602919273161, + "grad_norm": 195.00692749023438, + "learning_rate": 4.04997691743526e-05, + "loss": 0.2795, + "num_input_tokens_seen": 27924520, + "step": 48130 + }, + { + "epoch": 7.169347631814119, + "grad_norm": 130.43995666503906, + "learning_rate": 4.0497219517771137e-05, + "loss": 0.4401, + "num_input_tokens_seen": 27927496, + "step": 48135 + }, + { + "epoch": 7.170092344355079, + "grad_norm": 0.27830490469932556, + "learning_rate": 4.04946695993799e-05, + "loss": 0.0154, + "num_input_tokens_seen": 27930504, + "step": 48140 + }, + { + "epoch": 7.170837056896038, + "grad_norm": 0.05271156504750252, + "learning_rate": 4.0492119419221966e-05, + "loss": 0.2546, + "num_input_tokens_seen": 27933608, + "step": 48145 + }, + { + "epoch": 7.171581769436997, + "grad_norm": 30.345394134521484, + "learning_rate": 4.048956897734042e-05, + "loss": 0.2185, + "num_input_tokens_seen": 27936424, + "step": 48150 + }, + { + "epoch": 7.172326481977956, + "grad_norm": 0.046734292060136795, + "learning_rate": 4.048701827377835e-05, + "loss": 0.0069, + "num_input_tokens_seen": 27939208, + "step": 48155 + }, + { + "epoch": 7.173071194518916, + "grad_norm": 0.06347218155860901, + "learning_rate": 4.0484467308578844e-05, + "loss": 0.0023, + "num_input_tokens_seen": 27942056, + "step": 48160 + }, + { + "epoch": 7.173815907059875, + "grad_norm": 0.02109243907034397, + "learning_rate": 4.0481916081785e-05, + "loss": 0.2731, + "num_input_tokens_seen": 27945416, + "step": 48165 + }, + { + "epoch": 7.174560619600834, + "grad_norm": 20.779020309448242, + "learning_rate": 4.047936459343992e-05, + "loss": 0.2256, + "num_input_tokens_seen": 27948072, + "step": 48170 + }, + { + "epoch": 7.175305332141793, + "grad_norm": 0.019597617909312248, + "learning_rate": 4.047681284358671e-05, + "loss": 0.0138, + "num_input_tokens_seen": 27950856, + "step": 48175 + }, + { + "epoch": 7.176050044682753, + "grad_norm": 0.0256367065012455, + "learning_rate": 4.0474260832268476e-05, + "loss": 0.0011, + "num_input_tokens_seen": 27953736, + "step": 48180 + }, + { + "epoch": 7.176794757223711, + "grad_norm": 0.5148619413375854, + "learning_rate": 4.047170855952833e-05, + "loss": 0.0017, + "num_input_tokens_seen": 27956488, + "step": 48185 + }, + { + "epoch": 7.177539469764671, + "grad_norm": 0.013815443962812424, + "learning_rate": 4.04691560254094e-05, + "loss": 0.0003, + "num_input_tokens_seen": 27959272, + "step": 48190 + }, + { + "epoch": 7.17828418230563, + "grad_norm": 15.16320514678955, + "learning_rate": 4.046660322995479e-05, + "loss": 0.2282, + "num_input_tokens_seen": 27962024, + "step": 48195 + }, + { + "epoch": 7.1790288948465895, + "grad_norm": 0.2841576039791107, + "learning_rate": 4.046405017320765e-05, + "loss": 0.2737, + "num_input_tokens_seen": 27964840, + "step": 48200 + }, + { + "epoch": 7.179773607387548, + "grad_norm": 0.10936105251312256, + "learning_rate": 4.046149685521109e-05, + "loss": 0.1129, + "num_input_tokens_seen": 27967688, + "step": 48205 + }, + { + "epoch": 7.180518319928508, + "grad_norm": 0.01501579862087965, + "learning_rate": 4.045894327600826e-05, + "loss": 0.2086, + "num_input_tokens_seen": 27970632, + "step": 48210 + }, + { + "epoch": 7.181263032469467, + "grad_norm": 0.047545354813337326, + "learning_rate": 4.04563894356423e-05, + "loss": 0.0015, + "num_input_tokens_seen": 27973800, + "step": 48215 + }, + { + "epoch": 7.182007745010426, + "grad_norm": 0.08688477426767349, + "learning_rate": 4.045383533415634e-05, + "loss": 0.2115, + "num_input_tokens_seen": 27976488, + "step": 48220 + }, + { + "epoch": 7.182752457551385, + "grad_norm": 0.037970416247844696, + "learning_rate": 4.045128097159354e-05, + "loss": 0.1416, + "num_input_tokens_seen": 27979592, + "step": 48225 + }, + { + "epoch": 7.183497170092345, + "grad_norm": 1.9304375648498535, + "learning_rate": 4.044872634799706e-05, + "loss": 0.0602, + "num_input_tokens_seen": 27982344, + "step": 48230 + }, + { + "epoch": 7.184241882633303, + "grad_norm": 35.26533126831055, + "learning_rate": 4.044617146341003e-05, + "loss": 0.0947, + "num_input_tokens_seen": 27985480, + "step": 48235 + }, + { + "epoch": 7.184986595174263, + "grad_norm": 0.05653112754225731, + "learning_rate": 4.044361631787565e-05, + "loss": 0.0382, + "num_input_tokens_seen": 27988328, + "step": 48240 + }, + { + "epoch": 7.185731307715222, + "grad_norm": 197.4742889404297, + "learning_rate": 4.044106091143707e-05, + "loss": 0.3497, + "num_input_tokens_seen": 27991112, + "step": 48245 + }, + { + "epoch": 7.1864760202561815, + "grad_norm": 0.020641209557652473, + "learning_rate": 4.043850524413745e-05, + "loss": 0.211, + "num_input_tokens_seen": 27993672, + "step": 48250 + }, + { + "epoch": 7.18722073279714, + "grad_norm": 3.8209521770477295, + "learning_rate": 4.0435949316019974e-05, + "loss": 0.4566, + "num_input_tokens_seen": 27996616, + "step": 48255 + }, + { + "epoch": 7.1879654453381, + "grad_norm": 2.572371006011963, + "learning_rate": 4.0433393127127827e-05, + "loss": 0.2052, + "num_input_tokens_seen": 27999496, + "step": 48260 + }, + { + "epoch": 7.188710157879059, + "grad_norm": 9.9447660446167, + "learning_rate": 4.043083667750419e-05, + "loss": 0.1121, + "num_input_tokens_seen": 28002312, + "step": 48265 + }, + { + "epoch": 7.189454870420018, + "grad_norm": 9.127978324890137, + "learning_rate": 4.042827996719225e-05, + "loss": 0.0137, + "num_input_tokens_seen": 28005352, + "step": 48270 + }, + { + "epoch": 7.190199582960977, + "grad_norm": 1.136707067489624, + "learning_rate": 4.04257229962352e-05, + "loss": 0.1022, + "num_input_tokens_seen": 28008296, + "step": 48275 + }, + { + "epoch": 7.190944295501936, + "grad_norm": 24.329471588134766, + "learning_rate": 4.042316576467624e-05, + "loss": 0.1725, + "num_input_tokens_seen": 28011400, + "step": 48280 + }, + { + "epoch": 7.1916890080428955, + "grad_norm": 106.86140441894531, + "learning_rate": 4.0420608272558566e-05, + "loss": 0.3652, + "num_input_tokens_seen": 28014376, + "step": 48285 + }, + { + "epoch": 7.192433720583854, + "grad_norm": 42.192996978759766, + "learning_rate": 4.0418050519925386e-05, + "loss": 0.2183, + "num_input_tokens_seen": 28017480, + "step": 48290 + }, + { + "epoch": 7.193178433124814, + "grad_norm": 17.589258193969727, + "learning_rate": 4.041549250681992e-05, + "loss": 0.1646, + "num_input_tokens_seen": 28020328, + "step": 48295 + }, + { + "epoch": 7.193923145665773, + "grad_norm": 0.4998256266117096, + "learning_rate": 4.041293423328537e-05, + "loss": 0.098, + "num_input_tokens_seen": 28023176, + "step": 48300 + }, + { + "epoch": 7.194667858206732, + "grad_norm": 11.618240356445312, + "learning_rate": 4.0410375699364964e-05, + "loss": 0.1286, + "num_input_tokens_seen": 28026312, + "step": 48305 + }, + { + "epoch": 7.195412570747691, + "grad_norm": 17.518930435180664, + "learning_rate": 4.040781690510193e-05, + "loss": 0.0293, + "num_input_tokens_seen": 28029416, + "step": 48310 + }, + { + "epoch": 7.196157283288651, + "grad_norm": 9.496368408203125, + "learning_rate": 4.0405257850539474e-05, + "loss": 0.3254, + "num_input_tokens_seen": 28032232, + "step": 48315 + }, + { + "epoch": 7.196901995829609, + "grad_norm": 0.12986722588539124, + "learning_rate": 4.040269853572085e-05, + "loss": 0.1518, + "num_input_tokens_seen": 28035080, + "step": 48320 + }, + { + "epoch": 7.197646708370569, + "grad_norm": 14.284668922424316, + "learning_rate": 4.0400138960689295e-05, + "loss": 0.6321, + "num_input_tokens_seen": 28037864, + "step": 48325 + }, + { + "epoch": 7.198391420911528, + "grad_norm": 3.3008127212524414, + "learning_rate": 4.0397579125488036e-05, + "loss": 0.0051, + "num_input_tokens_seen": 28040904, + "step": 48330 + }, + { + "epoch": 7.1991361334524875, + "grad_norm": 0.19529089331626892, + "learning_rate": 4.039501903016033e-05, + "loss": 0.248, + "num_input_tokens_seen": 28043560, + "step": 48335 + }, + { + "epoch": 7.199880845993446, + "grad_norm": 25.563180923461914, + "learning_rate": 4.039245867474942e-05, + "loss": 0.1401, + "num_input_tokens_seen": 28046344, + "step": 48340 + }, + { + "epoch": 7.200625558534406, + "grad_norm": 1.981722354888916, + "learning_rate": 4.038989805929857e-05, + "loss": 0.0613, + "num_input_tokens_seen": 28049352, + "step": 48345 + }, + { + "epoch": 7.201370271075365, + "grad_norm": 40.075035095214844, + "learning_rate": 4.0387337183851035e-05, + "loss": 0.1007, + "num_input_tokens_seen": 28052360, + "step": 48350 + }, + { + "epoch": 7.202114983616324, + "grad_norm": 48.99912643432617, + "learning_rate": 4.038477604845008e-05, + "loss": 0.3357, + "num_input_tokens_seen": 28055048, + "step": 48355 + }, + { + "epoch": 7.202859696157283, + "grad_norm": 80.62604522705078, + "learning_rate": 4.038221465313896e-05, + "loss": 0.3056, + "num_input_tokens_seen": 28057960, + "step": 48360 + }, + { + "epoch": 7.203604408698243, + "grad_norm": 16.10933494567871, + "learning_rate": 4.037965299796096e-05, + "loss": 0.2514, + "num_input_tokens_seen": 28061032, + "step": 48365 + }, + { + "epoch": 7.2043491212392015, + "grad_norm": 0.21013429760932922, + "learning_rate": 4.037709108295936e-05, + "loss": 0.359, + "num_input_tokens_seen": 28064072, + "step": 48370 + }, + { + "epoch": 7.205093833780161, + "grad_norm": 0.23349301517009735, + "learning_rate": 4.037452890817743e-05, + "loss": 0.0542, + "num_input_tokens_seen": 28067592, + "step": 48375 + }, + { + "epoch": 7.20583854632112, + "grad_norm": 0.2967873215675354, + "learning_rate": 4.0371966473658465e-05, + "loss": 0.0742, + "num_input_tokens_seen": 28070632, + "step": 48380 + }, + { + "epoch": 7.2065832588620795, + "grad_norm": 67.8662338256836, + "learning_rate": 4.0369403779445744e-05, + "loss": 0.3478, + "num_input_tokens_seen": 28073704, + "step": 48385 + }, + { + "epoch": 7.207327971403038, + "grad_norm": 0.3161981403827667, + "learning_rate": 4.0366840825582574e-05, + "loss": 0.0032, + "num_input_tokens_seen": 28076520, + "step": 48390 + }, + { + "epoch": 7.208072683943998, + "grad_norm": 0.05729537084698677, + "learning_rate": 4.036427761211224e-05, + "loss": 0.0719, + "num_input_tokens_seen": 28079688, + "step": 48395 + }, + { + "epoch": 7.208817396484957, + "grad_norm": 16.89933204650879, + "learning_rate": 4.0361714139078055e-05, + "loss": 0.0617, + "num_input_tokens_seen": 28082184, + "step": 48400 + }, + { + "epoch": 7.209562109025916, + "grad_norm": 0.14341704547405243, + "learning_rate": 4.0359150406523314e-05, + "loss": 0.2971, + "num_input_tokens_seen": 28085128, + "step": 48405 + }, + { + "epoch": 7.210306821566875, + "grad_norm": 0.020022893324494362, + "learning_rate": 4.0356586414491345e-05, + "loss": 0.0015, + "num_input_tokens_seen": 28087912, + "step": 48410 + }, + { + "epoch": 7.211051534107835, + "grad_norm": 18.865581512451172, + "learning_rate": 4.035402216302546e-05, + "loss": 0.1642, + "num_input_tokens_seen": 28091176, + "step": 48415 + }, + { + "epoch": 7.2117962466487935, + "grad_norm": 0.37615272402763367, + "learning_rate": 4.035145765216897e-05, + "loss": 0.0116, + "num_input_tokens_seen": 28093928, + "step": 48420 + }, + { + "epoch": 7.212540959189753, + "grad_norm": 0.664425790309906, + "learning_rate": 4.03488928819652e-05, + "loss": 0.1538, + "num_input_tokens_seen": 28096744, + "step": 48425 + }, + { + "epoch": 7.213285671730712, + "grad_norm": 29.931589126586914, + "learning_rate": 4.03463278524575e-05, + "loss": 0.3112, + "num_input_tokens_seen": 28099528, + "step": 48430 + }, + { + "epoch": 7.2140303842716715, + "grad_norm": 0.4736468195915222, + "learning_rate": 4.034376256368917e-05, + "loss": 0.1878, + "num_input_tokens_seen": 28102536, + "step": 48435 + }, + { + "epoch": 7.21477509681263, + "grad_norm": 19.901845932006836, + "learning_rate": 4.034119701570358e-05, + "loss": 0.0116, + "num_input_tokens_seen": 28105800, + "step": 48440 + }, + { + "epoch": 7.21551980935359, + "grad_norm": 20.064373016357422, + "learning_rate": 4.033863120854405e-05, + "loss": 0.1148, + "num_input_tokens_seen": 28108744, + "step": 48445 + }, + { + "epoch": 7.216264521894549, + "grad_norm": 1.3435993194580078, + "learning_rate": 4.0336065142253945e-05, + "loss": 0.1183, + "num_input_tokens_seen": 28111848, + "step": 48450 + }, + { + "epoch": 7.217009234435508, + "grad_norm": 0.05979885160923004, + "learning_rate": 4.03334988168766e-05, + "loss": 0.135, + "num_input_tokens_seen": 28114984, + "step": 48455 + }, + { + "epoch": 7.217753946976467, + "grad_norm": 26.849294662475586, + "learning_rate": 4.0330932232455376e-05, + "loss": 0.2496, + "num_input_tokens_seen": 28117544, + "step": 48460 + }, + { + "epoch": 7.218498659517426, + "grad_norm": 45.91472244262695, + "learning_rate": 4.0328365389033636e-05, + "loss": 0.2241, + "num_input_tokens_seen": 28120872, + "step": 48465 + }, + { + "epoch": 7.2192433720583855, + "grad_norm": 0.08838444948196411, + "learning_rate": 4.0325798286654734e-05, + "loss": 0.2545, + "num_input_tokens_seen": 28123624, + "step": 48470 + }, + { + "epoch": 7.219988084599344, + "grad_norm": 0.3162168860435486, + "learning_rate": 4.032323092536206e-05, + "loss": 0.0029, + "num_input_tokens_seen": 28126536, + "step": 48475 + }, + { + "epoch": 7.220732797140304, + "grad_norm": 2.527153491973877, + "learning_rate": 4.032066330519896e-05, + "loss": 0.0032, + "num_input_tokens_seen": 28129512, + "step": 48480 + }, + { + "epoch": 7.221477509681263, + "grad_norm": 41.62015914916992, + "learning_rate": 4.0318095426208835e-05, + "loss": 0.3432, + "num_input_tokens_seen": 28132648, + "step": 48485 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 8.562302589416504, + "learning_rate": 4.031552728843505e-05, + "loss": 0.0025, + "num_input_tokens_seen": 28135336, + "step": 48490 + }, + { + "epoch": 7.222966934763181, + "grad_norm": 0.02948830835521221, + "learning_rate": 4.0312958891921e-05, + "loss": 0.2619, + "num_input_tokens_seen": 28138536, + "step": 48495 + }, + { + "epoch": 7.223711647304141, + "grad_norm": 0.5610650181770325, + "learning_rate": 4.031039023671007e-05, + "loss": 0.1338, + "num_input_tokens_seen": 28141096, + "step": 48500 + }, + { + "epoch": 7.2244563598450995, + "grad_norm": 0.0010729030473157763, + "learning_rate": 4.0307821322845664e-05, + "loss": 0.3201, + "num_input_tokens_seen": 28143976, + "step": 48505 + }, + { + "epoch": 7.225201072386059, + "grad_norm": 0.00940894428640604, + "learning_rate": 4.0305252150371175e-05, + "loss": 0.0592, + "num_input_tokens_seen": 28147080, + "step": 48510 + }, + { + "epoch": 7.225945784927018, + "grad_norm": 0.008607619442045689, + "learning_rate": 4.030268271933e-05, + "loss": 0.1539, + "num_input_tokens_seen": 28149960, + "step": 48515 + }, + { + "epoch": 7.2266904974679775, + "grad_norm": 0.45835527777671814, + "learning_rate": 4.030011302976555e-05, + "loss": 0.1034, + "num_input_tokens_seen": 28152680, + "step": 48520 + }, + { + "epoch": 7.227435210008936, + "grad_norm": 2.4681503772735596, + "learning_rate": 4.0297543081721254e-05, + "loss": 0.2435, + "num_input_tokens_seen": 28155624, + "step": 48525 + }, + { + "epoch": 7.228179922549896, + "grad_norm": 102.97300720214844, + "learning_rate": 4.02949728752405e-05, + "loss": 0.1811, + "num_input_tokens_seen": 28158728, + "step": 48530 + }, + { + "epoch": 7.228924635090855, + "grad_norm": 0.061193957924842834, + "learning_rate": 4.0292402410366734e-05, + "loss": 0.0825, + "num_input_tokens_seen": 28161288, + "step": 48535 + }, + { + "epoch": 7.229669347631814, + "grad_norm": 0.14954012632369995, + "learning_rate": 4.0289831687143376e-05, + "loss": 0.1769, + "num_input_tokens_seen": 28164296, + "step": 48540 + }, + { + "epoch": 7.230414060172773, + "grad_norm": 0.10674431920051575, + "learning_rate": 4.028726070561385e-05, + "loss": 0.2633, + "num_input_tokens_seen": 28167112, + "step": 48545 + }, + { + "epoch": 7.231158772713733, + "grad_norm": 0.05879191681742668, + "learning_rate": 4.028468946582158e-05, + "loss": 0.1867, + "num_input_tokens_seen": 28170216, + "step": 48550 + }, + { + "epoch": 7.2319034852546915, + "grad_norm": 47.057010650634766, + "learning_rate": 4.028211796781003e-05, + "loss": 0.0746, + "num_input_tokens_seen": 28173448, + "step": 48555 + }, + { + "epoch": 7.232648197795651, + "grad_norm": 22.753185272216797, + "learning_rate": 4.027954621162262e-05, + "loss": 0.1516, + "num_input_tokens_seen": 28176264, + "step": 48560 + }, + { + "epoch": 7.23339291033661, + "grad_norm": 0.11105937510728836, + "learning_rate": 4.027697419730281e-05, + "loss": 0.1056, + "num_input_tokens_seen": 28179112, + "step": 48565 + }, + { + "epoch": 7.23413762287757, + "grad_norm": 0.0678618997335434, + "learning_rate": 4.027440192489404e-05, + "loss": 0.1269, + "num_input_tokens_seen": 28182408, + "step": 48570 + }, + { + "epoch": 7.234882335418528, + "grad_norm": 35.69390869140625, + "learning_rate": 4.0271829394439786e-05, + "loss": 0.0648, + "num_input_tokens_seen": 28185288, + "step": 48575 + }, + { + "epoch": 7.235627047959488, + "grad_norm": 10.252765655517578, + "learning_rate": 4.026925660598349e-05, + "loss": 0.3693, + "num_input_tokens_seen": 28188072, + "step": 48580 + }, + { + "epoch": 7.236371760500447, + "grad_norm": 4.5979766845703125, + "learning_rate": 4.0266683559568625e-05, + "loss": 0.1875, + "num_input_tokens_seen": 28190984, + "step": 48585 + }, + { + "epoch": 7.237116473041406, + "grad_norm": 1.1674050092697144, + "learning_rate": 4.0264110255238654e-05, + "loss": 0.2392, + "num_input_tokens_seen": 28194216, + "step": 48590 + }, + { + "epoch": 7.237861185582365, + "grad_norm": 10.949026107788086, + "learning_rate": 4.026153669303706e-05, + "loss": 0.0165, + "num_input_tokens_seen": 28197064, + "step": 48595 + }, + { + "epoch": 7.238605898123325, + "grad_norm": 0.1144467294216156, + "learning_rate": 4.0258962873007305e-05, + "loss": 0.1547, + "num_input_tokens_seen": 28200040, + "step": 48600 + }, + { + "epoch": 7.2393506106642835, + "grad_norm": 0.6821315288543701, + "learning_rate": 4.025638879519289e-05, + "loss": 0.0116, + "num_input_tokens_seen": 28202984, + "step": 48605 + }, + { + "epoch": 7.240095323205243, + "grad_norm": 75.39000701904297, + "learning_rate": 4.025381445963728e-05, + "loss": 0.5718, + "num_input_tokens_seen": 28205800, + "step": 48610 + }, + { + "epoch": 7.240840035746202, + "grad_norm": 0.01052092108875513, + "learning_rate": 4.025123986638399e-05, + "loss": 0.1088, + "num_input_tokens_seen": 28208648, + "step": 48615 + }, + { + "epoch": 7.241584748287162, + "grad_norm": 0.005917652510106564, + "learning_rate": 4.02486650154765e-05, + "loss": 0.0944, + "num_input_tokens_seen": 28211240, + "step": 48620 + }, + { + "epoch": 7.24232946082812, + "grad_norm": 0.07910105586051941, + "learning_rate": 4.0246089906958317e-05, + "loss": 0.2987, + "num_input_tokens_seen": 28214056, + "step": 48625 + }, + { + "epoch": 7.243074173369079, + "grad_norm": 47.96892166137695, + "learning_rate": 4.024351454087293e-05, + "loss": 0.1216, + "num_input_tokens_seen": 28217064, + "step": 48630 + }, + { + "epoch": 7.243818885910039, + "grad_norm": 0.005446525756269693, + "learning_rate": 4.0240938917263864e-05, + "loss": 0.045, + "num_input_tokens_seen": 28219944, + "step": 48635 + }, + { + "epoch": 7.2445635984509975, + "grad_norm": 0.08904507756233215, + "learning_rate": 4.0238363036174625e-05, + "loss": 0.2143, + "num_input_tokens_seen": 28222760, + "step": 48640 + }, + { + "epoch": 7.245308310991957, + "grad_norm": 0.023750489577651024, + "learning_rate": 4.023578689764873e-05, + "loss": 0.1126, + "num_input_tokens_seen": 28225608, + "step": 48645 + }, + { + "epoch": 7.246053023532916, + "grad_norm": 0.21322768926620483, + "learning_rate": 4.02332105017297e-05, + "loss": 0.0007, + "num_input_tokens_seen": 28228296, + "step": 48650 + }, + { + "epoch": 7.246797736073876, + "grad_norm": 0.020871620625257492, + "learning_rate": 4.0230633848461056e-05, + "loss": 0.1957, + "num_input_tokens_seen": 28231752, + "step": 48655 + }, + { + "epoch": 7.247542448614834, + "grad_norm": 0.11308237165212631, + "learning_rate": 4.022805693788634e-05, + "loss": 0.0543, + "num_input_tokens_seen": 28234536, + "step": 48660 + }, + { + "epoch": 7.248287161155794, + "grad_norm": 0.994793176651001, + "learning_rate": 4.0225479770049076e-05, + "loss": 0.0102, + "num_input_tokens_seen": 28237576, + "step": 48665 + }, + { + "epoch": 7.249031873696753, + "grad_norm": 53.359676361083984, + "learning_rate": 4.022290234499281e-05, + "loss": 0.2139, + "num_input_tokens_seen": 28240392, + "step": 48670 + }, + { + "epoch": 7.249776586237712, + "grad_norm": 185.14610290527344, + "learning_rate": 4.0220324662761076e-05, + "loss": 0.769, + "num_input_tokens_seen": 28243368, + "step": 48675 + }, + { + "epoch": 7.250521298778671, + "grad_norm": 39.68642807006836, + "learning_rate": 4.021774672339743e-05, + "loss": 0.0731, + "num_input_tokens_seen": 28246184, + "step": 48680 + }, + { + "epoch": 7.251266011319631, + "grad_norm": 0.020454498007893562, + "learning_rate": 4.021516852694541e-05, + "loss": 0.091, + "num_input_tokens_seen": 28249192, + "step": 48685 + }, + { + "epoch": 7.2520107238605895, + "grad_norm": 4.648962497711182, + "learning_rate": 4.021259007344859e-05, + "loss": 0.1263, + "num_input_tokens_seen": 28252200, + "step": 48690 + }, + { + "epoch": 7.252755436401549, + "grad_norm": 19.40935516357422, + "learning_rate": 4.021001136295052e-05, + "loss": 0.2437, + "num_input_tokens_seen": 28254888, + "step": 48695 + }, + { + "epoch": 7.253500148942508, + "grad_norm": 11.64527416229248, + "learning_rate": 4.020743239549477e-05, + "loss": 0.0544, + "num_input_tokens_seen": 28257960, + "step": 48700 + }, + { + "epoch": 7.254244861483468, + "grad_norm": 1.0366711616516113, + "learning_rate": 4.0204853171124904e-05, + "loss": 0.0017, + "num_input_tokens_seen": 28260648, + "step": 48705 + }, + { + "epoch": 7.254989574024426, + "grad_norm": 0.035424020141363144, + "learning_rate": 4.0202273689884496e-05, + "loss": 0.0112, + "num_input_tokens_seen": 28263688, + "step": 48710 + }, + { + "epoch": 7.255734286565386, + "grad_norm": 0.09732764959335327, + "learning_rate": 4.019969395181713e-05, + "loss": 0.3111, + "num_input_tokens_seen": 28266792, + "step": 48715 + }, + { + "epoch": 7.256478999106345, + "grad_norm": 32.9077033996582, + "learning_rate": 4.0197113956966376e-05, + "loss": 0.112, + "num_input_tokens_seen": 28269832, + "step": 48720 + }, + { + "epoch": 7.257223711647304, + "grad_norm": 7.583802700042725, + "learning_rate": 4.019453370537583e-05, + "loss": 0.1912, + "num_input_tokens_seen": 28272296, + "step": 48725 + }, + { + "epoch": 7.257968424188263, + "grad_norm": 32.30284118652344, + "learning_rate": 4.019195319708908e-05, + "loss": 0.1679, + "num_input_tokens_seen": 28275144, + "step": 48730 + }, + { + "epoch": 7.258713136729223, + "grad_norm": 0.08583023399114609, + "learning_rate": 4.018937243214972e-05, + "loss": 0.0521, + "num_input_tokens_seen": 28277768, + "step": 48735 + }, + { + "epoch": 7.259457849270182, + "grad_norm": 54.1800651550293, + "learning_rate": 4.018679141060136e-05, + "loss": 0.1634, + "num_input_tokens_seen": 28280392, + "step": 48740 + }, + { + "epoch": 7.260202561811141, + "grad_norm": 67.02891540527344, + "learning_rate": 4.0184210132487576e-05, + "loss": 0.2627, + "num_input_tokens_seen": 28283240, + "step": 48745 + }, + { + "epoch": 7.2609472743521, + "grad_norm": 53.31513214111328, + "learning_rate": 4.018162859785201e-05, + "loss": 0.219, + "num_input_tokens_seen": 28286280, + "step": 48750 + }, + { + "epoch": 7.26169198689306, + "grad_norm": 0.12125779688358307, + "learning_rate": 4.017904680673825e-05, + "loss": 0.1422, + "num_input_tokens_seen": 28289064, + "step": 48755 + }, + { + "epoch": 7.262436699434018, + "grad_norm": 0.015021061524748802, + "learning_rate": 4.0176464759189924e-05, + "loss": 0.2303, + "num_input_tokens_seen": 28291976, + "step": 48760 + }, + { + "epoch": 7.263181411974978, + "grad_norm": 0.17716188728809357, + "learning_rate": 4.017388245525065e-05, + "loss": 0.0123, + "num_input_tokens_seen": 28294696, + "step": 48765 + }, + { + "epoch": 7.263926124515937, + "grad_norm": 78.13723754882812, + "learning_rate": 4.017129989496405e-05, + "loss": 0.2875, + "num_input_tokens_seen": 28297608, + "step": 48770 + }, + { + "epoch": 7.264670837056896, + "grad_norm": 0.7764342427253723, + "learning_rate": 4.0168717078373763e-05, + "loss": 0.0027, + "num_input_tokens_seen": 28300360, + "step": 48775 + }, + { + "epoch": 7.265415549597855, + "grad_norm": 17.403079986572266, + "learning_rate": 4.016613400552342e-05, + "loss": 0.1767, + "num_input_tokens_seen": 28303144, + "step": 48780 + }, + { + "epoch": 7.266160262138815, + "grad_norm": 0.042319267988204956, + "learning_rate": 4.016355067645666e-05, + "loss": 0.1298, + "num_input_tokens_seen": 28306024, + "step": 48785 + }, + { + "epoch": 7.266904974679774, + "grad_norm": 66.63883972167969, + "learning_rate": 4.0160967091217114e-05, + "loss": 0.0624, + "num_input_tokens_seen": 28308712, + "step": 48790 + }, + { + "epoch": 7.267649687220732, + "grad_norm": 42.119911193847656, + "learning_rate": 4.015838324984844e-05, + "loss": 0.4968, + "num_input_tokens_seen": 28311528, + "step": 48795 + }, + { + "epoch": 7.268394399761692, + "grad_norm": 31.22420310974121, + "learning_rate": 4.015579915239429e-05, + "loss": 0.4524, + "num_input_tokens_seen": 28314664, + "step": 48800 + }, + { + "epoch": 7.269139112302652, + "grad_norm": 9.236430168151855, + "learning_rate": 4.015321479889832e-05, + "loss": 0.0258, + "num_input_tokens_seen": 28317544, + "step": 48805 + }, + { + "epoch": 7.26988382484361, + "grad_norm": 38.55046844482422, + "learning_rate": 4.015063018940418e-05, + "loss": 0.0171, + "num_input_tokens_seen": 28320424, + "step": 48810 + }, + { + "epoch": 7.270628537384569, + "grad_norm": 0.05668896064162254, + "learning_rate": 4.014804532395554e-05, + "loss": 0.0817, + "num_input_tokens_seen": 28323368, + "step": 48815 + }, + { + "epoch": 7.271373249925529, + "grad_norm": 14.364341735839844, + "learning_rate": 4.014546020259607e-05, + "loss": 0.304, + "num_input_tokens_seen": 28326440, + "step": 48820 + }, + { + "epoch": 7.272117962466488, + "grad_norm": 64.95722198486328, + "learning_rate": 4.014287482536945e-05, + "loss": 0.341, + "num_input_tokens_seen": 28329256, + "step": 48825 + }, + { + "epoch": 7.272862675007447, + "grad_norm": 9.917405128479004, + "learning_rate": 4.0140289192319355e-05, + "loss": 0.1295, + "num_input_tokens_seen": 28331944, + "step": 48830 + }, + { + "epoch": 7.273607387548406, + "grad_norm": 0.1542646884918213, + "learning_rate": 4.013770330348945e-05, + "loss": 0.078, + "num_input_tokens_seen": 28334696, + "step": 48835 + }, + { + "epoch": 7.274352100089366, + "grad_norm": 0.022163311019539833, + "learning_rate": 4.013511715892344e-05, + "loss": 0.0062, + "num_input_tokens_seen": 28337480, + "step": 48840 + }, + { + "epoch": 7.275096812630324, + "grad_norm": 57.06816101074219, + "learning_rate": 4.0132530758665006e-05, + "loss": 0.1828, + "num_input_tokens_seen": 28340232, + "step": 48845 + }, + { + "epoch": 7.275841525171284, + "grad_norm": 31.55699348449707, + "learning_rate": 4.0129944102757847e-05, + "loss": 0.0819, + "num_input_tokens_seen": 28342952, + "step": 48850 + }, + { + "epoch": 7.276586237712243, + "grad_norm": 0.09502364695072174, + "learning_rate": 4.0127357191245654e-05, + "loss": 0.0038, + "num_input_tokens_seen": 28346024, + "step": 48855 + }, + { + "epoch": 7.277330950253202, + "grad_norm": 0.19058653712272644, + "learning_rate": 4.0124770024172135e-05, + "loss": 0.0013, + "num_input_tokens_seen": 28348936, + "step": 48860 + }, + { + "epoch": 7.278075662794161, + "grad_norm": 0.049952052533626556, + "learning_rate": 4.0122182601581005e-05, + "loss": 0.2101, + "num_input_tokens_seen": 28351944, + "step": 48865 + }, + { + "epoch": 7.278820375335121, + "grad_norm": 21.93574333190918, + "learning_rate": 4.011959492351597e-05, + "loss": 0.0877, + "num_input_tokens_seen": 28354600, + "step": 48870 + }, + { + "epoch": 7.27956508787608, + "grad_norm": 10.069564819335938, + "learning_rate": 4.011700699002075e-05, + "loss": 0.2054, + "num_input_tokens_seen": 28357288, + "step": 48875 + }, + { + "epoch": 7.280309800417039, + "grad_norm": 0.015448827296495438, + "learning_rate": 4.011441880113905e-05, + "loss": 0.1163, + "num_input_tokens_seen": 28360264, + "step": 48880 + }, + { + "epoch": 7.281054512957998, + "grad_norm": 42.421791076660156, + "learning_rate": 4.0111830356914605e-05, + "loss": 0.1609, + "num_input_tokens_seen": 28363368, + "step": 48885 + }, + { + "epoch": 7.281799225498958, + "grad_norm": 13.989453315734863, + "learning_rate": 4.010924165739115e-05, + "loss": 0.0529, + "num_input_tokens_seen": 28366312, + "step": 48890 + }, + { + "epoch": 7.282543938039916, + "grad_norm": 31.02651596069336, + "learning_rate": 4.0106652702612416e-05, + "loss": 0.2139, + "num_input_tokens_seen": 28369288, + "step": 48895 + }, + { + "epoch": 7.283288650580876, + "grad_norm": 0.004525783471763134, + "learning_rate": 4.010406349262214e-05, + "loss": 0.3835, + "num_input_tokens_seen": 28372072, + "step": 48900 + }, + { + "epoch": 7.284033363121835, + "grad_norm": 0.031169431284070015, + "learning_rate": 4.010147402746405e-05, + "loss": 0.116, + "num_input_tokens_seen": 28374952, + "step": 48905 + }, + { + "epoch": 7.2847780756627944, + "grad_norm": 17.09250831604004, + "learning_rate": 4.009888430718192e-05, + "loss": 0.3136, + "num_input_tokens_seen": 28377768, + "step": 48910 + }, + { + "epoch": 7.285522788203753, + "grad_norm": 0.011500981636345387, + "learning_rate": 4.009629433181947e-05, + "loss": 0.1848, + "num_input_tokens_seen": 28380744, + "step": 48915 + }, + { + "epoch": 7.286267500744713, + "grad_norm": 0.010605390183627605, + "learning_rate": 4.009370410142049e-05, + "loss": 0.2178, + "num_input_tokens_seen": 28383720, + "step": 48920 + }, + { + "epoch": 7.287012213285672, + "grad_norm": 0.06573536992073059, + "learning_rate": 4.00911136160287e-05, + "loss": 0.0013, + "num_input_tokens_seen": 28386536, + "step": 48925 + }, + { + "epoch": 7.287756925826631, + "grad_norm": 0.09278630465269089, + "learning_rate": 4.00885228756879e-05, + "loss": 0.2548, + "num_input_tokens_seen": 28389640, + "step": 48930 + }, + { + "epoch": 7.28850163836759, + "grad_norm": 0.04190068691968918, + "learning_rate": 4.008593188044183e-05, + "loss": 0.3011, + "num_input_tokens_seen": 28392520, + "step": 48935 + }, + { + "epoch": 7.28924635090855, + "grad_norm": 64.42726135253906, + "learning_rate": 4.008334063033428e-05, + "loss": 0.3222, + "num_input_tokens_seen": 28395368, + "step": 48940 + }, + { + "epoch": 7.289991063449508, + "grad_norm": 4.870441436767578, + "learning_rate": 4.008074912540901e-05, + "loss": 0.0016, + "num_input_tokens_seen": 28398280, + "step": 48945 + }, + { + "epoch": 7.290735775990468, + "grad_norm": 0.6695472598075867, + "learning_rate": 4.0078157365709823e-05, + "loss": 0.1652, + "num_input_tokens_seen": 28401288, + "step": 48950 + }, + { + "epoch": 7.291480488531427, + "grad_norm": 102.91969299316406, + "learning_rate": 4.0075565351280485e-05, + "loss": 0.0189, + "num_input_tokens_seen": 28403816, + "step": 48955 + }, + { + "epoch": 7.292225201072386, + "grad_norm": 0.06339335441589355, + "learning_rate": 4.00729730821648e-05, + "loss": 0.0542, + "num_input_tokens_seen": 28406760, + "step": 48960 + }, + { + "epoch": 7.292969913613345, + "grad_norm": 0.004517192021012306, + "learning_rate": 4.007038055840654e-05, + "loss": 0.0006, + "num_input_tokens_seen": 28410024, + "step": 48965 + }, + { + "epoch": 7.293714626154305, + "grad_norm": 79.1805419921875, + "learning_rate": 4.0067787780049535e-05, + "loss": 0.1101, + "num_input_tokens_seen": 28412744, + "step": 48970 + }, + { + "epoch": 7.294459338695264, + "grad_norm": 0.1052912250161171, + "learning_rate": 4.0065194747137555e-05, + "loss": 0.1089, + "num_input_tokens_seen": 28415624, + "step": 48975 + }, + { + "epoch": 7.295204051236222, + "grad_norm": 0.42631208896636963, + "learning_rate": 4.006260145971443e-05, + "loss": 0.1822, + "num_input_tokens_seen": 28418248, + "step": 48980 + }, + { + "epoch": 7.295948763777182, + "grad_norm": 0.052684009075164795, + "learning_rate": 4.006000791782396e-05, + "loss": 0.1998, + "num_input_tokens_seen": 28420872, + "step": 48985 + }, + { + "epoch": 7.296693476318141, + "grad_norm": 53.8248291015625, + "learning_rate": 4.0057414121509965e-05, + "loss": 0.3317, + "num_input_tokens_seen": 28423752, + "step": 48990 + }, + { + "epoch": 7.2974381888591004, + "grad_norm": 0.10875385999679565, + "learning_rate": 4.005482007081626e-05, + "loss": 0.2042, + "num_input_tokens_seen": 28426600, + "step": 48995 + }, + { + "epoch": 7.298182901400059, + "grad_norm": 4.778923988342285, + "learning_rate": 4.005222576578667e-05, + "loss": 0.6488, + "num_input_tokens_seen": 28429640, + "step": 49000 + }, + { + "epoch": 7.298927613941019, + "grad_norm": 0.13005493581295013, + "learning_rate": 4.004963120646502e-05, + "loss": 0.1013, + "num_input_tokens_seen": 28432520, + "step": 49005 + }, + { + "epoch": 7.299672326481978, + "grad_norm": 0.9364989399909973, + "learning_rate": 4.004703639289515e-05, + "loss": 0.1547, + "num_input_tokens_seen": 28435624, + "step": 49010 + }, + { + "epoch": 7.300417039022937, + "grad_norm": 2.1343114376068115, + "learning_rate": 4.004444132512089e-05, + "loss": 0.1572, + "num_input_tokens_seen": 28438792, + "step": 49015 + }, + { + "epoch": 7.301161751563896, + "grad_norm": 0.0654415488243103, + "learning_rate": 4.004184600318609e-05, + "loss": 0.1818, + "num_input_tokens_seen": 28441704, + "step": 49020 + }, + { + "epoch": 7.301906464104856, + "grad_norm": 9.953044891357422, + "learning_rate": 4.003925042713459e-05, + "loss": 0.182, + "num_input_tokens_seen": 28444424, + "step": 49025 + }, + { + "epoch": 7.302651176645814, + "grad_norm": 1.1889187097549438, + "learning_rate": 4.003665459701024e-05, + "loss": 0.0968, + "num_input_tokens_seen": 28447016, + "step": 49030 + }, + { + "epoch": 7.303395889186774, + "grad_norm": 72.93470764160156, + "learning_rate": 4.003405851285689e-05, + "loss": 0.2004, + "num_input_tokens_seen": 28449928, + "step": 49035 + }, + { + "epoch": 7.304140601727733, + "grad_norm": 92.83299255371094, + "learning_rate": 4.00314621747184e-05, + "loss": 0.3111, + "num_input_tokens_seen": 28453288, + "step": 49040 + }, + { + "epoch": 7.3048853142686925, + "grad_norm": 13.382278442382812, + "learning_rate": 4.002886558263863e-05, + "loss": 0.3001, + "num_input_tokens_seen": 28456296, + "step": 49045 + }, + { + "epoch": 7.305630026809651, + "grad_norm": 0.44028058648109436, + "learning_rate": 4.0026268736661457e-05, + "loss": 0.2955, + "num_input_tokens_seen": 28459560, + "step": 49050 + }, + { + "epoch": 7.306374739350611, + "grad_norm": 45.26716613769531, + "learning_rate": 4.002367163683075e-05, + "loss": 0.1977, + "num_input_tokens_seen": 28462376, + "step": 49055 + }, + { + "epoch": 7.30711945189157, + "grad_norm": 34.89590072631836, + "learning_rate": 4.002107428319037e-05, + "loss": 0.1818, + "num_input_tokens_seen": 28465064, + "step": 49060 + }, + { + "epoch": 7.307864164432529, + "grad_norm": 44.73863983154297, + "learning_rate": 4.0018476675784214e-05, + "loss": 0.2759, + "num_input_tokens_seen": 28468072, + "step": 49065 + }, + { + "epoch": 7.308608876973488, + "grad_norm": 5.631623268127441, + "learning_rate": 4.001587881465616e-05, + "loss": 0.0042, + "num_input_tokens_seen": 28471016, + "step": 49070 + }, + { + "epoch": 7.309353589514448, + "grad_norm": 13.901177406311035, + "learning_rate": 4.001328069985009e-05, + "loss": 0.4703, + "num_input_tokens_seen": 28473768, + "step": 49075 + }, + { + "epoch": 7.3100983020554064, + "grad_norm": 0.37003594636917114, + "learning_rate": 4.00106823314099e-05, + "loss": 0.2057, + "num_input_tokens_seen": 28476680, + "step": 49080 + }, + { + "epoch": 7.310843014596366, + "grad_norm": 0.11774049699306488, + "learning_rate": 4.0008083709379496e-05, + "loss": 0.1387, + "num_input_tokens_seen": 28479400, + "step": 49085 + }, + { + "epoch": 7.311587727137325, + "grad_norm": 0.06126413494348526, + "learning_rate": 4.0005484833802765e-05, + "loss": 0.0332, + "num_input_tokens_seen": 28482248, + "step": 49090 + }, + { + "epoch": 7.3123324396782845, + "grad_norm": 18.40099334716797, + "learning_rate": 4.0002885704723614e-05, + "loss": 0.2236, + "num_input_tokens_seen": 28485064, + "step": 49095 + }, + { + "epoch": 7.313077152219243, + "grad_norm": 3.6828837394714355, + "learning_rate": 4.000028632218596e-05, + "loss": 0.0056, + "num_input_tokens_seen": 28487784, + "step": 49100 + }, + { + "epoch": 7.313821864760203, + "grad_norm": 0.051925379782915115, + "learning_rate": 3.9997686686233724e-05, + "loss": 0.0834, + "num_input_tokens_seen": 28490568, + "step": 49105 + }, + { + "epoch": 7.314566577301162, + "grad_norm": 0.5096065998077393, + "learning_rate": 3.999508679691081e-05, + "loss": 0.1635, + "num_input_tokens_seen": 28493320, + "step": 49110 + }, + { + "epoch": 7.315311289842121, + "grad_norm": 0.07483378052711487, + "learning_rate": 3.999248665426114e-05, + "loss": 0.0694, + "num_input_tokens_seen": 28496392, + "step": 49115 + }, + { + "epoch": 7.31605600238308, + "grad_norm": 4.067879676818848, + "learning_rate": 3.998988625832865e-05, + "loss": 0.0807, + "num_input_tokens_seen": 28499336, + "step": 49120 + }, + { + "epoch": 7.31680071492404, + "grad_norm": 13.903965950012207, + "learning_rate": 3.998728560915726e-05, + "loss": 0.2524, + "num_input_tokens_seen": 28502408, + "step": 49125 + }, + { + "epoch": 7.3175454274649985, + "grad_norm": 0.08410069346427917, + "learning_rate": 3.9984684706790915e-05, + "loss": 0.072, + "num_input_tokens_seen": 28505544, + "step": 49130 + }, + { + "epoch": 7.318290140005958, + "grad_norm": 107.79756927490234, + "learning_rate": 3.998208355127355e-05, + "loss": 0.1642, + "num_input_tokens_seen": 28508392, + "step": 49135 + }, + { + "epoch": 7.319034852546917, + "grad_norm": 60.00721740722656, + "learning_rate": 3.997948214264911e-05, + "loss": 0.2403, + "num_input_tokens_seen": 28511464, + "step": 49140 + }, + { + "epoch": 7.319779565087876, + "grad_norm": 83.94611358642578, + "learning_rate": 3.9976880480961556e-05, + "loss": 0.1141, + "num_input_tokens_seen": 28514472, + "step": 49145 + }, + { + "epoch": 7.320524277628835, + "grad_norm": 50.109535217285156, + "learning_rate": 3.997427856625482e-05, + "loss": 0.2718, + "num_input_tokens_seen": 28517576, + "step": 49150 + }, + { + "epoch": 7.321268990169794, + "grad_norm": 25.462154388427734, + "learning_rate": 3.997167639857287e-05, + "loss": 0.0101, + "num_input_tokens_seen": 28520328, + "step": 49155 + }, + { + "epoch": 7.322013702710754, + "grad_norm": 0.01811484433710575, + "learning_rate": 3.996907397795966e-05, + "loss": 0.0208, + "num_input_tokens_seen": 28523240, + "step": 49160 + }, + { + "epoch": 7.3227584152517124, + "grad_norm": 0.013056664727628231, + "learning_rate": 3.9966471304459154e-05, + "loss": 0.0003, + "num_input_tokens_seen": 28525992, + "step": 49165 + }, + { + "epoch": 7.323503127792672, + "grad_norm": 40.8286247253418, + "learning_rate": 3.996386837811533e-05, + "loss": 0.2147, + "num_input_tokens_seen": 28528712, + "step": 49170 + }, + { + "epoch": 7.324247840333631, + "grad_norm": 1.9131278991699219, + "learning_rate": 3.996126519897216e-05, + "loss": 0.1193, + "num_input_tokens_seen": 28531432, + "step": 49175 + }, + { + "epoch": 7.3249925528745905, + "grad_norm": 0.01961270533502102, + "learning_rate": 3.995866176707363e-05, + "loss": 0.0021, + "num_input_tokens_seen": 28534184, + "step": 49180 + }, + { + "epoch": 7.325737265415549, + "grad_norm": 15.56574535369873, + "learning_rate": 3.99560580824637e-05, + "loss": 0.5331, + "num_input_tokens_seen": 28537064, + "step": 49185 + }, + { + "epoch": 7.326481977956509, + "grad_norm": 0.31066858768463135, + "learning_rate": 3.995345414518638e-05, + "loss": 0.0222, + "num_input_tokens_seen": 28540008, + "step": 49190 + }, + { + "epoch": 7.327226690497468, + "grad_norm": 29.415882110595703, + "learning_rate": 3.995084995528563e-05, + "loss": 0.0076, + "num_input_tokens_seen": 28543368, + "step": 49195 + }, + { + "epoch": 7.327971403038427, + "grad_norm": 0.027097035199403763, + "learning_rate": 3.9948245512805484e-05, + "loss": 0.1319, + "num_input_tokens_seen": 28546344, + "step": 49200 + }, + { + "epoch": 7.328716115579386, + "grad_norm": 32.29693603515625, + "learning_rate": 3.994564081778992e-05, + "loss": 0.0865, + "num_input_tokens_seen": 28549160, + "step": 49205 + }, + { + "epoch": 7.329460828120346, + "grad_norm": 0.0072524226270616055, + "learning_rate": 3.994303587028294e-05, + "loss": 0.0376, + "num_input_tokens_seen": 28552136, + "step": 49210 + }, + { + "epoch": 7.3302055406613045, + "grad_norm": 0.09900101274251938, + "learning_rate": 3.9940430670328556e-05, + "loss": 0.1731, + "num_input_tokens_seen": 28555048, + "step": 49215 + }, + { + "epoch": 7.330950253202264, + "grad_norm": 0.0017878992948681116, + "learning_rate": 3.993782521797078e-05, + "loss": 0.191, + "num_input_tokens_seen": 28557992, + "step": 49220 + }, + { + "epoch": 7.331694965743223, + "grad_norm": 0.3914126753807068, + "learning_rate": 3.993521951325363e-05, + "loss": 0.3707, + "num_input_tokens_seen": 28561128, + "step": 49225 + }, + { + "epoch": 7.3324396782841825, + "grad_norm": 0.2631402313709259, + "learning_rate": 3.993261355622113e-05, + "loss": 0.0021, + "num_input_tokens_seen": 28563816, + "step": 49230 + }, + { + "epoch": 7.333184390825141, + "grad_norm": 0.008588496595621109, + "learning_rate": 3.99300073469173e-05, + "loss": 0.1558, + "num_input_tokens_seen": 28566664, + "step": 49235 + }, + { + "epoch": 7.333929103366101, + "grad_norm": 23.09966278076172, + "learning_rate": 3.9927400885386165e-05, + "loss": 0.3506, + "num_input_tokens_seen": 28569448, + "step": 49240 + }, + { + "epoch": 7.33467381590706, + "grad_norm": 138.759521484375, + "learning_rate": 3.992479417167177e-05, + "loss": 0.1381, + "num_input_tokens_seen": 28572360, + "step": 49245 + }, + { + "epoch": 7.335418528448019, + "grad_norm": 0.09801708906888962, + "learning_rate": 3.992218720581814e-05, + "loss": 0.0032, + "num_input_tokens_seen": 28575176, + "step": 49250 + }, + { + "epoch": 7.336163240988978, + "grad_norm": 55.1378173828125, + "learning_rate": 3.9919579987869324e-05, + "loss": 0.154, + "num_input_tokens_seen": 28578056, + "step": 49255 + }, + { + "epoch": 7.336907953529938, + "grad_norm": 0.08107782900333405, + "learning_rate": 3.991697251786938e-05, + "loss": 0.0301, + "num_input_tokens_seen": 28580808, + "step": 49260 + }, + { + "epoch": 7.3376526660708965, + "grad_norm": 46.51158905029297, + "learning_rate": 3.991436479586233e-05, + "loss": 0.1764, + "num_input_tokens_seen": 28583464, + "step": 49265 + }, + { + "epoch": 7.338397378611856, + "grad_norm": 0.025470132008194923, + "learning_rate": 3.9911756821892256e-05, + "loss": 0.2113, + "num_input_tokens_seen": 28586312, + "step": 49270 + }, + { + "epoch": 7.339142091152815, + "grad_norm": 0.09038600325584412, + "learning_rate": 3.99091485960032e-05, + "loss": 0.1487, + "num_input_tokens_seen": 28588936, + "step": 49275 + }, + { + "epoch": 7.3398868036937746, + "grad_norm": 13.447674751281738, + "learning_rate": 3.990654011823923e-05, + "loss": 0.0259, + "num_input_tokens_seen": 28591880, + "step": 49280 + }, + { + "epoch": 7.340631516234733, + "grad_norm": 14.281678199768066, + "learning_rate": 3.990393138864442e-05, + "loss": 0.4409, + "num_input_tokens_seen": 28594856, + "step": 49285 + }, + { + "epoch": 7.341376228775693, + "grad_norm": 0.143254354596138, + "learning_rate": 3.990132240726284e-05, + "loss": 0.1368, + "num_input_tokens_seen": 28597544, + "step": 49290 + }, + { + "epoch": 7.342120941316652, + "grad_norm": 0.03677089139819145, + "learning_rate": 3.989871317413855e-05, + "loss": 0.0177, + "num_input_tokens_seen": 28600296, + "step": 49295 + }, + { + "epoch": 7.342865653857611, + "grad_norm": 168.7242431640625, + "learning_rate": 3.989610368931566e-05, + "loss": 0.1882, + "num_input_tokens_seen": 28603208, + "step": 49300 + }, + { + "epoch": 7.34361036639857, + "grad_norm": 35.448699951171875, + "learning_rate": 3.9893493952838226e-05, + "loss": 0.3248, + "num_input_tokens_seen": 28606056, + "step": 49305 + }, + { + "epoch": 7.344355078939529, + "grad_norm": 56.16918182373047, + "learning_rate": 3.9890883964750355e-05, + "loss": 0.0964, + "num_input_tokens_seen": 28608872, + "step": 49310 + }, + { + "epoch": 7.3450997914804885, + "grad_norm": 0.2708792984485626, + "learning_rate": 3.9888273725096126e-05, + "loss": 0.0776, + "num_input_tokens_seen": 28611688, + "step": 49315 + }, + { + "epoch": 7.345844504021448, + "grad_norm": 0.8812747597694397, + "learning_rate": 3.988566323391965e-05, + "loss": 0.0017, + "num_input_tokens_seen": 28614920, + "step": 49320 + }, + { + "epoch": 7.346589216562407, + "grad_norm": 0.02818094938993454, + "learning_rate": 3.988305249126502e-05, + "loss": 0.161, + "num_input_tokens_seen": 28617768, + "step": 49325 + }, + { + "epoch": 7.347333929103366, + "grad_norm": 138.11773681640625, + "learning_rate": 3.988044149717635e-05, + "loss": 0.0361, + "num_input_tokens_seen": 28620712, + "step": 49330 + }, + { + "epoch": 7.348078641644325, + "grad_norm": 0.20463678240776062, + "learning_rate": 3.987783025169773e-05, + "loss": 0.001, + "num_input_tokens_seen": 28623368, + "step": 49335 + }, + { + "epoch": 7.348823354185284, + "grad_norm": 36.92169189453125, + "learning_rate": 3.987521875487331e-05, + "loss": 0.0898, + "num_input_tokens_seen": 28626376, + "step": 49340 + }, + { + "epoch": 7.349568066726244, + "grad_norm": 0.01684308610856533, + "learning_rate": 3.9872607006747174e-05, + "loss": 0.0033, + "num_input_tokens_seen": 28629448, + "step": 49345 + }, + { + "epoch": 7.3503127792672025, + "grad_norm": 80.29747772216797, + "learning_rate": 3.986999500736346e-05, + "loss": 0.1359, + "num_input_tokens_seen": 28632360, + "step": 49350 + }, + { + "epoch": 7.351057491808162, + "grad_norm": 0.19348672032356262, + "learning_rate": 3.98673827567663e-05, + "loss": 0.0005, + "num_input_tokens_seen": 28635400, + "step": 49355 + }, + { + "epoch": 7.351802204349121, + "grad_norm": 16.8228702545166, + "learning_rate": 3.9864770254999814e-05, + "loss": 0.1814, + "num_input_tokens_seen": 28638312, + "step": 49360 + }, + { + "epoch": 7.3525469168900806, + "grad_norm": 0.038835737854242325, + "learning_rate": 3.986215750210814e-05, + "loss": 0.5225, + "num_input_tokens_seen": 28641480, + "step": 49365 + }, + { + "epoch": 7.353291629431039, + "grad_norm": 0.9400147199630737, + "learning_rate": 3.985954449813543e-05, + "loss": 0.3565, + "num_input_tokens_seen": 28644424, + "step": 49370 + }, + { + "epoch": 7.354036341971999, + "grad_norm": 0.6838083267211914, + "learning_rate": 3.9856931243125804e-05, + "loss": 0.2723, + "num_input_tokens_seen": 28647272, + "step": 49375 + }, + { + "epoch": 7.354781054512958, + "grad_norm": 0.011533262208104134, + "learning_rate": 3.985431773712344e-05, + "loss": 0.1424, + "num_input_tokens_seen": 28650120, + "step": 49380 + }, + { + "epoch": 7.355525767053917, + "grad_norm": 0.04386027529835701, + "learning_rate": 3.9851703980172464e-05, + "loss": 0.0456, + "num_input_tokens_seen": 28652808, + "step": 49385 + }, + { + "epoch": 7.356270479594876, + "grad_norm": 0.13135580718517303, + "learning_rate": 3.984908997231704e-05, + "loss": 0.001, + "num_input_tokens_seen": 28656040, + "step": 49390 + }, + { + "epoch": 7.357015192135836, + "grad_norm": 18.448312759399414, + "learning_rate": 3.984647571360135e-05, + "loss": 0.0119, + "num_input_tokens_seen": 28658792, + "step": 49395 + }, + { + "epoch": 7.3577599046767945, + "grad_norm": 0.8258163928985596, + "learning_rate": 3.9843861204069536e-05, + "loss": 0.0097, + "num_input_tokens_seen": 28661640, + "step": 49400 + }, + { + "epoch": 7.358504617217754, + "grad_norm": 0.04122038930654526, + "learning_rate": 3.9841246443765765e-05, + "loss": 0.3997, + "num_input_tokens_seen": 28664488, + "step": 49405 + }, + { + "epoch": 7.359249329758713, + "grad_norm": 4.263121604919434, + "learning_rate": 3.983863143273422e-05, + "loss": 0.1743, + "num_input_tokens_seen": 28667368, + "step": 49410 + }, + { + "epoch": 7.359994042299673, + "grad_norm": 18.722801208496094, + "learning_rate": 3.983601617101909e-05, + "loss": 0.3168, + "num_input_tokens_seen": 28670600, + "step": 49415 + }, + { + "epoch": 7.360738754840631, + "grad_norm": 0.09243332594633102, + "learning_rate": 3.983340065866453e-05, + "loss": 0.1746, + "num_input_tokens_seen": 28673288, + "step": 49420 + }, + { + "epoch": 7.361483467381591, + "grad_norm": 0.0009972377447411418, + "learning_rate": 3.9830784895714744e-05, + "loss": 0.2143, + "num_input_tokens_seen": 28676072, + "step": 49425 + }, + { + "epoch": 7.36222817992255, + "grad_norm": 0.8623891472816467, + "learning_rate": 3.982816888221394e-05, + "loss": 0.0097, + "num_input_tokens_seen": 28678920, + "step": 49430 + }, + { + "epoch": 7.362972892463509, + "grad_norm": 0.006507906597107649, + "learning_rate": 3.9825552618206274e-05, + "loss": 0.0174, + "num_input_tokens_seen": 28681480, + "step": 49435 + }, + { + "epoch": 7.363717605004468, + "grad_norm": 0.16526657342910767, + "learning_rate": 3.982293610373597e-05, + "loss": 0.1506, + "num_input_tokens_seen": 28684360, + "step": 49440 + }, + { + "epoch": 7.364462317545428, + "grad_norm": 0.005956913344562054, + "learning_rate": 3.9820319338847224e-05, + "loss": 0.245, + "num_input_tokens_seen": 28687048, + "step": 49445 + }, + { + "epoch": 7.3652070300863866, + "grad_norm": 0.03507940098643303, + "learning_rate": 3.981770232358425e-05, + "loss": 0.0648, + "num_input_tokens_seen": 28689704, + "step": 49450 + }, + { + "epoch": 7.365951742627346, + "grad_norm": 0.0520702563226223, + "learning_rate": 3.9815085057991254e-05, + "loss": 0.0008, + "num_input_tokens_seen": 28692680, + "step": 49455 + }, + { + "epoch": 7.366696455168305, + "grad_norm": 47.07737731933594, + "learning_rate": 3.981246754211244e-05, + "loss": 0.3616, + "num_input_tokens_seen": 28695304, + "step": 49460 + }, + { + "epoch": 7.367441167709265, + "grad_norm": 0.8385913372039795, + "learning_rate": 3.980984977599206e-05, + "loss": 0.0327, + "num_input_tokens_seen": 28698088, + "step": 49465 + }, + { + "epoch": 7.368185880250223, + "grad_norm": 4.964663028717041, + "learning_rate": 3.980723175967431e-05, + "loss": 0.0606, + "num_input_tokens_seen": 28701032, + "step": 49470 + }, + { + "epoch": 7.368930592791183, + "grad_norm": 0.127742201089859, + "learning_rate": 3.980461349320344e-05, + "loss": 0.0466, + "num_input_tokens_seen": 28704136, + "step": 49475 + }, + { + "epoch": 7.369675305332142, + "grad_norm": 0.013123712502419949, + "learning_rate": 3.9801994976623655e-05, + "loss": 0.0619, + "num_input_tokens_seen": 28707240, + "step": 49480 + }, + { + "epoch": 7.370420017873101, + "grad_norm": 7.256356239318848, + "learning_rate": 3.979937620997922e-05, + "loss": 0.1781, + "num_input_tokens_seen": 28710376, + "step": 49485 + }, + { + "epoch": 7.37116473041406, + "grad_norm": 10.31050968170166, + "learning_rate": 3.979675719331437e-05, + "loss": 0.1867, + "num_input_tokens_seen": 28713128, + "step": 49490 + }, + { + "epoch": 7.371909442955019, + "grad_norm": 46.562828063964844, + "learning_rate": 3.9794137926673337e-05, + "loss": 0.4987, + "num_input_tokens_seen": 28716232, + "step": 49495 + }, + { + "epoch": 7.372654155495979, + "grad_norm": 0.553329586982727, + "learning_rate": 3.979151841010038e-05, + "loss": 0.3486, + "num_input_tokens_seen": 28719208, + "step": 49500 + }, + { + "epoch": 7.373398868036937, + "grad_norm": 0.010425165295600891, + "learning_rate": 3.978889864363975e-05, + "loss": 0.0965, + "num_input_tokens_seen": 28721960, + "step": 49505 + }, + { + "epoch": 7.374143580577897, + "grad_norm": 0.01301993615925312, + "learning_rate": 3.978627862733572e-05, + "loss": 0.1983, + "num_input_tokens_seen": 28724712, + "step": 49510 + }, + { + "epoch": 7.374888293118856, + "grad_norm": 0.19207213819026947, + "learning_rate": 3.978365836123254e-05, + "loss": 0.0663, + "num_input_tokens_seen": 28727528, + "step": 49515 + }, + { + "epoch": 7.375633005659815, + "grad_norm": 1.347874402999878, + "learning_rate": 3.978103784537447e-05, + "loss": 0.003, + "num_input_tokens_seen": 28730728, + "step": 49520 + }, + { + "epoch": 7.376377718200774, + "grad_norm": 6.619323253631592, + "learning_rate": 3.977841707980578e-05, + "loss": 0.2279, + "num_input_tokens_seen": 28733768, + "step": 49525 + }, + { + "epoch": 7.377122430741734, + "grad_norm": 32.45796203613281, + "learning_rate": 3.977579606457077e-05, + "loss": 0.3584, + "num_input_tokens_seen": 28736520, + "step": 49530 + }, + { + "epoch": 7.3778671432826926, + "grad_norm": 0.006315652746707201, + "learning_rate": 3.97731747997137e-05, + "loss": 0.0495, + "num_input_tokens_seen": 28739240, + "step": 49535 + }, + { + "epoch": 7.378611855823652, + "grad_norm": 0.08532994240522385, + "learning_rate": 3.9770553285278846e-05, + "loss": 0.0047, + "num_input_tokens_seen": 28741992, + "step": 49540 + }, + { + "epoch": 7.379356568364611, + "grad_norm": 0.3343640863895416, + "learning_rate": 3.9767931521310514e-05, + "loss": 0.2403, + "num_input_tokens_seen": 28744744, + "step": 49545 + }, + { + "epoch": 7.380101280905571, + "grad_norm": 0.17903146147727966, + "learning_rate": 3.976530950785299e-05, + "loss": 0.0103, + "num_input_tokens_seen": 28747432, + "step": 49550 + }, + { + "epoch": 7.380845993446529, + "grad_norm": 0.002928544767200947, + "learning_rate": 3.976268724495057e-05, + "loss": 0.3961, + "num_input_tokens_seen": 28750184, + "step": 49555 + }, + { + "epoch": 7.381590705987489, + "grad_norm": 23.14078140258789, + "learning_rate": 3.9760064732647545e-05, + "loss": 0.0987, + "num_input_tokens_seen": 28753032, + "step": 49560 + }, + { + "epoch": 7.382335418528448, + "grad_norm": 85.51737213134766, + "learning_rate": 3.975744197098823e-05, + "loss": 0.171, + "num_input_tokens_seen": 28755624, + "step": 49565 + }, + { + "epoch": 7.383080131069407, + "grad_norm": 0.021448591724038124, + "learning_rate": 3.9754818960016934e-05, + "loss": 0.0014, + "num_input_tokens_seen": 28758472, + "step": 49570 + }, + { + "epoch": 7.383824843610366, + "grad_norm": 38.340782165527344, + "learning_rate": 3.975219569977797e-05, + "loss": 0.157, + "num_input_tokens_seen": 28761704, + "step": 49575 + }, + { + "epoch": 7.384569556151326, + "grad_norm": 13.443389892578125, + "learning_rate": 3.974957219031565e-05, + "loss": 0.1893, + "num_input_tokens_seen": 28764616, + "step": 49580 + }, + { + "epoch": 7.385314268692285, + "grad_norm": 5.8205037117004395, + "learning_rate": 3.9746948431674304e-05, + "loss": 0.0074, + "num_input_tokens_seen": 28767976, + "step": 49585 + }, + { + "epoch": 7.386058981233244, + "grad_norm": 0.20701193809509277, + "learning_rate": 3.974432442389824e-05, + "loss": 0.1579, + "num_input_tokens_seen": 28771016, + "step": 49590 + }, + { + "epoch": 7.386803693774203, + "grad_norm": 8.129292488098145, + "learning_rate": 3.974170016703181e-05, + "loss": 0.1667, + "num_input_tokens_seen": 28774376, + "step": 49595 + }, + { + "epoch": 7.387548406315163, + "grad_norm": 14.772331237792969, + "learning_rate": 3.973907566111934e-05, + "loss": 0.1514, + "num_input_tokens_seen": 28777160, + "step": 49600 + }, + { + "epoch": 7.388293118856121, + "grad_norm": 0.013200144283473492, + "learning_rate": 3.9736450906205156e-05, + "loss": 0.0151, + "num_input_tokens_seen": 28779784, + "step": 49605 + }, + { + "epoch": 7.389037831397081, + "grad_norm": 0.010494556277990341, + "learning_rate": 3.973382590233362e-05, + "loss": 0.074, + "num_input_tokens_seen": 28782600, + "step": 49610 + }, + { + "epoch": 7.38978254393804, + "grad_norm": 0.4684585928916931, + "learning_rate": 3.973120064954907e-05, + "loss": 0.2014, + "num_input_tokens_seen": 28785416, + "step": 49615 + }, + { + "epoch": 7.390527256478999, + "grad_norm": 36.717525482177734, + "learning_rate": 3.972857514789586e-05, + "loss": 0.2398, + "num_input_tokens_seen": 28788264, + "step": 49620 + }, + { + "epoch": 7.391271969019958, + "grad_norm": 72.98719787597656, + "learning_rate": 3.972594939741834e-05, + "loss": 0.2668, + "num_input_tokens_seen": 28791208, + "step": 49625 + }, + { + "epoch": 7.392016681560918, + "grad_norm": 12.927631378173828, + "learning_rate": 3.9723323398160863e-05, + "loss": 0.039, + "num_input_tokens_seen": 28793992, + "step": 49630 + }, + { + "epoch": 7.392761394101877, + "grad_norm": 34.2084846496582, + "learning_rate": 3.972069715016782e-05, + "loss": 0.178, + "num_input_tokens_seen": 28796840, + "step": 49635 + }, + { + "epoch": 7.393506106642836, + "grad_norm": 2.465566396713257, + "learning_rate": 3.971807065348354e-05, + "loss": 0.1478, + "num_input_tokens_seen": 28799496, + "step": 49640 + }, + { + "epoch": 7.394250819183795, + "grad_norm": 0.006579895969480276, + "learning_rate": 3.9715443908152426e-05, + "loss": 0.1759, + "num_input_tokens_seen": 28802696, + "step": 49645 + }, + { + "epoch": 7.394995531724755, + "grad_norm": 0.06336487829685211, + "learning_rate": 3.971281691421884e-05, + "loss": 0.0095, + "num_input_tokens_seen": 28805736, + "step": 49650 + }, + { + "epoch": 7.395740244265713, + "grad_norm": 0.014602974988520145, + "learning_rate": 3.971018967172717e-05, + "loss": 0.351, + "num_input_tokens_seen": 28808488, + "step": 49655 + }, + { + "epoch": 7.396484956806672, + "grad_norm": 0.010300244204699993, + "learning_rate": 3.970756218072179e-05, + "loss": 0.1748, + "num_input_tokens_seen": 28811400, + "step": 49660 + }, + { + "epoch": 7.397229669347632, + "grad_norm": 0.1858830749988556, + "learning_rate": 3.97049344412471e-05, + "loss": 0.1698, + "num_input_tokens_seen": 28814376, + "step": 49665 + }, + { + "epoch": 7.3979743818885915, + "grad_norm": 0.017807453870773315, + "learning_rate": 3.970230645334748e-05, + "loss": 0.2431, + "num_input_tokens_seen": 28817480, + "step": 49670 + }, + { + "epoch": 7.39871909442955, + "grad_norm": 0.02100956067442894, + "learning_rate": 3.9699678217067346e-05, + "loss": 0.0035, + "num_input_tokens_seen": 28820136, + "step": 49675 + }, + { + "epoch": 7.399463806970509, + "grad_norm": 97.71916198730469, + "learning_rate": 3.9697049732451084e-05, + "loss": 0.2509, + "num_input_tokens_seen": 28822984, + "step": 49680 + }, + { + "epoch": 7.400208519511469, + "grad_norm": 1.1372175216674805, + "learning_rate": 3.9694420999543105e-05, + "loss": 0.1725, + "num_input_tokens_seen": 28825512, + "step": 49685 + }, + { + "epoch": 7.400953232052427, + "grad_norm": 38.83624267578125, + "learning_rate": 3.969179201838782e-05, + "loss": 0.3995, + "num_input_tokens_seen": 28828584, + "step": 49690 + }, + { + "epoch": 7.401697944593387, + "grad_norm": 3.3103649616241455, + "learning_rate": 3.968916278902963e-05, + "loss": 0.1596, + "num_input_tokens_seen": 28831464, + "step": 49695 + }, + { + "epoch": 7.402442657134346, + "grad_norm": 0.15844041109085083, + "learning_rate": 3.968653331151297e-05, + "loss": 0.0231, + "num_input_tokens_seen": 28834248, + "step": 49700 + }, + { + "epoch": 7.403187369675305, + "grad_norm": 1.2424862384796143, + "learning_rate": 3.9683903585882264e-05, + "loss": 0.0293, + "num_input_tokens_seen": 28837384, + "step": 49705 + }, + { + "epoch": 7.403932082216264, + "grad_norm": 0.0659579411149025, + "learning_rate": 3.9681273612181924e-05, + "loss": 0.2328, + "num_input_tokens_seen": 28840488, + "step": 49710 + }, + { + "epoch": 7.404676794757224, + "grad_norm": 0.42455464601516724, + "learning_rate": 3.967864339045639e-05, + "loss": 0.0042, + "num_input_tokens_seen": 28843368, + "step": 49715 + }, + { + "epoch": 7.405421507298183, + "grad_norm": 0.8795915246009827, + "learning_rate": 3.967601292075009e-05, + "loss": 0.3626, + "num_input_tokens_seen": 28846280, + "step": 49720 + }, + { + "epoch": 7.406166219839142, + "grad_norm": 1.3801652193069458, + "learning_rate": 3.967338220310748e-05, + "loss": 0.1879, + "num_input_tokens_seen": 28849384, + "step": 49725 + }, + { + "epoch": 7.406910932380101, + "grad_norm": 28.788150787353516, + "learning_rate": 3.967075123757298e-05, + "loss": 0.2625, + "num_input_tokens_seen": 28852360, + "step": 49730 + }, + { + "epoch": 7.407655644921061, + "grad_norm": 0.07225540280342102, + "learning_rate": 3.9668120024191046e-05, + "loss": 0.0468, + "num_input_tokens_seen": 28855336, + "step": 49735 + }, + { + "epoch": 7.408400357462019, + "grad_norm": 2.5631818771362305, + "learning_rate": 3.966548856300614e-05, + "loss": 0.0653, + "num_input_tokens_seen": 28858088, + "step": 49740 + }, + { + "epoch": 7.409145070002979, + "grad_norm": 0.061043497174978256, + "learning_rate": 3.9662856854062706e-05, + "loss": 0.1156, + "num_input_tokens_seen": 28861160, + "step": 49745 + }, + { + "epoch": 7.409889782543938, + "grad_norm": 0.06316567957401276, + "learning_rate": 3.9660224897405206e-05, + "loss": 0.3254, + "num_input_tokens_seen": 28864072, + "step": 49750 + }, + { + "epoch": 7.4106344950848975, + "grad_norm": 1.068486213684082, + "learning_rate": 3.965759269307812e-05, + "loss": 0.1674, + "num_input_tokens_seen": 28866984, + "step": 49755 + }, + { + "epoch": 7.411379207625856, + "grad_norm": 30.198312759399414, + "learning_rate": 3.965496024112589e-05, + "loss": 0.4921, + "num_input_tokens_seen": 28869768, + "step": 49760 + }, + { + "epoch": 7.412123920166816, + "grad_norm": 0.0049836826510727406, + "learning_rate": 3.9652327541593e-05, + "loss": 0.3446, + "num_input_tokens_seen": 28872584, + "step": 49765 + }, + { + "epoch": 7.412868632707775, + "grad_norm": 24.999284744262695, + "learning_rate": 3.964969459452393e-05, + "loss": 0.3211, + "num_input_tokens_seen": 28875848, + "step": 49770 + }, + { + "epoch": 7.413613345248734, + "grad_norm": 0.014673618599772453, + "learning_rate": 3.964706139996316e-05, + "loss": 0.2869, + "num_input_tokens_seen": 28878728, + "step": 49775 + }, + { + "epoch": 7.414358057789693, + "grad_norm": 38.558143615722656, + "learning_rate": 3.9644427957955174e-05, + "loss": 0.3112, + "num_input_tokens_seen": 28881544, + "step": 49780 + }, + { + "epoch": 7.415102770330653, + "grad_norm": 0.11468261480331421, + "learning_rate": 3.9641794268544465e-05, + "loss": 0.0228, + "num_input_tokens_seen": 28884456, + "step": 49785 + }, + { + "epoch": 7.415847482871611, + "grad_norm": 2.465358257293701, + "learning_rate": 3.963916033177552e-05, + "loss": 0.2317, + "num_input_tokens_seen": 28887304, + "step": 49790 + }, + { + "epoch": 7.416592195412571, + "grad_norm": 10.501544952392578, + "learning_rate": 3.963652614769284e-05, + "loss": 0.4409, + "num_input_tokens_seen": 28890024, + "step": 49795 + }, + { + "epoch": 7.41733690795353, + "grad_norm": 0.2605099678039551, + "learning_rate": 3.963389171634093e-05, + "loss": 0.0014, + "num_input_tokens_seen": 28893000, + "step": 49800 + }, + { + "epoch": 7.4180816204944895, + "grad_norm": 65.11907958984375, + "learning_rate": 3.963125703776429e-05, + "loss": 0.147, + "num_input_tokens_seen": 28895656, + "step": 49805 + }, + { + "epoch": 7.418826333035448, + "grad_norm": 0.13886407017707825, + "learning_rate": 3.962862211200744e-05, + "loss": 0.3199, + "num_input_tokens_seen": 28898792, + "step": 49810 + }, + { + "epoch": 7.419571045576408, + "grad_norm": 0.0655008926987648, + "learning_rate": 3.962598693911488e-05, + "loss": 0.1728, + "num_input_tokens_seen": 28901576, + "step": 49815 + }, + { + "epoch": 7.420315758117367, + "grad_norm": 0.22709709405899048, + "learning_rate": 3.962335151913113e-05, + "loss": 0.1929, + "num_input_tokens_seen": 28904520, + "step": 49820 + }, + { + "epoch": 7.421060470658326, + "grad_norm": 18.82444953918457, + "learning_rate": 3.962071585210072e-05, + "loss": 0.1506, + "num_input_tokens_seen": 28907496, + "step": 49825 + }, + { + "epoch": 7.421805183199285, + "grad_norm": 0.29763612151145935, + "learning_rate": 3.961807993806819e-05, + "loss": 0.0326, + "num_input_tokens_seen": 28910184, + "step": 49830 + }, + { + "epoch": 7.422549895740245, + "grad_norm": 2.7199227809906006, + "learning_rate": 3.9615443777078046e-05, + "loss": 0.2648, + "num_input_tokens_seen": 28913192, + "step": 49835 + }, + { + "epoch": 7.4232946082812035, + "grad_norm": 0.06629273295402527, + "learning_rate": 3.961280736917483e-05, + "loss": 0.0934, + "num_input_tokens_seen": 28916040, + "step": 49840 + }, + { + "epoch": 7.424039320822162, + "grad_norm": 0.07437688112258911, + "learning_rate": 3.961017071440309e-05, + "loss": 0.1668, + "num_input_tokens_seen": 28918760, + "step": 49845 + }, + { + "epoch": 7.424784033363122, + "grad_norm": 0.007907666265964508, + "learning_rate": 3.960753381280737e-05, + "loss": 0.0859, + "num_input_tokens_seen": 28921736, + "step": 49850 + }, + { + "epoch": 7.425528745904081, + "grad_norm": 0.002394221257418394, + "learning_rate": 3.96048966644322e-05, + "loss": 0.0893, + "num_input_tokens_seen": 28924616, + "step": 49855 + }, + { + "epoch": 7.42627345844504, + "grad_norm": 5.546396255493164, + "learning_rate": 3.9602259269322155e-05, + "loss": 0.4672, + "num_input_tokens_seen": 28927528, + "step": 49860 + }, + { + "epoch": 7.427018170985999, + "grad_norm": 0.0973086729645729, + "learning_rate": 3.9599621627521774e-05, + "loss": 0.1358, + "num_input_tokens_seen": 28930312, + "step": 49865 + }, + { + "epoch": 7.427762883526959, + "grad_norm": 0.7462515830993652, + "learning_rate": 3.959698373907563e-05, + "loss": 0.0031, + "num_input_tokens_seen": 28933320, + "step": 49870 + }, + { + "epoch": 7.428507596067917, + "grad_norm": 0.5689547061920166, + "learning_rate": 3.959434560402828e-05, + "loss": 0.2999, + "num_input_tokens_seen": 28936136, + "step": 49875 + }, + { + "epoch": 7.429252308608877, + "grad_norm": 0.10122242569923401, + "learning_rate": 3.9591707222424294e-05, + "loss": 0.1742, + "num_input_tokens_seen": 28939080, + "step": 49880 + }, + { + "epoch": 7.429997021149836, + "grad_norm": 2.829247236251831, + "learning_rate": 3.958906859430825e-05, + "loss": 0.2288, + "num_input_tokens_seen": 28941960, + "step": 49885 + }, + { + "epoch": 7.4307417336907955, + "grad_norm": 31.366641998291016, + "learning_rate": 3.958642971972471e-05, + "loss": 0.0829, + "num_input_tokens_seen": 28945032, + "step": 49890 + }, + { + "epoch": 7.431486446231754, + "grad_norm": 14.624231338500977, + "learning_rate": 3.958379059871827e-05, + "loss": 0.0183, + "num_input_tokens_seen": 28947688, + "step": 49895 + }, + { + "epoch": 7.432231158772714, + "grad_norm": 66.9095687866211, + "learning_rate": 3.9581151231333506e-05, + "loss": 0.1369, + "num_input_tokens_seen": 28950344, + "step": 49900 + }, + { + "epoch": 7.432975871313673, + "grad_norm": 0.0014383455272763968, + "learning_rate": 3.957851161761502e-05, + "loss": 0.1021, + "num_input_tokens_seen": 28953160, + "step": 49905 + }, + { + "epoch": 7.433720583854632, + "grad_norm": 0.010056931525468826, + "learning_rate": 3.9575871757607385e-05, + "loss": 0.1793, + "num_input_tokens_seen": 28956136, + "step": 49910 + }, + { + "epoch": 7.434465296395591, + "grad_norm": 0.020651699975132942, + "learning_rate": 3.9573231651355225e-05, + "loss": 0.1521, + "num_input_tokens_seen": 28958920, + "step": 49915 + }, + { + "epoch": 7.435210008936551, + "grad_norm": 71.32376861572266, + "learning_rate": 3.957059129890311e-05, + "loss": 0.2053, + "num_input_tokens_seen": 28962152, + "step": 49920 + }, + { + "epoch": 7.4359547214775095, + "grad_norm": 0.6021569967269897, + "learning_rate": 3.956795070029568e-05, + "loss": 0.2222, + "num_input_tokens_seen": 28965256, + "step": 49925 + }, + { + "epoch": 7.436699434018469, + "grad_norm": 12.98448371887207, + "learning_rate": 3.956530985557753e-05, + "loss": 0.3229, + "num_input_tokens_seen": 28968008, + "step": 49930 + }, + { + "epoch": 7.437444146559428, + "grad_norm": 2.060129404067993, + "learning_rate": 3.9562668764793264e-05, + "loss": 0.0043, + "num_input_tokens_seen": 28970952, + "step": 49935 + }, + { + "epoch": 7.4381888591003875, + "grad_norm": 53.492671966552734, + "learning_rate": 3.9560027427987515e-05, + "loss": 0.1947, + "num_input_tokens_seen": 28974056, + "step": 49940 + }, + { + "epoch": 7.438933571641346, + "grad_norm": 36.976016998291016, + "learning_rate": 3.9557385845204895e-05, + "loss": 0.102, + "num_input_tokens_seen": 28976840, + "step": 49945 + }, + { + "epoch": 7.439678284182306, + "grad_norm": 0.03726522624492645, + "learning_rate": 3.955474401649004e-05, + "loss": 0.0031, + "num_input_tokens_seen": 28979560, + "step": 49950 + }, + { + "epoch": 7.440422996723265, + "grad_norm": 0.2660480737686157, + "learning_rate": 3.955210194188758e-05, + "loss": 0.0028, + "num_input_tokens_seen": 28982536, + "step": 49955 + }, + { + "epoch": 7.441167709264224, + "grad_norm": 0.011332607828080654, + "learning_rate": 3.954945962144214e-05, + "loss": 0.0055, + "num_input_tokens_seen": 28985608, + "step": 49960 + }, + { + "epoch": 7.441912421805183, + "grad_norm": 0.010809678584337234, + "learning_rate": 3.9546817055198385e-05, + "loss": 0.114, + "num_input_tokens_seen": 28988360, + "step": 49965 + }, + { + "epoch": 7.442657134346143, + "grad_norm": 0.009003547951579094, + "learning_rate": 3.954417424320092e-05, + "loss": 0.1705, + "num_input_tokens_seen": 28991464, + "step": 49970 + }, + { + "epoch": 7.4434018468871015, + "grad_norm": 0.010545962490141392, + "learning_rate": 3.954153118549442e-05, + "loss": 0.0895, + "num_input_tokens_seen": 28994472, + "step": 49975 + }, + { + "epoch": 7.444146559428061, + "grad_norm": 1.0360116958618164, + "learning_rate": 3.953888788212353e-05, + "loss": 0.0155, + "num_input_tokens_seen": 28997512, + "step": 49980 + }, + { + "epoch": 7.44489127196902, + "grad_norm": 0.0047636693343520164, + "learning_rate": 3.953624433313291e-05, + "loss": 0.0431, + "num_input_tokens_seen": 29000168, + "step": 49985 + }, + { + "epoch": 7.4456359845099795, + "grad_norm": 0.12300056964159012, + "learning_rate": 3.9533600538567214e-05, + "loss": 0.1977, + "num_input_tokens_seen": 29003080, + "step": 49990 + }, + { + "epoch": 7.446380697050938, + "grad_norm": 7.447999477386475, + "learning_rate": 3.953095649847111e-05, + "loss": 0.3596, + "num_input_tokens_seen": 29006088, + "step": 49995 + }, + { + "epoch": 7.447125409591898, + "grad_norm": 0.08589323610067368, + "learning_rate": 3.952831221288926e-05, + "loss": 0.0338, + "num_input_tokens_seen": 29009096, + "step": 50000 + }, + { + "epoch": 7.447870122132857, + "grad_norm": 5.11366605758667, + "learning_rate": 3.9525667681866344e-05, + "loss": 0.1125, + "num_input_tokens_seen": 29012264, + "step": 50005 + }, + { + "epoch": 7.4486148346738155, + "grad_norm": 0.0034242719411849976, + "learning_rate": 3.952302290544704e-05, + "loss": 0.0048, + "num_input_tokens_seen": 29015720, + "step": 50010 + }, + { + "epoch": 7.449359547214775, + "grad_norm": 0.4009133279323578, + "learning_rate": 3.952037788367602e-05, + "loss": 0.1828, + "num_input_tokens_seen": 29018856, + "step": 50015 + }, + { + "epoch": 7.450104259755734, + "grad_norm": 0.08454746007919312, + "learning_rate": 3.951773261659797e-05, + "loss": 0.0003, + "num_input_tokens_seen": 29021544, + "step": 50020 + }, + { + "epoch": 7.4508489722966935, + "grad_norm": 2.8759572505950928, + "learning_rate": 3.951508710425758e-05, + "loss": 0.0017, + "num_input_tokens_seen": 29024584, + "step": 50025 + }, + { + "epoch": 7.451593684837652, + "grad_norm": 0.025959203019738197, + "learning_rate": 3.9512441346699554e-05, + "loss": 0.2739, + "num_input_tokens_seen": 29027656, + "step": 50030 + }, + { + "epoch": 7.452338397378612, + "grad_norm": 66.10919189453125, + "learning_rate": 3.950979534396858e-05, + "loss": 0.2406, + "num_input_tokens_seen": 29030568, + "step": 50035 + }, + { + "epoch": 7.453083109919571, + "grad_norm": 0.0025700917467474937, + "learning_rate": 3.9507149096109366e-05, + "loss": 0.145, + "num_input_tokens_seen": 29033288, + "step": 50040 + }, + { + "epoch": 7.45382782246053, + "grad_norm": 0.3655936121940613, + "learning_rate": 3.95045026031666e-05, + "loss": 0.222, + "num_input_tokens_seen": 29035976, + "step": 50045 + }, + { + "epoch": 7.454572535001489, + "grad_norm": 0.15295837819576263, + "learning_rate": 3.950185586518501e-05, + "loss": 0.3578, + "num_input_tokens_seen": 29039080, + "step": 50050 + }, + { + "epoch": 7.455317247542449, + "grad_norm": 0.14728599786758423, + "learning_rate": 3.94992088822093e-05, + "loss": 0.2711, + "num_input_tokens_seen": 29042024, + "step": 50055 + }, + { + "epoch": 7.4560619600834075, + "grad_norm": 0.03489389270544052, + "learning_rate": 3.94965616542842e-05, + "loss": 0.1356, + "num_input_tokens_seen": 29044616, + "step": 50060 + }, + { + "epoch": 7.456806672624367, + "grad_norm": 93.4433822631836, + "learning_rate": 3.949391418145442e-05, + "loss": 0.077, + "num_input_tokens_seen": 29047432, + "step": 50065 + }, + { + "epoch": 7.457551385165326, + "grad_norm": 0.11321199685335159, + "learning_rate": 3.9491266463764694e-05, + "loss": 0.382, + "num_input_tokens_seen": 29050248, + "step": 50070 + }, + { + "epoch": 7.4582960977062855, + "grad_norm": 79.99384307861328, + "learning_rate": 3.948861850125974e-05, + "loss": 0.3058, + "num_input_tokens_seen": 29053096, + "step": 50075 + }, + { + "epoch": 7.459040810247244, + "grad_norm": 0.07594100385904312, + "learning_rate": 3.948597029398432e-05, + "loss": 0.0027, + "num_input_tokens_seen": 29056264, + "step": 50080 + }, + { + "epoch": 7.459785522788204, + "grad_norm": 0.02393299527466297, + "learning_rate": 3.9483321841983146e-05, + "loss": 0.0932, + "num_input_tokens_seen": 29058888, + "step": 50085 + }, + { + "epoch": 7.460530235329163, + "grad_norm": 28.91753578186035, + "learning_rate": 3.948067314530096e-05, + "loss": 0.55, + "num_input_tokens_seen": 29061704, + "step": 50090 + }, + { + "epoch": 7.461274947870122, + "grad_norm": 6.317748069763184, + "learning_rate": 3.947802420398253e-05, + "loss": 0.0045, + "num_input_tokens_seen": 29064584, + "step": 50095 + }, + { + "epoch": 7.462019660411081, + "grad_norm": 21.9703426361084, + "learning_rate": 3.947537501807259e-05, + "loss": 0.1864, + "num_input_tokens_seen": 29067432, + "step": 50100 + }, + { + "epoch": 7.462764372952041, + "grad_norm": 0.1275806427001953, + "learning_rate": 3.947272558761591e-05, + "loss": 0.2327, + "num_input_tokens_seen": 29070344, + "step": 50105 + }, + { + "epoch": 7.4635090854929995, + "grad_norm": 0.5969505310058594, + "learning_rate": 3.947007591265723e-05, + "loss": 0.002, + "num_input_tokens_seen": 29073160, + "step": 50110 + }, + { + "epoch": 7.464253798033959, + "grad_norm": 62.73887634277344, + "learning_rate": 3.9467425993241326e-05, + "loss": 0.0823, + "num_input_tokens_seen": 29076104, + "step": 50115 + }, + { + "epoch": 7.464998510574918, + "grad_norm": 3.877609968185425, + "learning_rate": 3.946477582941297e-05, + "loss": 0.135, + "num_input_tokens_seen": 29079080, + "step": 50120 + }, + { + "epoch": 7.465743223115878, + "grad_norm": 0.038592271506786346, + "learning_rate": 3.946212542121692e-05, + "loss": 0.0784, + "num_input_tokens_seen": 29081768, + "step": 50125 + }, + { + "epoch": 7.466487935656836, + "grad_norm": 74.10360717773438, + "learning_rate": 3.945947476869797e-05, + "loss": 0.2023, + "num_input_tokens_seen": 29084840, + "step": 50130 + }, + { + "epoch": 7.467232648197796, + "grad_norm": 0.07862886041402817, + "learning_rate": 3.945682387190088e-05, + "loss": 0.1649, + "num_input_tokens_seen": 29087464, + "step": 50135 + }, + { + "epoch": 7.467977360738755, + "grad_norm": 57.496185302734375, + "learning_rate": 3.9454172730870445e-05, + "loss": 0.387, + "num_input_tokens_seen": 29090312, + "step": 50140 + }, + { + "epoch": 7.468722073279714, + "grad_norm": 54.70787048339844, + "learning_rate": 3.9451521345651456e-05, + "loss": 0.5134, + "num_input_tokens_seen": 29093448, + "step": 50145 + }, + { + "epoch": 7.469466785820673, + "grad_norm": 0.04551856592297554, + "learning_rate": 3.94488697162887e-05, + "loss": 0.0029, + "num_input_tokens_seen": 29096840, + "step": 50150 + }, + { + "epoch": 7.470211498361633, + "grad_norm": 0.04535811021924019, + "learning_rate": 3.944621784282697e-05, + "loss": 0.3178, + "num_input_tokens_seen": 29099720, + "step": 50155 + }, + { + "epoch": 7.4709562109025915, + "grad_norm": 0.24129609763622284, + "learning_rate": 3.944356572531108e-05, + "loss": 0.0472, + "num_input_tokens_seen": 29103016, + "step": 50160 + }, + { + "epoch": 7.471700923443551, + "grad_norm": 11.192296981811523, + "learning_rate": 3.944091336378583e-05, + "loss": 0.438, + "num_input_tokens_seen": 29105768, + "step": 50165 + }, + { + "epoch": 7.47244563598451, + "grad_norm": 29.198890686035156, + "learning_rate": 3.943826075829602e-05, + "loss": 0.0988, + "num_input_tokens_seen": 29108808, + "step": 50170 + }, + { + "epoch": 7.473190348525469, + "grad_norm": 0.09039201587438583, + "learning_rate": 3.943560790888647e-05, + "loss": 0.1522, + "num_input_tokens_seen": 29111336, + "step": 50175 + }, + { + "epoch": 7.473935061066428, + "grad_norm": 0.16218574345111847, + "learning_rate": 3.9432954815601995e-05, + "loss": 0.0588, + "num_input_tokens_seen": 29114216, + "step": 50180 + }, + { + "epoch": 7.474679773607388, + "grad_norm": 0.046027638018131256, + "learning_rate": 3.943030147848742e-05, + "loss": 0.1612, + "num_input_tokens_seen": 29117032, + "step": 50185 + }, + { + "epoch": 7.475424486148347, + "grad_norm": 0.014153323136270046, + "learning_rate": 3.9427647897587564e-05, + "loss": 0.2754, + "num_input_tokens_seen": 29120136, + "step": 50190 + }, + { + "epoch": 7.4761691986893055, + "grad_norm": 0.04747236147522926, + "learning_rate": 3.9424994072947256e-05, + "loss": 0.0004, + "num_input_tokens_seen": 29123016, + "step": 50195 + }, + { + "epoch": 7.476913911230265, + "grad_norm": 44.174678802490234, + "learning_rate": 3.942234000461135e-05, + "loss": 0.171, + "num_input_tokens_seen": 29126024, + "step": 50200 + }, + { + "epoch": 7.477658623771224, + "grad_norm": 159.42945861816406, + "learning_rate": 3.941968569262465e-05, + "loss": 0.7058, + "num_input_tokens_seen": 29129000, + "step": 50205 + }, + { + "epoch": 7.478403336312184, + "grad_norm": 0.42350852489471436, + "learning_rate": 3.9417031137032025e-05, + "loss": 0.2072, + "num_input_tokens_seen": 29132104, + "step": 50210 + }, + { + "epoch": 7.479148048853142, + "grad_norm": 8.915367126464844, + "learning_rate": 3.941437633787831e-05, + "loss": 0.2005, + "num_input_tokens_seen": 29135368, + "step": 50215 + }, + { + "epoch": 7.479892761394102, + "grad_norm": 0.08894331008195877, + "learning_rate": 3.941172129520836e-05, + "loss": 0.2599, + "num_input_tokens_seen": 29138088, + "step": 50220 + }, + { + "epoch": 7.480637473935061, + "grad_norm": 33.3951301574707, + "learning_rate": 3.940906600906702e-05, + "loss": 0.1305, + "num_input_tokens_seen": 29141000, + "step": 50225 + }, + { + "epoch": 7.48138218647602, + "grad_norm": 0.019683942198753357, + "learning_rate": 3.9406410479499155e-05, + "loss": 0.3064, + "num_input_tokens_seen": 29144104, + "step": 50230 + }, + { + "epoch": 7.482126899016979, + "grad_norm": 97.39601135253906, + "learning_rate": 3.940375470654963e-05, + "loss": 0.302, + "num_input_tokens_seen": 29146792, + "step": 50235 + }, + { + "epoch": 7.482871611557939, + "grad_norm": 33.28935241699219, + "learning_rate": 3.9401098690263316e-05, + "loss": 0.1505, + "num_input_tokens_seen": 29149736, + "step": 50240 + }, + { + "epoch": 7.4836163240988975, + "grad_norm": 13.377071380615234, + "learning_rate": 3.939844243068507e-05, + "loss": 0.168, + "num_input_tokens_seen": 29152584, + "step": 50245 + }, + { + "epoch": 7.484361036639857, + "grad_norm": 7.240306377410889, + "learning_rate": 3.939578592785977e-05, + "loss": 0.3265, + "num_input_tokens_seen": 29155656, + "step": 50250 + }, + { + "epoch": 7.485105749180816, + "grad_norm": 35.174232482910156, + "learning_rate": 3.93931291818323e-05, + "loss": 0.0697, + "num_input_tokens_seen": 29158472, + "step": 50255 + }, + { + "epoch": 7.485850461721776, + "grad_norm": 0.08072682470083237, + "learning_rate": 3.939047219264754e-05, + "loss": 0.0048, + "num_input_tokens_seen": 29161256, + "step": 50260 + }, + { + "epoch": 7.486595174262734, + "grad_norm": 0.37546610832214355, + "learning_rate": 3.938781496035038e-05, + "loss": 0.1926, + "num_input_tokens_seen": 29164104, + "step": 50265 + }, + { + "epoch": 7.487339886803694, + "grad_norm": 56.793067932128906, + "learning_rate": 3.93851574849857e-05, + "loss": 0.3126, + "num_input_tokens_seen": 29166824, + "step": 50270 + }, + { + "epoch": 7.488084599344653, + "grad_norm": 55.96242904663086, + "learning_rate": 3.9382499766598416e-05, + "loss": 0.1772, + "num_input_tokens_seen": 29169864, + "step": 50275 + }, + { + "epoch": 7.488829311885612, + "grad_norm": 0.10123957693576813, + "learning_rate": 3.937984180523342e-05, + "loss": 0.5044, + "num_input_tokens_seen": 29172712, + "step": 50280 + }, + { + "epoch": 7.489574024426571, + "grad_norm": 0.16949748992919922, + "learning_rate": 3.9377183600935595e-05, + "loss": 0.1345, + "num_input_tokens_seen": 29175592, + "step": 50285 + }, + { + "epoch": 7.490318736967531, + "grad_norm": 7.083148002624512, + "learning_rate": 3.937452515374987e-05, + "loss": 0.6264, + "num_input_tokens_seen": 29178440, + "step": 50290 + }, + { + "epoch": 7.49106344950849, + "grad_norm": 0.10241803526878357, + "learning_rate": 3.9371866463721165e-05, + "loss": 0.1299, + "num_input_tokens_seen": 29181352, + "step": 50295 + }, + { + "epoch": 7.491808162049449, + "grad_norm": 28.23838233947754, + "learning_rate": 3.9369207530894374e-05, + "loss": 0.1204, + "num_input_tokens_seen": 29183944, + "step": 50300 + }, + { + "epoch": 7.492552874590408, + "grad_norm": 42.167724609375, + "learning_rate": 3.9366548355314426e-05, + "loss": 0.2947, + "num_input_tokens_seen": 29186696, + "step": 50305 + }, + { + "epoch": 7.493297587131368, + "grad_norm": 1.245068073272705, + "learning_rate": 3.936388893702625e-05, + "loss": 0.116, + "num_input_tokens_seen": 29189480, + "step": 50310 + }, + { + "epoch": 7.494042299672326, + "grad_norm": 1.2975422143936157, + "learning_rate": 3.936122927607476e-05, + "loss": 0.1697, + "num_input_tokens_seen": 29192520, + "step": 50315 + }, + { + "epoch": 7.494787012213286, + "grad_norm": 4.078038692474365, + "learning_rate": 3.935856937250491e-05, + "loss": 0.1496, + "num_input_tokens_seen": 29195304, + "step": 50320 + }, + { + "epoch": 7.495531724754245, + "grad_norm": 0.046990152448415756, + "learning_rate": 3.935590922636161e-05, + "loss": 0.2288, + "num_input_tokens_seen": 29198184, + "step": 50325 + }, + { + "epoch": 7.496276437295204, + "grad_norm": 6.539234161376953, + "learning_rate": 3.935324883768983e-05, + "loss": 0.0451, + "num_input_tokens_seen": 29201096, + "step": 50330 + }, + { + "epoch": 7.497021149836163, + "grad_norm": 1.1752278804779053, + "learning_rate": 3.9350588206534486e-05, + "loss": 0.326, + "num_input_tokens_seen": 29204072, + "step": 50335 + }, + { + "epoch": 7.497765862377123, + "grad_norm": 3.6691389083862305, + "learning_rate": 3.934792733294054e-05, + "loss": 0.0648, + "num_input_tokens_seen": 29206952, + "step": 50340 + }, + { + "epoch": 7.498510574918082, + "grad_norm": 0.31030553579330444, + "learning_rate": 3.9345266216952945e-05, + "loss": 0.2776, + "num_input_tokens_seen": 29209800, + "step": 50345 + }, + { + "epoch": 7.499255287459041, + "grad_norm": 0.002478604204952717, + "learning_rate": 3.934260485861667e-05, + "loss": 0.0294, + "num_input_tokens_seen": 29212680, + "step": 50350 + }, + { + "epoch": 7.5, + "grad_norm": 0.3237518072128296, + "learning_rate": 3.933994325797665e-05, + "loss": 0.1607, + "num_input_tokens_seen": 29215400, + "step": 50355 + }, + { + "epoch": 7.500744712540959, + "grad_norm": 14.006380081176758, + "learning_rate": 3.9337281415077866e-05, + "loss": 0.1918, + "num_input_tokens_seen": 29218504, + "step": 50360 + }, + { + "epoch": 7.501489425081918, + "grad_norm": 1.8671404123306274, + "learning_rate": 3.933461932996528e-05, + "loss": 0.3069, + "num_input_tokens_seen": 29221896, + "step": 50365 + }, + { + "epoch": 7.502234137622878, + "grad_norm": 55.789955139160156, + "learning_rate": 3.933195700268388e-05, + "loss": 0.3709, + "num_input_tokens_seen": 29224520, + "step": 50370 + }, + { + "epoch": 7.502978850163837, + "grad_norm": 6.10519552230835, + "learning_rate": 3.932929443327862e-05, + "loss": 0.0917, + "num_input_tokens_seen": 29227528, + "step": 50375 + }, + { + "epoch": 7.503723562704796, + "grad_norm": 15.804722785949707, + "learning_rate": 3.932663162179451e-05, + "loss": 0.0958, + "num_input_tokens_seen": 29230632, + "step": 50380 + }, + { + "epoch": 7.504468275245755, + "grad_norm": 1.4600598812103271, + "learning_rate": 3.93239685682765e-05, + "loss": 0.24, + "num_input_tokens_seen": 29233320, + "step": 50385 + }, + { + "epoch": 7.505212987786714, + "grad_norm": 2.2937910556793213, + "learning_rate": 3.932130527276961e-05, + "loss": 0.0968, + "num_input_tokens_seen": 29236200, + "step": 50390 + }, + { + "epoch": 7.505957700327674, + "grad_norm": 0.14327271282672882, + "learning_rate": 3.931864173531883e-05, + "loss": 0.0305, + "num_input_tokens_seen": 29239112, + "step": 50395 + }, + { + "epoch": 7.506702412868632, + "grad_norm": 26.214345932006836, + "learning_rate": 3.931597795596914e-05, + "loss": 0.6585, + "num_input_tokens_seen": 29242248, + "step": 50400 + }, + { + "epoch": 7.507447125409592, + "grad_norm": 0.06018586456775665, + "learning_rate": 3.931331393476556e-05, + "loss": 0.0062, + "num_input_tokens_seen": 29244904, + "step": 50405 + }, + { + "epoch": 7.508191837950551, + "grad_norm": 15.250052452087402, + "learning_rate": 3.931064967175309e-05, + "loss": 0.1291, + "num_input_tokens_seen": 29247752, + "step": 50410 + }, + { + "epoch": 7.50893655049151, + "grad_norm": 0.17354966700077057, + "learning_rate": 3.9307985166976726e-05, + "loss": 0.2104, + "num_input_tokens_seen": 29250888, + "step": 50415 + }, + { + "epoch": 7.509681263032469, + "grad_norm": 18.09596824645996, + "learning_rate": 3.93053204204815e-05, + "loss": 0.3006, + "num_input_tokens_seen": 29253832, + "step": 50420 + }, + { + "epoch": 7.510425975573429, + "grad_norm": 0.015602074563503265, + "learning_rate": 3.930265543231243e-05, + "loss": 0.2029, + "num_input_tokens_seen": 29256648, + "step": 50425 + }, + { + "epoch": 7.511170688114388, + "grad_norm": 13.29266357421875, + "learning_rate": 3.9299990202514525e-05, + "loss": 0.2684, + "num_input_tokens_seen": 29259400, + "step": 50430 + }, + { + "epoch": 7.511915400655347, + "grad_norm": 17.47466278076172, + "learning_rate": 3.9297324731132826e-05, + "loss": 0.0254, + "num_input_tokens_seen": 29262376, + "step": 50435 + }, + { + "epoch": 7.512660113196306, + "grad_norm": 0.09315735846757889, + "learning_rate": 3.9294659018212356e-05, + "loss": 0.0843, + "num_input_tokens_seen": 29265160, + "step": 50440 + }, + { + "epoch": 7.513404825737266, + "grad_norm": 0.11321740597486496, + "learning_rate": 3.929199306379815e-05, + "loss": 0.011, + "num_input_tokens_seen": 29267880, + "step": 50445 + }, + { + "epoch": 7.514149538278224, + "grad_norm": 5.913218021392822, + "learning_rate": 3.928932686793524e-05, + "loss": 0.0159, + "num_input_tokens_seen": 29270600, + "step": 50450 + }, + { + "epoch": 7.514894250819184, + "grad_norm": 9.54274845123291, + "learning_rate": 3.9286660430668686e-05, + "loss": 0.0691, + "num_input_tokens_seen": 29273480, + "step": 50455 + }, + { + "epoch": 7.515638963360143, + "grad_norm": 0.008228876627981663, + "learning_rate": 3.928399375204352e-05, + "loss": 0.0557, + "num_input_tokens_seen": 29276040, + "step": 50460 + }, + { + "epoch": 7.5163836759011025, + "grad_norm": 50.20952606201172, + "learning_rate": 3.9281326832104795e-05, + "loss": 0.3228, + "num_input_tokens_seen": 29278472, + "step": 50465 + }, + { + "epoch": 7.517128388442061, + "grad_norm": 0.021346315741539, + "learning_rate": 3.9278659670897564e-05, + "loss": 0.1329, + "num_input_tokens_seen": 29281320, + "step": 50470 + }, + { + "epoch": 7.517873100983021, + "grad_norm": 38.48539352416992, + "learning_rate": 3.92759922684669e-05, + "loss": 0.3847, + "num_input_tokens_seen": 29284296, + "step": 50475 + }, + { + "epoch": 7.51861781352398, + "grad_norm": 38.71612548828125, + "learning_rate": 3.927332462485785e-05, + "loss": 0.1062, + "num_input_tokens_seen": 29287048, + "step": 50480 + }, + { + "epoch": 7.519362526064939, + "grad_norm": 0.02322794310748577, + "learning_rate": 3.92706567401155e-05, + "loss": 0.2274, + "num_input_tokens_seen": 29289992, + "step": 50485 + }, + { + "epoch": 7.520107238605898, + "grad_norm": 0.030586568638682365, + "learning_rate": 3.9267988614284886e-05, + "loss": 0.0214, + "num_input_tokens_seen": 29292840, + "step": 50490 + }, + { + "epoch": 7.520851951146858, + "grad_norm": 0.11387919634580612, + "learning_rate": 3.926532024741113e-05, + "loss": 0.4325, + "num_input_tokens_seen": 29296136, + "step": 50495 + }, + { + "epoch": 7.521596663687816, + "grad_norm": 0.027308005839586258, + "learning_rate": 3.926265163953927e-05, + "loss": 0.0842, + "num_input_tokens_seen": 29299368, + "step": 50500 + }, + { + "epoch": 7.522341376228776, + "grad_norm": 0.6762474179267883, + "learning_rate": 3.925998279071441e-05, + "loss": 0.1456, + "num_input_tokens_seen": 29302088, + "step": 50505 + }, + { + "epoch": 7.523086088769735, + "grad_norm": 38.9619140625, + "learning_rate": 3.9257313700981634e-05, + "loss": 0.2393, + "num_input_tokens_seen": 29305096, + "step": 50510 + }, + { + "epoch": 7.5238308013106945, + "grad_norm": 16.731977462768555, + "learning_rate": 3.9254644370386036e-05, + "loss": 0.2503, + "num_input_tokens_seen": 29308072, + "step": 50515 + }, + { + "epoch": 7.524575513851653, + "grad_norm": 42.14255142211914, + "learning_rate": 3.925197479897271e-05, + "loss": 0.1147, + "num_input_tokens_seen": 29310952, + "step": 50520 + }, + { + "epoch": 7.525320226392612, + "grad_norm": 0.014405792579054832, + "learning_rate": 3.924930498678675e-05, + "loss": 0.0031, + "num_input_tokens_seen": 29313832, + "step": 50525 + }, + { + "epoch": 7.526064938933572, + "grad_norm": 0.7782633900642395, + "learning_rate": 3.924663493387326e-05, + "loss": 0.0301, + "num_input_tokens_seen": 29316488, + "step": 50530 + }, + { + "epoch": 7.526809651474531, + "grad_norm": 0.00646628113463521, + "learning_rate": 3.924396464027736e-05, + "loss": 0.266, + "num_input_tokens_seen": 29319272, + "step": 50535 + }, + { + "epoch": 7.52755436401549, + "grad_norm": 0.3503442704677582, + "learning_rate": 3.924129410604416e-05, + "loss": 0.1914, + "num_input_tokens_seen": 29322184, + "step": 50540 + }, + { + "epoch": 7.528299076556449, + "grad_norm": 37.83698654174805, + "learning_rate": 3.923862333121876e-05, + "loss": 0.472, + "num_input_tokens_seen": 29325032, + "step": 50545 + }, + { + "epoch": 7.5290437890974085, + "grad_norm": 36.11445999145508, + "learning_rate": 3.92359523158463e-05, + "loss": 0.3718, + "num_input_tokens_seen": 29327880, + "step": 50550 + }, + { + "epoch": 7.529788501638367, + "grad_norm": 93.91636657714844, + "learning_rate": 3.923328105997188e-05, + "loss": 0.3973, + "num_input_tokens_seen": 29330568, + "step": 50555 + }, + { + "epoch": 7.530533214179327, + "grad_norm": 0.06291621178388596, + "learning_rate": 3.923060956364066e-05, + "loss": 0.2939, + "num_input_tokens_seen": 29333736, + "step": 50560 + }, + { + "epoch": 7.531277926720286, + "grad_norm": 0.03841327130794525, + "learning_rate": 3.922793782689774e-05, + "loss": 0.0016, + "num_input_tokens_seen": 29336712, + "step": 50565 + }, + { + "epoch": 7.532022639261245, + "grad_norm": 18.092851638793945, + "learning_rate": 3.922526584978829e-05, + "loss": 0.166, + "num_input_tokens_seen": 29339528, + "step": 50570 + }, + { + "epoch": 7.532767351802204, + "grad_norm": 0.36381369829177856, + "learning_rate": 3.922259363235741e-05, + "loss": 0.0633, + "num_input_tokens_seen": 29342920, + "step": 50575 + }, + { + "epoch": 7.533512064343164, + "grad_norm": 0.09808424860239029, + "learning_rate": 3.921992117465028e-05, + "loss": 0.0319, + "num_input_tokens_seen": 29345832, + "step": 50580 + }, + { + "epoch": 7.534256776884122, + "grad_norm": 0.011024316772818565, + "learning_rate": 3.921724847671202e-05, + "loss": 0.0596, + "num_input_tokens_seen": 29348712, + "step": 50585 + }, + { + "epoch": 7.535001489425082, + "grad_norm": 0.16155986487865448, + "learning_rate": 3.9214575538587804e-05, + "loss": 0.2238, + "num_input_tokens_seen": 29351720, + "step": 50590 + }, + { + "epoch": 7.535746201966041, + "grad_norm": 22.426733016967773, + "learning_rate": 3.921190236032278e-05, + "loss": 0.1248, + "num_input_tokens_seen": 29354504, + "step": 50595 + }, + { + "epoch": 7.5364909145070005, + "grad_norm": 0.0709160566329956, + "learning_rate": 3.920922894196212e-05, + "loss": 0.0007, + "num_input_tokens_seen": 29357736, + "step": 50600 + }, + { + "epoch": 7.537235627047959, + "grad_norm": 42.8265266418457, + "learning_rate": 3.920655528355097e-05, + "loss": 0.0558, + "num_input_tokens_seen": 29360808, + "step": 50605 + }, + { + "epoch": 7.537980339588919, + "grad_norm": 15.871869087219238, + "learning_rate": 3.920388138513451e-05, + "loss": 0.3623, + "num_input_tokens_seen": 29363976, + "step": 50610 + }, + { + "epoch": 7.538725052129878, + "grad_norm": 8.937009811401367, + "learning_rate": 3.920120724675791e-05, + "loss": 0.3938, + "num_input_tokens_seen": 29366728, + "step": 50615 + }, + { + "epoch": 7.539469764670837, + "grad_norm": 0.050562430173158646, + "learning_rate": 3.9198532868466345e-05, + "loss": 0.1745, + "num_input_tokens_seen": 29369544, + "step": 50620 + }, + { + "epoch": 7.540214477211796, + "grad_norm": 1.7103267908096313, + "learning_rate": 3.919585825030499e-05, + "loss": 0.0115, + "num_input_tokens_seen": 29372552, + "step": 50625 + }, + { + "epoch": 7.540959189752756, + "grad_norm": 16.08682632446289, + "learning_rate": 3.9193183392319054e-05, + "loss": 0.1281, + "num_input_tokens_seen": 29375784, + "step": 50630 + }, + { + "epoch": 7.5417039022937145, + "grad_norm": 0.10639046132564545, + "learning_rate": 3.9190508294553694e-05, + "loss": 0.179, + "num_input_tokens_seen": 29378792, + "step": 50635 + }, + { + "epoch": 7.542448614834674, + "grad_norm": 1.3780235052108765, + "learning_rate": 3.918783295705414e-05, + "loss": 0.2913, + "num_input_tokens_seen": 29381608, + "step": 50640 + }, + { + "epoch": 7.543193327375633, + "grad_norm": 87.82337188720703, + "learning_rate": 3.9185157379865553e-05, + "loss": 0.0755, + "num_input_tokens_seen": 29384904, + "step": 50645 + }, + { + "epoch": 7.5439380399165925, + "grad_norm": 18.004383087158203, + "learning_rate": 3.9182481563033155e-05, + "loss": 0.2385, + "num_input_tokens_seen": 29387720, + "step": 50650 + }, + { + "epoch": 7.544682752457551, + "grad_norm": 0.6128045916557312, + "learning_rate": 3.917980550660214e-05, + "loss": 0.0266, + "num_input_tokens_seen": 29390632, + "step": 50655 + }, + { + "epoch": 7.545427464998511, + "grad_norm": 115.42841339111328, + "learning_rate": 3.9177129210617725e-05, + "loss": 0.1868, + "num_input_tokens_seen": 29393672, + "step": 50660 + }, + { + "epoch": 7.54617217753947, + "grad_norm": 58.01664733886719, + "learning_rate": 3.9174452675125115e-05, + "loss": 0.1514, + "num_input_tokens_seen": 29396424, + "step": 50665 + }, + { + "epoch": 7.546916890080429, + "grad_norm": 112.03016662597656, + "learning_rate": 3.917177590016954e-05, + "loss": 0.2768, + "num_input_tokens_seen": 29399272, + "step": 50670 + }, + { + "epoch": 7.547661602621388, + "grad_norm": 35.27037048339844, + "learning_rate": 3.9169098885796216e-05, + "loss": 0.5038, + "num_input_tokens_seen": 29402472, + "step": 50675 + }, + { + "epoch": 7.548406315162348, + "grad_norm": 0.20330514013767242, + "learning_rate": 3.916642163205036e-05, + "loss": 0.3401, + "num_input_tokens_seen": 29405352, + "step": 50680 + }, + { + "epoch": 7.5491510277033065, + "grad_norm": 4.8179473876953125, + "learning_rate": 3.916374413897722e-05, + "loss": 0.0969, + "num_input_tokens_seen": 29408264, + "step": 50685 + }, + { + "epoch": 7.549895740244265, + "grad_norm": 0.021469390019774437, + "learning_rate": 3.916106640662201e-05, + "loss": 0.3803, + "num_input_tokens_seen": 29411144, + "step": 50690 + }, + { + "epoch": 7.550640452785225, + "grad_norm": 0.21686071157455444, + "learning_rate": 3.915838843502998e-05, + "loss": 0.1114, + "num_input_tokens_seen": 29414152, + "step": 50695 + }, + { + "epoch": 7.5513851653261845, + "grad_norm": 0.05551430955529213, + "learning_rate": 3.9155710224246365e-05, + "loss": 0.2345, + "num_input_tokens_seen": 29417224, + "step": 50700 + }, + { + "epoch": 7.552129877867143, + "grad_norm": 0.36282292008399963, + "learning_rate": 3.915303177431641e-05, + "loss": 0.125, + "num_input_tokens_seen": 29420040, + "step": 50705 + }, + { + "epoch": 7.552874590408102, + "grad_norm": 0.36560937762260437, + "learning_rate": 3.915035308528537e-05, + "loss": 0.0019, + "num_input_tokens_seen": 29422856, + "step": 50710 + }, + { + "epoch": 7.553619302949062, + "grad_norm": 27.661516189575195, + "learning_rate": 3.91476741571985e-05, + "loss": 0.0766, + "num_input_tokens_seen": 29425672, + "step": 50715 + }, + { + "epoch": 7.554364015490021, + "grad_norm": 41.0321044921875, + "learning_rate": 3.914499499010105e-05, + "loss": 0.3517, + "num_input_tokens_seen": 29428424, + "step": 50720 + }, + { + "epoch": 7.55510872803098, + "grad_norm": 0.013397532515227795, + "learning_rate": 3.9142315584038284e-05, + "loss": 0.006, + "num_input_tokens_seen": 29431208, + "step": 50725 + }, + { + "epoch": 7.555853440571939, + "grad_norm": 0.056155044585466385, + "learning_rate": 3.913963593905548e-05, + "loss": 0.1247, + "num_input_tokens_seen": 29434056, + "step": 50730 + }, + { + "epoch": 7.5565981531128985, + "grad_norm": 0.128759503364563, + "learning_rate": 3.913695605519788e-05, + "loss": 0.103, + "num_input_tokens_seen": 29437000, + "step": 50735 + }, + { + "epoch": 7.557342865653857, + "grad_norm": 0.7955138087272644, + "learning_rate": 3.913427593251079e-05, + "loss": 0.1376, + "num_input_tokens_seen": 29439752, + "step": 50740 + }, + { + "epoch": 7.558087578194817, + "grad_norm": 48.61556625366211, + "learning_rate": 3.913159557103947e-05, + "loss": 0.325, + "num_input_tokens_seen": 29442952, + "step": 50745 + }, + { + "epoch": 7.558832290735776, + "grad_norm": 0.015891456976532936, + "learning_rate": 3.912891497082921e-05, + "loss": 0.2819, + "num_input_tokens_seen": 29445800, + "step": 50750 + }, + { + "epoch": 7.559577003276735, + "grad_norm": 0.029332265257835388, + "learning_rate": 3.9126234131925285e-05, + "loss": 0.1038, + "num_input_tokens_seen": 29448456, + "step": 50755 + }, + { + "epoch": 7.560321715817694, + "grad_norm": 0.8614256381988525, + "learning_rate": 3.9123553054372994e-05, + "loss": 0.2294, + "num_input_tokens_seen": 29451176, + "step": 50760 + }, + { + "epoch": 7.561066428358654, + "grad_norm": 0.0697460025548935, + "learning_rate": 3.912087173821762e-05, + "loss": 0.2117, + "num_input_tokens_seen": 29454088, + "step": 50765 + }, + { + "epoch": 7.5618111408996125, + "grad_norm": 0.6684370636940002, + "learning_rate": 3.911819018350449e-05, + "loss": 0.1133, + "num_input_tokens_seen": 29456936, + "step": 50770 + }, + { + "epoch": 7.562555853440572, + "grad_norm": 1.5142728090286255, + "learning_rate": 3.9115508390278864e-05, + "loss": 0.0025, + "num_input_tokens_seen": 29459624, + "step": 50775 + }, + { + "epoch": 7.563300565981531, + "grad_norm": 0.033541541546583176, + "learning_rate": 3.9112826358586086e-05, + "loss": 0.0636, + "num_input_tokens_seen": 29462728, + "step": 50780 + }, + { + "epoch": 7.5640452785224905, + "grad_norm": 61.40641403198242, + "learning_rate": 3.9110144088471437e-05, + "loss": 0.1257, + "num_input_tokens_seen": 29465576, + "step": 50785 + }, + { + "epoch": 7.564789991063449, + "grad_norm": 33.86697006225586, + "learning_rate": 3.9107461579980255e-05, + "loss": 0.4451, + "num_input_tokens_seen": 29468680, + "step": 50790 + }, + { + "epoch": 7.565534703604409, + "grad_norm": 33.92306137084961, + "learning_rate": 3.910477883315785e-05, + "loss": 0.2972, + "num_input_tokens_seen": 29471528, + "step": 50795 + }, + { + "epoch": 7.566279416145368, + "grad_norm": 0.028565816581249237, + "learning_rate": 3.910209584804953e-05, + "loss": 0.1522, + "num_input_tokens_seen": 29474152, + "step": 50800 + }, + { + "epoch": 7.567024128686327, + "grad_norm": 173.2537384033203, + "learning_rate": 3.909941262470064e-05, + "loss": 0.2313, + "num_input_tokens_seen": 29477288, + "step": 50805 + }, + { + "epoch": 7.567768841227286, + "grad_norm": 33.87522506713867, + "learning_rate": 3.909672916315651e-05, + "loss": 0.3934, + "num_input_tokens_seen": 29480488, + "step": 50810 + }, + { + "epoch": 7.568513553768246, + "grad_norm": 9.876636505126953, + "learning_rate": 3.909404546346246e-05, + "loss": 0.2623, + "num_input_tokens_seen": 29483144, + "step": 50815 + }, + { + "epoch": 7.5692582663092045, + "grad_norm": 118.27008056640625, + "learning_rate": 3.909136152566384e-05, + "loss": 0.3884, + "num_input_tokens_seen": 29486152, + "step": 50820 + }, + { + "epoch": 7.570002978850164, + "grad_norm": 57.94940948486328, + "learning_rate": 3.908867734980599e-05, + "loss": 0.1471, + "num_input_tokens_seen": 29489032, + "step": 50825 + }, + { + "epoch": 7.570747691391123, + "grad_norm": 0.09616617113351822, + "learning_rate": 3.908599293593425e-05, + "loss": 0.0743, + "num_input_tokens_seen": 29491752, + "step": 50830 + }, + { + "epoch": 7.571492403932083, + "grad_norm": 0.5803902745246887, + "learning_rate": 3.908330828409397e-05, + "loss": 0.1464, + "num_input_tokens_seen": 29494408, + "step": 50835 + }, + { + "epoch": 7.572237116473041, + "grad_norm": 0.17179858684539795, + "learning_rate": 3.908062339433052e-05, + "loss": 0.0057, + "num_input_tokens_seen": 29497448, + "step": 50840 + }, + { + "epoch": 7.572981829014001, + "grad_norm": 0.14761576056480408, + "learning_rate": 3.907793826668925e-05, + "loss": 0.1806, + "num_input_tokens_seen": 29500200, + "step": 50845 + }, + { + "epoch": 7.57372654155496, + "grad_norm": 6.605748176574707, + "learning_rate": 3.907525290121552e-05, + "loss": 0.0243, + "num_input_tokens_seen": 29503208, + "step": 50850 + }, + { + "epoch": 7.5744712540959185, + "grad_norm": 2.6977012157440186, + "learning_rate": 3.9072567297954694e-05, + "loss": 0.1089, + "num_input_tokens_seen": 29505992, + "step": 50855 + }, + { + "epoch": 7.575215966636878, + "grad_norm": 0.7191528081893921, + "learning_rate": 3.906988145695215e-05, + "loss": 0.0038, + "num_input_tokens_seen": 29508744, + "step": 50860 + }, + { + "epoch": 7.575960679177838, + "grad_norm": 0.13175460696220398, + "learning_rate": 3.906719537825325e-05, + "loss": 0.1772, + "num_input_tokens_seen": 29511624, + "step": 50865 + }, + { + "epoch": 7.5767053917187965, + "grad_norm": 48.09111785888672, + "learning_rate": 3.906450906190339e-05, + "loss": 0.2108, + "num_input_tokens_seen": 29514664, + "step": 50870 + }, + { + "epoch": 7.577450104259755, + "grad_norm": 0.0596269927918911, + "learning_rate": 3.9061822507947945e-05, + "loss": 0.0028, + "num_input_tokens_seen": 29517288, + "step": 50875 + }, + { + "epoch": 7.578194816800715, + "grad_norm": 58.819278717041016, + "learning_rate": 3.9059135716432294e-05, + "loss": 0.1578, + "num_input_tokens_seen": 29520328, + "step": 50880 + }, + { + "epoch": 7.578939529341675, + "grad_norm": 0.039090484380722046, + "learning_rate": 3.905644868740184e-05, + "loss": 0.154, + "num_input_tokens_seen": 29523048, + "step": 50885 + }, + { + "epoch": 7.579684241882633, + "grad_norm": 110.1329116821289, + "learning_rate": 3.905376142090197e-05, + "loss": 0.2144, + "num_input_tokens_seen": 29525832, + "step": 50890 + }, + { + "epoch": 7.580428954423592, + "grad_norm": 0.18957819044589996, + "learning_rate": 3.9051073916978084e-05, + "loss": 0.0382, + "num_input_tokens_seen": 29528520, + "step": 50895 + }, + { + "epoch": 7.581173666964552, + "grad_norm": 23.284616470336914, + "learning_rate": 3.904838617567558e-05, + "loss": 0.5221, + "num_input_tokens_seen": 29531336, + "step": 50900 + }, + { + "epoch": 7.5819183795055105, + "grad_norm": 0.6830887794494629, + "learning_rate": 3.904569819703988e-05, + "loss": 0.2157, + "num_input_tokens_seen": 29534216, + "step": 50905 + }, + { + "epoch": 7.58266309204647, + "grad_norm": 38.19634246826172, + "learning_rate": 3.9043009981116376e-05, + "loss": 0.1531, + "num_input_tokens_seen": 29537448, + "step": 50910 + }, + { + "epoch": 7.583407804587429, + "grad_norm": 89.8890380859375, + "learning_rate": 3.9040321527950497e-05, + "loss": 0.1495, + "num_input_tokens_seen": 29540360, + "step": 50915 + }, + { + "epoch": 7.584152517128389, + "grad_norm": 0.026079997420310974, + "learning_rate": 3.903763283758765e-05, + "loss": 0.0434, + "num_input_tokens_seen": 29543336, + "step": 50920 + }, + { + "epoch": 7.584897229669347, + "grad_norm": 28.55622673034668, + "learning_rate": 3.903494391007327e-05, + "loss": 0.2737, + "num_input_tokens_seen": 29546280, + "step": 50925 + }, + { + "epoch": 7.585641942210307, + "grad_norm": 15.770024299621582, + "learning_rate": 3.9032254745452775e-05, + "loss": 0.4364, + "num_input_tokens_seen": 29549160, + "step": 50930 + }, + { + "epoch": 7.586386654751266, + "grad_norm": 61.83314895629883, + "learning_rate": 3.902956534377159e-05, + "loss": 0.3571, + "num_input_tokens_seen": 29552168, + "step": 50935 + }, + { + "epoch": 7.587131367292225, + "grad_norm": 0.037462156265974045, + "learning_rate": 3.902687570507517e-05, + "loss": 0.2154, + "num_input_tokens_seen": 29555336, + "step": 50940 + }, + { + "epoch": 7.587876079833184, + "grad_norm": 0.2111823409795761, + "learning_rate": 3.902418582940893e-05, + "loss": 0.1669, + "num_input_tokens_seen": 29558216, + "step": 50945 + }, + { + "epoch": 7.588620792374144, + "grad_norm": 24.379446029663086, + "learning_rate": 3.902149571681833e-05, + "loss": 0.2342, + "num_input_tokens_seen": 29561192, + "step": 50950 + }, + { + "epoch": 7.5893655049151025, + "grad_norm": 127.95616149902344, + "learning_rate": 3.901880536734881e-05, + "loss": 0.3605, + "num_input_tokens_seen": 29564104, + "step": 50955 + }, + { + "epoch": 7.590110217456062, + "grad_norm": 11.965272903442383, + "learning_rate": 3.901611478104582e-05, + "loss": 0.1773, + "num_input_tokens_seen": 29566568, + "step": 50960 + }, + { + "epoch": 7.590854929997021, + "grad_norm": 40.53447723388672, + "learning_rate": 3.901342395795482e-05, + "loss": 0.3346, + "num_input_tokens_seen": 29569608, + "step": 50965 + }, + { + "epoch": 7.591599642537981, + "grad_norm": 4.1939215660095215, + "learning_rate": 3.901073289812126e-05, + "loss": 0.1194, + "num_input_tokens_seen": 29572488, + "step": 50970 + }, + { + "epoch": 7.592344355078939, + "grad_norm": 15.252683639526367, + "learning_rate": 3.900804160159061e-05, + "loss": 0.089, + "num_input_tokens_seen": 29575464, + "step": 50975 + }, + { + "epoch": 7.593089067619899, + "grad_norm": 0.027903204783797264, + "learning_rate": 3.900535006840833e-05, + "loss": 0.351, + "num_input_tokens_seen": 29578344, + "step": 50980 + }, + { + "epoch": 7.593833780160858, + "grad_norm": 4.520352840423584, + "learning_rate": 3.90026582986199e-05, + "loss": 0.2463, + "num_input_tokens_seen": 29581320, + "step": 50985 + }, + { + "epoch": 7.594578492701817, + "grad_norm": 23.404075622558594, + "learning_rate": 3.899996629227079e-05, + "loss": 0.4024, + "num_input_tokens_seen": 29584200, + "step": 50990 + }, + { + "epoch": 7.595323205242776, + "grad_norm": 103.72479248046875, + "learning_rate": 3.899727404940647e-05, + "loss": 0.2189, + "num_input_tokens_seen": 29587144, + "step": 50995 + }, + { + "epoch": 7.596067917783736, + "grad_norm": 1.4388052225112915, + "learning_rate": 3.899458157007244e-05, + "loss": 0.1648, + "num_input_tokens_seen": 29589704, + "step": 51000 + }, + { + "epoch": 7.596812630324695, + "grad_norm": 20.7553653717041, + "learning_rate": 3.899188885431419e-05, + "loss": 0.2424, + "num_input_tokens_seen": 29592808, + "step": 51005 + }, + { + "epoch": 7.597557342865654, + "grad_norm": 0.033616743981838226, + "learning_rate": 3.898919590217718e-05, + "loss": 0.0734, + "num_input_tokens_seen": 29595784, + "step": 51010 + }, + { + "epoch": 7.598302055406613, + "grad_norm": 17.19158363342285, + "learning_rate": 3.898650271370692e-05, + "loss": 0.2107, + "num_input_tokens_seen": 29598664, + "step": 51015 + }, + { + "epoch": 7.599046767947573, + "grad_norm": 52.5131721496582, + "learning_rate": 3.898380928894892e-05, + "loss": 0.332, + "num_input_tokens_seen": 29601480, + "step": 51020 + }, + { + "epoch": 7.599791480488531, + "grad_norm": 0.11788823455572128, + "learning_rate": 3.8981115627948675e-05, + "loss": 0.2972, + "num_input_tokens_seen": 29604360, + "step": 51025 + }, + { + "epoch": 7.600536193029491, + "grad_norm": 0.03603528439998627, + "learning_rate": 3.897842173075169e-05, + "loss": 0.1116, + "num_input_tokens_seen": 29607592, + "step": 51030 + }, + { + "epoch": 7.60128090557045, + "grad_norm": 61.088130950927734, + "learning_rate": 3.8975727597403475e-05, + "loss": 0.1549, + "num_input_tokens_seen": 29610344, + "step": 51035 + }, + { + "epoch": 7.6020256181114085, + "grad_norm": 6.229900360107422, + "learning_rate": 3.8973033227949554e-05, + "loss": 0.2304, + "num_input_tokens_seen": 29613064, + "step": 51040 + }, + { + "epoch": 7.602770330652368, + "grad_norm": 17.80533790588379, + "learning_rate": 3.897033862243543e-05, + "loss": 0.345, + "num_input_tokens_seen": 29616072, + "step": 51045 + }, + { + "epoch": 7.603515043193328, + "grad_norm": 4.457577228546143, + "learning_rate": 3.896764378090664e-05, + "loss": 0.0023, + "num_input_tokens_seen": 29618792, + "step": 51050 + }, + { + "epoch": 7.604259755734287, + "grad_norm": 21.512353897094727, + "learning_rate": 3.89649487034087e-05, + "loss": 0.1043, + "num_input_tokens_seen": 29621768, + "step": 51055 + }, + { + "epoch": 7.605004468275245, + "grad_norm": 42.18946075439453, + "learning_rate": 3.8962253389987145e-05, + "loss": 0.2141, + "num_input_tokens_seen": 29624776, + "step": 51060 + }, + { + "epoch": 7.605749180816205, + "grad_norm": 0.0356229804456234, + "learning_rate": 3.895955784068751e-05, + "loss": 0.0043, + "num_input_tokens_seen": 29627688, + "step": 51065 + }, + { + "epoch": 7.606493893357164, + "grad_norm": 0.7910592555999756, + "learning_rate": 3.8956862055555335e-05, + "loss": 0.5314, + "num_input_tokens_seen": 29630472, + "step": 51070 + }, + { + "epoch": 7.607238605898123, + "grad_norm": 0.022078149020671844, + "learning_rate": 3.895416603463616e-05, + "loss": 0.0714, + "num_input_tokens_seen": 29633832, + "step": 51075 + }, + { + "epoch": 7.607983318439082, + "grad_norm": 10.911199569702148, + "learning_rate": 3.895146977797553e-05, + "loss": 0.2215, + "num_input_tokens_seen": 29636840, + "step": 51080 + }, + { + "epoch": 7.608728030980042, + "grad_norm": 0.37186479568481445, + "learning_rate": 3.8948773285619e-05, + "loss": 0.3023, + "num_input_tokens_seen": 29639688, + "step": 51085 + }, + { + "epoch": 7.609472743521001, + "grad_norm": 0.08513583242893219, + "learning_rate": 3.894607655761212e-05, + "loss": 0.217, + "num_input_tokens_seen": 29642536, + "step": 51090 + }, + { + "epoch": 7.61021745606196, + "grad_norm": 0.025920039042830467, + "learning_rate": 3.894337959400045e-05, + "loss": 0.0014, + "num_input_tokens_seen": 29645352, + "step": 51095 + }, + { + "epoch": 7.610962168602919, + "grad_norm": 0.01919548399746418, + "learning_rate": 3.894068239482956e-05, + "loss": 0.1712, + "num_input_tokens_seen": 29648200, + "step": 51100 + }, + { + "epoch": 7.611706881143879, + "grad_norm": 0.033999595791101456, + "learning_rate": 3.8937984960145004e-05, + "loss": 0.1357, + "num_input_tokens_seen": 29650952, + "step": 51105 + }, + { + "epoch": 7.612451593684837, + "grad_norm": 0.9137781858444214, + "learning_rate": 3.893528728999236e-05, + "loss": 0.0331, + "num_input_tokens_seen": 29653736, + "step": 51110 + }, + { + "epoch": 7.613196306225797, + "grad_norm": 0.25884613394737244, + "learning_rate": 3.893258938441719e-05, + "loss": 0.5502, + "num_input_tokens_seen": 29656744, + "step": 51115 + }, + { + "epoch": 7.613941018766756, + "grad_norm": 0.07630952447652817, + "learning_rate": 3.89298912434651e-05, + "loss": 0.2559, + "num_input_tokens_seen": 29659784, + "step": 51120 + }, + { + "epoch": 7.614685731307715, + "grad_norm": 55.67509460449219, + "learning_rate": 3.892719286718165e-05, + "loss": 0.0934, + "num_input_tokens_seen": 29662696, + "step": 51125 + }, + { + "epoch": 7.615430443848674, + "grad_norm": 0.23364071547985077, + "learning_rate": 3.892449425561243e-05, + "loss": 0.0015, + "num_input_tokens_seen": 29665352, + "step": 51130 + }, + { + "epoch": 7.616175156389634, + "grad_norm": 0.01850786618888378, + "learning_rate": 3.892179540880303e-05, + "loss": 0.244, + "num_input_tokens_seen": 29667912, + "step": 51135 + }, + { + "epoch": 7.616919868930593, + "grad_norm": 0.01590055786073208, + "learning_rate": 3.891909632679904e-05, + "loss": 0.0029, + "num_input_tokens_seen": 29670920, + "step": 51140 + }, + { + "epoch": 7.617664581471552, + "grad_norm": 0.06398503482341766, + "learning_rate": 3.8916397009646076e-05, + "loss": 0.2253, + "num_input_tokens_seen": 29673896, + "step": 51145 + }, + { + "epoch": 7.618409294012511, + "grad_norm": 0.009011705406010151, + "learning_rate": 3.891369745738972e-05, + "loss": 0.142, + "num_input_tokens_seen": 29676872, + "step": 51150 + }, + { + "epoch": 7.619154006553471, + "grad_norm": 8.250389099121094, + "learning_rate": 3.8910997670075593e-05, + "loss": 0.3532, + "num_input_tokens_seen": 29679592, + "step": 51155 + }, + { + "epoch": 7.619898719094429, + "grad_norm": 17.988338470458984, + "learning_rate": 3.890829764774929e-05, + "loss": 0.0094, + "num_input_tokens_seen": 29682376, + "step": 51160 + }, + { + "epoch": 7.620643431635389, + "grad_norm": 0.04926268756389618, + "learning_rate": 3.8905597390456446e-05, + "loss": 0.2097, + "num_input_tokens_seen": 29685032, + "step": 51165 + }, + { + "epoch": 7.621388144176348, + "grad_norm": 1.2490705251693726, + "learning_rate": 3.890289689824266e-05, + "loss": 0.0644, + "num_input_tokens_seen": 29688008, + "step": 51170 + }, + { + "epoch": 7.6221328567173074, + "grad_norm": 0.1098242849111557, + "learning_rate": 3.890019617115357e-05, + "loss": 0.1238, + "num_input_tokens_seen": 29691112, + "step": 51175 + }, + { + "epoch": 7.622877569258266, + "grad_norm": 0.44442859292030334, + "learning_rate": 3.889749520923478e-05, + "loss": 0.156, + "num_input_tokens_seen": 29694120, + "step": 51180 + }, + { + "epoch": 7.623622281799226, + "grad_norm": 0.0745871514081955, + "learning_rate": 3.889479401253194e-05, + "loss": 0.4061, + "num_input_tokens_seen": 29696872, + "step": 51185 + }, + { + "epoch": 7.624366994340185, + "grad_norm": 2.272317409515381, + "learning_rate": 3.8892092581090675e-05, + "loss": 0.271, + "num_input_tokens_seen": 29699624, + "step": 51190 + }, + { + "epoch": 7.625111706881144, + "grad_norm": 5.887223720550537, + "learning_rate": 3.888939091495663e-05, + "loss": 0.0883, + "num_input_tokens_seen": 29702472, + "step": 51195 + }, + { + "epoch": 7.625856419422103, + "grad_norm": 119.38944244384766, + "learning_rate": 3.888668901417544e-05, + "loss": 0.1919, + "num_input_tokens_seen": 29705384, + "step": 51200 + }, + { + "epoch": 7.626601131963062, + "grad_norm": 0.07956566661596298, + "learning_rate": 3.888398687879274e-05, + "loss": 0.0252, + "num_input_tokens_seen": 29708296, + "step": 51205 + }, + { + "epoch": 7.627345844504021, + "grad_norm": 9.478428840637207, + "learning_rate": 3.888128450885421e-05, + "loss": 0.4016, + "num_input_tokens_seen": 29711272, + "step": 51210 + }, + { + "epoch": 7.628090557044981, + "grad_norm": 0.16415368020534515, + "learning_rate": 3.887858190440549e-05, + "loss": 0.1484, + "num_input_tokens_seen": 29714152, + "step": 51215 + }, + { + "epoch": 7.62883526958594, + "grad_norm": 0.04535924270749092, + "learning_rate": 3.8875879065492216e-05, + "loss": 0.2922, + "num_input_tokens_seen": 29716840, + "step": 51220 + }, + { + "epoch": 7.629579982126899, + "grad_norm": 0.23230603337287903, + "learning_rate": 3.887317599216008e-05, + "loss": 0.091, + "num_input_tokens_seen": 29720008, + "step": 51225 + }, + { + "epoch": 7.630324694667858, + "grad_norm": 0.2954299747943878, + "learning_rate": 3.887047268445473e-05, + "loss": 0.271, + "num_input_tokens_seen": 29722952, + "step": 51230 + }, + { + "epoch": 7.631069407208818, + "grad_norm": 3.874990940093994, + "learning_rate": 3.8867769142421844e-05, + "loss": 0.0055, + "num_input_tokens_seen": 29725960, + "step": 51235 + }, + { + "epoch": 7.631814119749777, + "grad_norm": 1.431695580482483, + "learning_rate": 3.886506536610709e-05, + "loss": 0.1446, + "num_input_tokens_seen": 29728904, + "step": 51240 + }, + { + "epoch": 7.632558832290735, + "grad_norm": 8.959487915039062, + "learning_rate": 3.8862361355556156e-05, + "loss": 0.1456, + "num_input_tokens_seen": 29731752, + "step": 51245 + }, + { + "epoch": 7.633303544831695, + "grad_norm": 0.27987220883369446, + "learning_rate": 3.8859657110814704e-05, + "loss": 0.0869, + "num_input_tokens_seen": 29734664, + "step": 51250 + }, + { + "epoch": 7.634048257372654, + "grad_norm": 0.027916373685002327, + "learning_rate": 3.885695263192844e-05, + "loss": 0.0006, + "num_input_tokens_seen": 29737672, + "step": 51255 + }, + { + "epoch": 7.6347929699136134, + "grad_norm": 10.786822319030762, + "learning_rate": 3.885424791894305e-05, + "loss": 0.4303, + "num_input_tokens_seen": 29740456, + "step": 51260 + }, + { + "epoch": 7.635537682454572, + "grad_norm": 29.522003173828125, + "learning_rate": 3.885154297190421e-05, + "loss": 0.2188, + "num_input_tokens_seen": 29743560, + "step": 51265 + }, + { + "epoch": 7.636282394995532, + "grad_norm": 0.057148054242134094, + "learning_rate": 3.884883779085764e-05, + "loss": 0.013, + "num_input_tokens_seen": 29746152, + "step": 51270 + }, + { + "epoch": 7.637027107536491, + "grad_norm": 0.058365337550640106, + "learning_rate": 3.884613237584902e-05, + "loss": 0.1056, + "num_input_tokens_seen": 29749192, + "step": 51275 + }, + { + "epoch": 7.63777182007745, + "grad_norm": 5.090465068817139, + "learning_rate": 3.884342672692407e-05, + "loss": 0.1414, + "num_input_tokens_seen": 29752296, + "step": 51280 + }, + { + "epoch": 7.638516532618409, + "grad_norm": 0.7034569382667542, + "learning_rate": 3.88407208441285e-05, + "loss": 0.0024, + "num_input_tokens_seen": 29755432, + "step": 51285 + }, + { + "epoch": 7.639261245159369, + "grad_norm": 0.08395966142416, + "learning_rate": 3.883801472750802e-05, + "loss": 0.0235, + "num_input_tokens_seen": 29758216, + "step": 51290 + }, + { + "epoch": 7.640005957700327, + "grad_norm": 28.447830200195312, + "learning_rate": 3.8835308377108344e-05, + "loss": 0.2042, + "num_input_tokens_seen": 29760872, + "step": 51295 + }, + { + "epoch": 7.640750670241287, + "grad_norm": 1.4822179079055786, + "learning_rate": 3.883260179297519e-05, + "loss": 0.2832, + "num_input_tokens_seen": 29763816, + "step": 51300 + }, + { + "epoch": 7.641495382782246, + "grad_norm": 11.991899490356445, + "learning_rate": 3.882989497515429e-05, + "loss": 0.1011, + "num_input_tokens_seen": 29766984, + "step": 51305 + }, + { + "epoch": 7.6422400953232055, + "grad_norm": 0.035521212965250015, + "learning_rate": 3.8827187923691365e-05, + "loss": 0.0524, + "num_input_tokens_seen": 29769896, + "step": 51310 + }, + { + "epoch": 7.642984807864164, + "grad_norm": 0.004511804319918156, + "learning_rate": 3.882448063863216e-05, + "loss": 0.1367, + "num_input_tokens_seen": 29772808, + "step": 51315 + }, + { + "epoch": 7.643729520405124, + "grad_norm": 0.2831387221813202, + "learning_rate": 3.882177312002241e-05, + "loss": 0.0335, + "num_input_tokens_seen": 29775688, + "step": 51320 + }, + { + "epoch": 7.644474232946083, + "grad_norm": 157.36663818359375, + "learning_rate": 3.881906536790784e-05, + "loss": 0.0509, + "num_input_tokens_seen": 29778600, + "step": 51325 + }, + { + "epoch": 7.645218945487042, + "grad_norm": 0.029187869280576706, + "learning_rate": 3.881635738233421e-05, + "loss": 0.3771, + "num_input_tokens_seen": 29781512, + "step": 51330 + }, + { + "epoch": 7.645963658028001, + "grad_norm": 0.05166684836149216, + "learning_rate": 3.8813649163347266e-05, + "loss": 0.0266, + "num_input_tokens_seen": 29784104, + "step": 51335 + }, + { + "epoch": 7.646708370568961, + "grad_norm": 73.20934295654297, + "learning_rate": 3.881094071099276e-05, + "loss": 0.5115, + "num_input_tokens_seen": 29786824, + "step": 51340 + }, + { + "epoch": 7.6474530831099194, + "grad_norm": 0.9840990304946899, + "learning_rate": 3.880823202531644e-05, + "loss": 0.3825, + "num_input_tokens_seen": 29789512, + "step": 51345 + }, + { + "epoch": 7.648197795650879, + "grad_norm": 0.7335183620452881, + "learning_rate": 3.880552310636408e-05, + "loss": 0.2343, + "num_input_tokens_seen": 29792456, + "step": 51350 + }, + { + "epoch": 7.648942508191838, + "grad_norm": 63.72822570800781, + "learning_rate": 3.880281395418144e-05, + "loss": 0.2828, + "num_input_tokens_seen": 29795272, + "step": 51355 + }, + { + "epoch": 7.6496872207327975, + "grad_norm": 0.9653977751731873, + "learning_rate": 3.8800104568814275e-05, + "loss": 0.0996, + "num_input_tokens_seen": 29797896, + "step": 51360 + }, + { + "epoch": 7.650431933273756, + "grad_norm": 0.11919217556715012, + "learning_rate": 3.879739495030839e-05, + "loss": 0.1812, + "num_input_tokens_seen": 29800968, + "step": 51365 + }, + { + "epoch": 7.651176645814716, + "grad_norm": 6.049289703369141, + "learning_rate": 3.879468509870953e-05, + "loss": 0.3229, + "num_input_tokens_seen": 29803816, + "step": 51370 + }, + { + "epoch": 7.651921358355675, + "grad_norm": 1.3704932928085327, + "learning_rate": 3.879197501406347e-05, + "loss": 0.0056, + "num_input_tokens_seen": 29806728, + "step": 51375 + }, + { + "epoch": 7.652666070896634, + "grad_norm": 25.84914779663086, + "learning_rate": 3.878926469641603e-05, + "loss": 0.36, + "num_input_tokens_seen": 29809448, + "step": 51380 + }, + { + "epoch": 7.653410783437593, + "grad_norm": 34.60381317138672, + "learning_rate": 3.878655414581297e-05, + "loss": 0.1887, + "num_input_tokens_seen": 29812136, + "step": 51385 + }, + { + "epoch": 7.654155495978552, + "grad_norm": 0.042987484484910965, + "learning_rate": 3.878384336230009e-05, + "loss": 0.0545, + "num_input_tokens_seen": 29814984, + "step": 51390 + }, + { + "epoch": 7.6549002085195115, + "grad_norm": 0.17134161293506622, + "learning_rate": 3.878113234592319e-05, + "loss": 0.6137, + "num_input_tokens_seen": 29817992, + "step": 51395 + }, + { + "epoch": 7.655644921060471, + "grad_norm": 1.1523982286453247, + "learning_rate": 3.8778421096728065e-05, + "loss": 0.0856, + "num_input_tokens_seen": 29820616, + "step": 51400 + }, + { + "epoch": 7.65638963360143, + "grad_norm": 0.0861845389008522, + "learning_rate": 3.8775709614760514e-05, + "loss": 0.0353, + "num_input_tokens_seen": 29823368, + "step": 51405 + }, + { + "epoch": 7.657134346142389, + "grad_norm": 0.3554981052875519, + "learning_rate": 3.877299790006635e-05, + "loss": 0.0012, + "num_input_tokens_seen": 29826152, + "step": 51410 + }, + { + "epoch": 7.657879058683348, + "grad_norm": 5.371258735656738, + "learning_rate": 3.877028595269139e-05, + "loss": 0.2105, + "num_input_tokens_seen": 29829704, + "step": 51415 + }, + { + "epoch": 7.658623771224307, + "grad_norm": 0.0361853688955307, + "learning_rate": 3.876757377268144e-05, + "loss": 0.1494, + "num_input_tokens_seen": 29832520, + "step": 51420 + }, + { + "epoch": 7.659368483765267, + "grad_norm": 10.241911888122559, + "learning_rate": 3.8764861360082324e-05, + "loss": 0.2486, + "num_input_tokens_seen": 29835176, + "step": 51425 + }, + { + "epoch": 7.6601131963062254, + "grad_norm": 0.003630701918154955, + "learning_rate": 3.876214871493987e-05, + "loss": 0.0043, + "num_input_tokens_seen": 29837768, + "step": 51430 + }, + { + "epoch": 7.660857908847185, + "grad_norm": 10.740577697753906, + "learning_rate": 3.8759435837299904e-05, + "loss": 0.3663, + "num_input_tokens_seen": 29840840, + "step": 51435 + }, + { + "epoch": 7.661602621388144, + "grad_norm": 0.047073762863874435, + "learning_rate": 3.8756722727208246e-05, + "loss": 0.1011, + "num_input_tokens_seen": 29843656, + "step": 51440 + }, + { + "epoch": 7.6623473339291035, + "grad_norm": 0.1869092881679535, + "learning_rate": 3.8754009384710736e-05, + "loss": 0.204, + "num_input_tokens_seen": 29846440, + "step": 51445 + }, + { + "epoch": 7.663092046470062, + "grad_norm": 0.05611647665500641, + "learning_rate": 3.8751295809853225e-05, + "loss": 0.3835, + "num_input_tokens_seen": 29849480, + "step": 51450 + }, + { + "epoch": 7.663836759011022, + "grad_norm": 0.02881138026714325, + "learning_rate": 3.8748582002681545e-05, + "loss": 0.112, + "num_input_tokens_seen": 29852264, + "step": 51455 + }, + { + "epoch": 7.664581471551981, + "grad_norm": 31.66159439086914, + "learning_rate": 3.8745867963241545e-05, + "loss": 0.0802, + "num_input_tokens_seen": 29854856, + "step": 51460 + }, + { + "epoch": 7.66532618409294, + "grad_norm": 23.227806091308594, + "learning_rate": 3.874315369157907e-05, + "loss": 0.2002, + "num_input_tokens_seen": 29857704, + "step": 51465 + }, + { + "epoch": 7.666070896633899, + "grad_norm": 0.03149053826928139, + "learning_rate": 3.8740439187739993e-05, + "loss": 0.111, + "num_input_tokens_seen": 29860552, + "step": 51470 + }, + { + "epoch": 7.666815609174859, + "grad_norm": 16.407569885253906, + "learning_rate": 3.873772445177015e-05, + "loss": 0.2087, + "num_input_tokens_seen": 29863208, + "step": 51475 + }, + { + "epoch": 7.6675603217158175, + "grad_norm": 0.049213942140340805, + "learning_rate": 3.873500948371542e-05, + "loss": 0.0019, + "num_input_tokens_seen": 29865896, + "step": 51480 + }, + { + "epoch": 7.668305034256777, + "grad_norm": 74.63909912109375, + "learning_rate": 3.873229428362167e-05, + "loss": 0.2116, + "num_input_tokens_seen": 29868840, + "step": 51485 + }, + { + "epoch": 7.669049746797736, + "grad_norm": 0.0017726401565596461, + "learning_rate": 3.872957885153476e-05, + "loss": 0.1276, + "num_input_tokens_seen": 29871560, + "step": 51490 + }, + { + "epoch": 7.6697944593386955, + "grad_norm": 0.18770287930965424, + "learning_rate": 3.8726863187500564e-05, + "loss": 0.1049, + "num_input_tokens_seen": 29874408, + "step": 51495 + }, + { + "epoch": 7.670539171879654, + "grad_norm": 0.39356353878974915, + "learning_rate": 3.872414729156497e-05, + "loss": 0.0625, + "num_input_tokens_seen": 29877128, + "step": 51500 + }, + { + "epoch": 7.671283884420614, + "grad_norm": 0.014628257602453232, + "learning_rate": 3.872143116377386e-05, + "loss": 0.1445, + "num_input_tokens_seen": 29879976, + "step": 51505 + }, + { + "epoch": 7.672028596961573, + "grad_norm": 56.6384162902832, + "learning_rate": 3.871871480417311e-05, + "loss": 0.1497, + "num_input_tokens_seen": 29882792, + "step": 51510 + }, + { + "epoch": 7.672773309502532, + "grad_norm": 28.639026641845703, + "learning_rate": 3.871599821280863e-05, + "loss": 0.1246, + "num_input_tokens_seen": 29886024, + "step": 51515 + }, + { + "epoch": 7.673518022043491, + "grad_norm": 121.5126953125, + "learning_rate": 3.8713281389726285e-05, + "loss": 0.2278, + "num_input_tokens_seen": 29889160, + "step": 51520 + }, + { + "epoch": 7.674262734584451, + "grad_norm": 159.5980682373047, + "learning_rate": 3.871056433497199e-05, + "loss": 0.3004, + "num_input_tokens_seen": 29892296, + "step": 51525 + }, + { + "epoch": 7.6750074471254095, + "grad_norm": 0.03328824043273926, + "learning_rate": 3.870784704859165e-05, + "loss": 0.1418, + "num_input_tokens_seen": 29895400, + "step": 51530 + }, + { + "epoch": 7.675752159666369, + "grad_norm": 68.4201431274414, + "learning_rate": 3.8705129530631165e-05, + "loss": 0.1044, + "num_input_tokens_seen": 29898280, + "step": 51535 + }, + { + "epoch": 7.676496872207328, + "grad_norm": 0.5254912972450256, + "learning_rate": 3.870241178113645e-05, + "loss": 0.1816, + "num_input_tokens_seen": 29901096, + "step": 51540 + }, + { + "epoch": 7.6772415847482876, + "grad_norm": 86.44969940185547, + "learning_rate": 3.86996938001534e-05, + "loss": 0.1129, + "num_input_tokens_seen": 29904104, + "step": 51545 + }, + { + "epoch": 7.677986297289246, + "grad_norm": 0.0445684976875782, + "learning_rate": 3.869697558772796e-05, + "loss": 0.2879, + "num_input_tokens_seen": 29907048, + "step": 51550 + }, + { + "epoch": 7.678731009830205, + "grad_norm": 0.25657743215560913, + "learning_rate": 3.8694257143906035e-05, + "loss": 0.2472, + "num_input_tokens_seen": 29909864, + "step": 51555 + }, + { + "epoch": 7.679475722371165, + "grad_norm": 35.49209213256836, + "learning_rate": 3.869153846873356e-05, + "loss": 0.0233, + "num_input_tokens_seen": 29912776, + "step": 51560 + }, + { + "epoch": 7.680220434912124, + "grad_norm": 0.026368992403149605, + "learning_rate": 3.868881956225645e-05, + "loss": 0.3303, + "num_input_tokens_seen": 29915752, + "step": 51565 + }, + { + "epoch": 7.680965147453083, + "grad_norm": 27.728111267089844, + "learning_rate": 3.868610042452065e-05, + "loss": 0.2394, + "num_input_tokens_seen": 29918504, + "step": 51570 + }, + { + "epoch": 7.681709859994042, + "grad_norm": 0.15424026548862457, + "learning_rate": 3.8683381055572095e-05, + "loss": 0.4244, + "num_input_tokens_seen": 29921160, + "step": 51575 + }, + { + "epoch": 7.6824545725350015, + "grad_norm": 0.022628208622336388, + "learning_rate": 3.868066145545672e-05, + "loss": 0.0022, + "num_input_tokens_seen": 29924392, + "step": 51580 + }, + { + "epoch": 7.683199285075961, + "grad_norm": 4.379243850708008, + "learning_rate": 3.867794162422047e-05, + "loss": 0.0293, + "num_input_tokens_seen": 29927272, + "step": 51585 + }, + { + "epoch": 7.68394399761692, + "grad_norm": 0.7881100177764893, + "learning_rate": 3.86752215619093e-05, + "loss": 0.0008, + "num_input_tokens_seen": 29930184, + "step": 51590 + }, + { + "epoch": 7.684688710157879, + "grad_norm": 3.8847529888153076, + "learning_rate": 3.867250126856917e-05, + "loss": 0.1069, + "num_input_tokens_seen": 29933448, + "step": 51595 + }, + { + "epoch": 7.685433422698838, + "grad_norm": 0.037483517080545425, + "learning_rate": 3.866978074424602e-05, + "loss": 0.2817, + "num_input_tokens_seen": 29936104, + "step": 51600 + }, + { + "epoch": 7.686178135239797, + "grad_norm": 119.53826141357422, + "learning_rate": 3.866705998898582e-05, + "loss": 0.0887, + "num_input_tokens_seen": 29939592, + "step": 51605 + }, + { + "epoch": 7.686922847780757, + "grad_norm": 0.13134223222732544, + "learning_rate": 3.866433900283453e-05, + "loss": 0.0268, + "num_input_tokens_seen": 29942280, + "step": 51610 + }, + { + "epoch": 7.6876675603217155, + "grad_norm": 58.18824768066406, + "learning_rate": 3.866161778583812e-05, + "loss": 0.4347, + "num_input_tokens_seen": 29945320, + "step": 51615 + }, + { + "epoch": 7.688412272862675, + "grad_norm": 0.002626282162964344, + "learning_rate": 3.865889633804257e-05, + "loss": 0.5416, + "num_input_tokens_seen": 29948232, + "step": 51620 + }, + { + "epoch": 7.689156985403634, + "grad_norm": 90.11040496826172, + "learning_rate": 3.8656174659493835e-05, + "loss": 0.3238, + "num_input_tokens_seen": 29951144, + "step": 51625 + }, + { + "epoch": 7.6899016979445936, + "grad_norm": 49.60853576660156, + "learning_rate": 3.865345275023792e-05, + "loss": 0.0803, + "num_input_tokens_seen": 29953896, + "step": 51630 + }, + { + "epoch": 7.690646410485552, + "grad_norm": 0.5755855441093445, + "learning_rate": 3.8650730610320796e-05, + "loss": 0.031, + "num_input_tokens_seen": 29956808, + "step": 51635 + }, + { + "epoch": 7.691391123026512, + "grad_norm": 65.00048065185547, + "learning_rate": 3.864800823978845e-05, + "loss": 0.1273, + "num_input_tokens_seen": 29959688, + "step": 51640 + }, + { + "epoch": 7.692135835567471, + "grad_norm": 2.7773520946502686, + "learning_rate": 3.864528563868687e-05, + "loss": 0.0991, + "num_input_tokens_seen": 29962504, + "step": 51645 + }, + { + "epoch": 7.69288054810843, + "grad_norm": 0.008086620829999447, + "learning_rate": 3.864256280706206e-05, + "loss": 0.0929, + "num_input_tokens_seen": 29965064, + "step": 51650 + }, + { + "epoch": 7.693625260649389, + "grad_norm": 0.015152224339544773, + "learning_rate": 3.8639839744960025e-05, + "loss": 0.0583, + "num_input_tokens_seen": 29967848, + "step": 51655 + }, + { + "epoch": 7.694369973190349, + "grad_norm": 0.008395889773964882, + "learning_rate": 3.863711645242676e-05, + "loss": 0.0864, + "num_input_tokens_seen": 29970696, + "step": 51660 + }, + { + "epoch": 7.6951146857313075, + "grad_norm": 0.16218705475330353, + "learning_rate": 3.863439292950827e-05, + "loss": 0.0251, + "num_input_tokens_seen": 29973960, + "step": 51665 + }, + { + "epoch": 7.695859398272267, + "grad_norm": 1.4491674900054932, + "learning_rate": 3.863166917625056e-05, + "loss": 0.0016, + "num_input_tokens_seen": 29976744, + "step": 51670 + }, + { + "epoch": 7.696604110813226, + "grad_norm": 0.0011251600226387382, + "learning_rate": 3.862894519269966e-05, + "loss": 0.9411, + "num_input_tokens_seen": 29979528, + "step": 51675 + }, + { + "epoch": 7.697348823354186, + "grad_norm": 0.08386071026325226, + "learning_rate": 3.8626220978901585e-05, + "loss": 0.4431, + "num_input_tokens_seen": 29982504, + "step": 51680 + }, + { + "epoch": 7.698093535895144, + "grad_norm": 22.417993545532227, + "learning_rate": 3.862349653490236e-05, + "loss": 0.0421, + "num_input_tokens_seen": 29985352, + "step": 51685 + }, + { + "epoch": 7.698838248436104, + "grad_norm": 3.8894762992858887, + "learning_rate": 3.8620771860748005e-05, + "loss": 0.1389, + "num_input_tokens_seen": 29988648, + "step": 51690 + }, + { + "epoch": 7.699582960977063, + "grad_norm": 20.769895553588867, + "learning_rate": 3.861804695648455e-05, + "loss": 0.3974, + "num_input_tokens_seen": 29991624, + "step": 51695 + }, + { + "epoch": 7.700327673518022, + "grad_norm": 0.5197784304618835, + "learning_rate": 3.861532182215802e-05, + "loss": 0.1548, + "num_input_tokens_seen": 29994568, + "step": 51700 + }, + { + "epoch": 7.701072386058981, + "grad_norm": 72.9325942993164, + "learning_rate": 3.861259645781449e-05, + "loss": 0.2049, + "num_input_tokens_seen": 29997640, + "step": 51705 + }, + { + "epoch": 7.701817098599941, + "grad_norm": 0.02240031026303768, + "learning_rate": 3.860987086349996e-05, + "loss": 0.2301, + "num_input_tokens_seen": 30000776, + "step": 51710 + }, + { + "epoch": 7.7025618111408996, + "grad_norm": 0.03184079751372337, + "learning_rate": 3.86071450392605e-05, + "loss": 0.167, + "num_input_tokens_seen": 30003400, + "step": 51715 + }, + { + "epoch": 7.703306523681858, + "grad_norm": 91.02526092529297, + "learning_rate": 3.860441898514215e-05, + "loss": 0.171, + "num_input_tokens_seen": 30006408, + "step": 51720 + }, + { + "epoch": 7.704051236222818, + "grad_norm": 12.779264450073242, + "learning_rate": 3.8601692701190975e-05, + "loss": 0.1468, + "num_input_tokens_seen": 30009192, + "step": 51725 + }, + { + "epoch": 7.704795948763778, + "grad_norm": 172.7075958251953, + "learning_rate": 3.8598966187453034e-05, + "loss": 0.4155, + "num_input_tokens_seen": 30012328, + "step": 51730 + }, + { + "epoch": 7.705540661304736, + "grad_norm": 1.8064000606536865, + "learning_rate": 3.859623944397437e-05, + "loss": 0.0821, + "num_input_tokens_seen": 30015176, + "step": 51735 + }, + { + "epoch": 7.706285373845695, + "grad_norm": 129.6165008544922, + "learning_rate": 3.859351247080106e-05, + "loss": 0.0655, + "num_input_tokens_seen": 30017928, + "step": 51740 + }, + { + "epoch": 7.707030086386655, + "grad_norm": 0.17222169041633606, + "learning_rate": 3.859078526797917e-05, + "loss": 0.1669, + "num_input_tokens_seen": 30020744, + "step": 51745 + }, + { + "epoch": 7.707774798927614, + "grad_norm": 0.07815126329660416, + "learning_rate": 3.8588057835554776e-05, + "loss": 0.1743, + "num_input_tokens_seen": 30023688, + "step": 51750 + }, + { + "epoch": 7.708519511468573, + "grad_norm": 0.03137895464897156, + "learning_rate": 3.858533017357396e-05, + "loss": 0.2331, + "num_input_tokens_seen": 30026472, + "step": 51755 + }, + { + "epoch": 7.709264224009532, + "grad_norm": 3.376194477081299, + "learning_rate": 3.858260228208279e-05, + "loss": 0.3795, + "num_input_tokens_seen": 30029000, + "step": 51760 + }, + { + "epoch": 7.710008936550492, + "grad_norm": 9.614243507385254, + "learning_rate": 3.857987416112737e-05, + "loss": 0.1668, + "num_input_tokens_seen": 30031656, + "step": 51765 + }, + { + "epoch": 7.71075364909145, + "grad_norm": 0.3045736253261566, + "learning_rate": 3.857714581075377e-05, + "loss": 0.1231, + "num_input_tokens_seen": 30034568, + "step": 51770 + }, + { + "epoch": 7.71149836163241, + "grad_norm": 26.727750778198242, + "learning_rate": 3.85744172310081e-05, + "loss": 0.2271, + "num_input_tokens_seen": 30037064, + "step": 51775 + }, + { + "epoch": 7.712243074173369, + "grad_norm": 0.16975034773349762, + "learning_rate": 3.8571688421936434e-05, + "loss": 0.3677, + "num_input_tokens_seen": 30039848, + "step": 51780 + }, + { + "epoch": 7.712987786714328, + "grad_norm": 0.05274312570691109, + "learning_rate": 3.85689593835849e-05, + "loss": 0.2022, + "num_input_tokens_seen": 30042728, + "step": 51785 + }, + { + "epoch": 7.713732499255287, + "grad_norm": 0.10527697950601578, + "learning_rate": 3.8566230115999575e-05, + "loss": 0.3833, + "num_input_tokens_seen": 30045512, + "step": 51790 + }, + { + "epoch": 7.714477211796247, + "grad_norm": 0.017750069499015808, + "learning_rate": 3.856350061922659e-05, + "loss": 0.1901, + "num_input_tokens_seen": 30048488, + "step": 51795 + }, + { + "epoch": 7.7152219243372056, + "grad_norm": 0.0769115537405014, + "learning_rate": 3.856077089331204e-05, + "loss": 0.1317, + "num_input_tokens_seen": 30051432, + "step": 51800 + }, + { + "epoch": 7.715966636878165, + "grad_norm": 35.43792724609375, + "learning_rate": 3.855804093830205e-05, + "loss": 0.2139, + "num_input_tokens_seen": 30054440, + "step": 51805 + }, + { + "epoch": 7.716711349419124, + "grad_norm": 9.770397186279297, + "learning_rate": 3.855531075424274e-05, + "loss": 0.1145, + "num_input_tokens_seen": 30057000, + "step": 51810 + }, + { + "epoch": 7.717456061960084, + "grad_norm": 0.06732302904129028, + "learning_rate": 3.8552580341180236e-05, + "loss": 0.0377, + "num_input_tokens_seen": 30059976, + "step": 51815 + }, + { + "epoch": 7.718200774501042, + "grad_norm": 118.1686782836914, + "learning_rate": 3.8549849699160655e-05, + "loss": 0.2369, + "num_input_tokens_seen": 30062856, + "step": 51820 + }, + { + "epoch": 7.718945487042002, + "grad_norm": 0.14952369034290314, + "learning_rate": 3.8547118828230135e-05, + "loss": 0.0423, + "num_input_tokens_seen": 30065736, + "step": 51825 + }, + { + "epoch": 7.719690199582961, + "grad_norm": 6.588470935821533, + "learning_rate": 3.854438772843482e-05, + "loss": 0.0794, + "num_input_tokens_seen": 30069032, + "step": 51830 + }, + { + "epoch": 7.72043491212392, + "grad_norm": 14.3125581741333, + "learning_rate": 3.8541656399820825e-05, + "loss": 0.3911, + "num_input_tokens_seen": 30071880, + "step": 51835 + }, + { + "epoch": 7.721179624664879, + "grad_norm": 0.260714054107666, + "learning_rate": 3.853892484243432e-05, + "loss": 0.0565, + "num_input_tokens_seen": 30074760, + "step": 51840 + }, + { + "epoch": 7.721924337205839, + "grad_norm": 1.2600759267807007, + "learning_rate": 3.8536193056321436e-05, + "loss": 0.0175, + "num_input_tokens_seen": 30077416, + "step": 51845 + }, + { + "epoch": 7.722669049746798, + "grad_norm": 63.95756530761719, + "learning_rate": 3.853346104152833e-05, + "loss": 0.1429, + "num_input_tokens_seen": 30080424, + "step": 51850 + }, + { + "epoch": 7.723413762287757, + "grad_norm": 96.9116439819336, + "learning_rate": 3.853072879810115e-05, + "loss": 0.0466, + "num_input_tokens_seen": 30083336, + "step": 51855 + }, + { + "epoch": 7.724158474828716, + "grad_norm": 10.804994583129883, + "learning_rate": 3.8527996326086065e-05, + "loss": 0.3915, + "num_input_tokens_seen": 30086280, + "step": 51860 + }, + { + "epoch": 7.724903187369676, + "grad_norm": 3.469550132751465, + "learning_rate": 3.852526362552923e-05, + "loss": 0.0633, + "num_input_tokens_seen": 30088872, + "step": 51865 + }, + { + "epoch": 7.725647899910634, + "grad_norm": 0.25559720396995544, + "learning_rate": 3.852253069647681e-05, + "loss": 0.0011, + "num_input_tokens_seen": 30091688, + "step": 51870 + }, + { + "epoch": 7.726392612451594, + "grad_norm": 35.46019744873047, + "learning_rate": 3.851979753897498e-05, + "loss": 0.0545, + "num_input_tokens_seen": 30094728, + "step": 51875 + }, + { + "epoch": 7.727137324992553, + "grad_norm": 38.445648193359375, + "learning_rate": 3.8517064153069905e-05, + "loss": 0.2997, + "num_input_tokens_seen": 30097448, + "step": 51880 + }, + { + "epoch": 7.727882037533512, + "grad_norm": 5.161391258239746, + "learning_rate": 3.8514330538807775e-05, + "loss": 0.0032, + "num_input_tokens_seen": 30100616, + "step": 51885 + }, + { + "epoch": 7.728626750074471, + "grad_norm": 16.86277198791504, + "learning_rate": 3.8511596696234765e-05, + "loss": 0.391, + "num_input_tokens_seen": 30103848, + "step": 51890 + }, + { + "epoch": 7.729371462615431, + "grad_norm": 0.11781108379364014, + "learning_rate": 3.8508862625397055e-05, + "loss": 0.0616, + "num_input_tokens_seen": 30106664, + "step": 51895 + }, + { + "epoch": 7.73011617515639, + "grad_norm": 0.4301379323005676, + "learning_rate": 3.850612832634085e-05, + "loss": 0.0729, + "num_input_tokens_seen": 30109544, + "step": 51900 + }, + { + "epoch": 7.730860887697348, + "grad_norm": 0.024405740201473236, + "learning_rate": 3.850339379911233e-05, + "loss": 0.1173, + "num_input_tokens_seen": 30112584, + "step": 51905 + }, + { + "epoch": 7.731605600238308, + "grad_norm": 1.9474358558654785, + "learning_rate": 3.8500659043757705e-05, + "loss": 0.0647, + "num_input_tokens_seen": 30115176, + "step": 51910 + }, + { + "epoch": 7.732350312779268, + "grad_norm": 17.283374786376953, + "learning_rate": 3.8497924060323154e-05, + "loss": 0.1893, + "num_input_tokens_seen": 30118120, + "step": 51915 + }, + { + "epoch": 7.733095025320226, + "grad_norm": 0.016228239983320236, + "learning_rate": 3.84951888488549e-05, + "loss": 0.0155, + "num_input_tokens_seen": 30121224, + "step": 51920 + }, + { + "epoch": 7.733839737861185, + "grad_norm": 41.501895904541016, + "learning_rate": 3.849245340939914e-05, + "loss": 0.2474, + "num_input_tokens_seen": 30124392, + "step": 51925 + }, + { + "epoch": 7.734584450402145, + "grad_norm": 0.025049572810530663, + "learning_rate": 3.84897177420021e-05, + "loss": 0.0373, + "num_input_tokens_seen": 30127624, + "step": 51930 + }, + { + "epoch": 7.735329162943104, + "grad_norm": 7.636963844299316, + "learning_rate": 3.848698184670999e-05, + "loss": 0.0809, + "num_input_tokens_seen": 30130920, + "step": 51935 + }, + { + "epoch": 7.736073875484063, + "grad_norm": 125.54845428466797, + "learning_rate": 3.848424572356902e-05, + "loss": 0.3407, + "num_input_tokens_seen": 30134024, + "step": 51940 + }, + { + "epoch": 7.736818588025022, + "grad_norm": 0.14233927428722382, + "learning_rate": 3.848150937262544e-05, + "loss": 0.1219, + "num_input_tokens_seen": 30137160, + "step": 51945 + }, + { + "epoch": 7.737563300565982, + "grad_norm": 0.6950278282165527, + "learning_rate": 3.847877279392546e-05, + "loss": 0.1066, + "num_input_tokens_seen": 30140168, + "step": 51950 + }, + { + "epoch": 7.73830801310694, + "grad_norm": 16.444921493530273, + "learning_rate": 3.847603598751529e-05, + "loss": 0.2202, + "num_input_tokens_seen": 30143144, + "step": 51955 + }, + { + "epoch": 7.7390527256479, + "grad_norm": 45.40562438964844, + "learning_rate": 3.847329895344121e-05, + "loss": 0.2119, + "num_input_tokens_seen": 30146248, + "step": 51960 + }, + { + "epoch": 7.739797438188859, + "grad_norm": 0.05369167774915695, + "learning_rate": 3.847056169174942e-05, + "loss": 0.0022, + "num_input_tokens_seen": 30149064, + "step": 51965 + }, + { + "epoch": 7.740542150729818, + "grad_norm": 7.825796127319336, + "learning_rate": 3.846782420248619e-05, + "loss": 0.2495, + "num_input_tokens_seen": 30151880, + "step": 51970 + }, + { + "epoch": 7.741286863270777, + "grad_norm": 41.02836227416992, + "learning_rate": 3.8465086485697766e-05, + "loss": 0.1096, + "num_input_tokens_seen": 30154600, + "step": 51975 + }, + { + "epoch": 7.742031575811737, + "grad_norm": 0.01015151385217905, + "learning_rate": 3.8462348541430396e-05, + "loss": 0.2356, + "num_input_tokens_seen": 30157576, + "step": 51980 + }, + { + "epoch": 7.742776288352696, + "grad_norm": 0.02102184295654297, + "learning_rate": 3.8459610369730316e-05, + "loss": 0.3336, + "num_input_tokens_seen": 30160264, + "step": 51985 + }, + { + "epoch": 7.743521000893655, + "grad_norm": 0.020832102745771408, + "learning_rate": 3.8456871970643794e-05, + "loss": 0.1303, + "num_input_tokens_seen": 30162920, + "step": 51990 + }, + { + "epoch": 7.744265713434614, + "grad_norm": 0.14325833320617676, + "learning_rate": 3.8454133344217105e-05, + "loss": 0.3668, + "num_input_tokens_seen": 30165896, + "step": 51995 + }, + { + "epoch": 7.745010425975574, + "grad_norm": 0.7609042525291443, + "learning_rate": 3.8451394490496505e-05, + "loss": 0.099, + "num_input_tokens_seen": 30168808, + "step": 52000 + }, + { + "epoch": 7.745755138516532, + "grad_norm": 50.60837936401367, + "learning_rate": 3.8448655409528274e-05, + "loss": 0.2644, + "num_input_tokens_seen": 30171432, + "step": 52005 + }, + { + "epoch": 7.746499851057492, + "grad_norm": 67.86805725097656, + "learning_rate": 3.844591610135867e-05, + "loss": 0.2204, + "num_input_tokens_seen": 30174312, + "step": 52010 + }, + { + "epoch": 7.747244563598451, + "grad_norm": 14.427573204040527, + "learning_rate": 3.844317656603398e-05, + "loss": 0.1995, + "num_input_tokens_seen": 30177032, + "step": 52015 + }, + { + "epoch": 7.7479892761394105, + "grad_norm": 3.574014902114868, + "learning_rate": 3.844043680360049e-05, + "loss": 0.0551, + "num_input_tokens_seen": 30179784, + "step": 52020 + }, + { + "epoch": 7.748733988680369, + "grad_norm": 0.3077341318130493, + "learning_rate": 3.8437696814104476e-05, + "loss": 0.1325, + "num_input_tokens_seen": 30182312, + "step": 52025 + }, + { + "epoch": 7.749478701221329, + "grad_norm": 0.08542855083942413, + "learning_rate": 3.8434956597592234e-05, + "loss": 0.1846, + "num_input_tokens_seen": 30185096, + "step": 52030 + }, + { + "epoch": 7.750223413762288, + "grad_norm": 8.50634479522705, + "learning_rate": 3.8432216154110053e-05, + "loss": 0.2594, + "num_input_tokens_seen": 30187752, + "step": 52035 + }, + { + "epoch": 7.750968126303247, + "grad_norm": 0.10119929909706116, + "learning_rate": 3.8429475483704236e-05, + "loss": 0.1859, + "num_input_tokens_seen": 30190408, + "step": 52040 + }, + { + "epoch": 7.751712838844206, + "grad_norm": 79.18733978271484, + "learning_rate": 3.842673458642108e-05, + "loss": 0.1678, + "num_input_tokens_seen": 30193224, + "step": 52045 + }, + { + "epoch": 7.752457551385166, + "grad_norm": 2.0092976093292236, + "learning_rate": 3.842399346230688e-05, + "loss": 0.4757, + "num_input_tokens_seen": 30195976, + "step": 52050 + }, + { + "epoch": 7.753202263926124, + "grad_norm": 67.16893005371094, + "learning_rate": 3.842125211140796e-05, + "loss": 0.3606, + "num_input_tokens_seen": 30198664, + "step": 52055 + }, + { + "epoch": 7.753946976467084, + "grad_norm": 20.620624542236328, + "learning_rate": 3.8418510533770624e-05, + "loss": 0.1375, + "num_input_tokens_seen": 30201384, + "step": 52060 + }, + { + "epoch": 7.754691689008043, + "grad_norm": 66.68000030517578, + "learning_rate": 3.841576872944119e-05, + "loss": 0.3988, + "num_input_tokens_seen": 30204104, + "step": 52065 + }, + { + "epoch": 7.755436401549002, + "grad_norm": 0.030187727883458138, + "learning_rate": 3.841302669846599e-05, + "loss": 0.1555, + "num_input_tokens_seen": 30206792, + "step": 52070 + }, + { + "epoch": 7.756181114089961, + "grad_norm": 0.10251681506633759, + "learning_rate": 3.841028444089133e-05, + "loss": 0.0258, + "num_input_tokens_seen": 30209768, + "step": 52075 + }, + { + "epoch": 7.756925826630921, + "grad_norm": 16.381946563720703, + "learning_rate": 3.840754195676354e-05, + "loss": 0.3713, + "num_input_tokens_seen": 30212360, + "step": 52080 + }, + { + "epoch": 7.75767053917188, + "grad_norm": 0.15107497572898865, + "learning_rate": 3.8404799246128956e-05, + "loss": 0.3563, + "num_input_tokens_seen": 30215176, + "step": 52085 + }, + { + "epoch": 7.758415251712838, + "grad_norm": 0.47121989727020264, + "learning_rate": 3.8402056309033915e-05, + "loss": 0.0437, + "num_input_tokens_seen": 30218152, + "step": 52090 + }, + { + "epoch": 7.759159964253798, + "grad_norm": 15.005120277404785, + "learning_rate": 3.839931314552475e-05, + "loss": 0.4165, + "num_input_tokens_seen": 30221128, + "step": 52095 + }, + { + "epoch": 7.759904676794758, + "grad_norm": 2.862769842147827, + "learning_rate": 3.8396569755647816e-05, + "loss": 0.3531, + "num_input_tokens_seen": 30223720, + "step": 52100 + }, + { + "epoch": 7.7606493893357165, + "grad_norm": 0.011649374850094318, + "learning_rate": 3.839382613944944e-05, + "loss": 0.204, + "num_input_tokens_seen": 30226632, + "step": 52105 + }, + { + "epoch": 7.761394101876675, + "grad_norm": 0.18612495064735413, + "learning_rate": 3.8391082296976e-05, + "loss": 0.0686, + "num_input_tokens_seen": 30229608, + "step": 52110 + }, + { + "epoch": 7.762138814417635, + "grad_norm": 0.03154502809047699, + "learning_rate": 3.8388338228273824e-05, + "loss": 0.0031, + "num_input_tokens_seen": 30232424, + "step": 52115 + }, + { + "epoch": 7.762883526958594, + "grad_norm": 0.07493004202842712, + "learning_rate": 3.838559393338927e-05, + "loss": 0.0281, + "num_input_tokens_seen": 30235272, + "step": 52120 + }, + { + "epoch": 7.763628239499553, + "grad_norm": 0.11717485636472702, + "learning_rate": 3.838284941236873e-05, + "loss": 0.3644, + "num_input_tokens_seen": 30238408, + "step": 52125 + }, + { + "epoch": 7.764372952040512, + "grad_norm": 0.37397924065589905, + "learning_rate": 3.8380104665258545e-05, + "loss": 0.0096, + "num_input_tokens_seen": 30241672, + "step": 52130 + }, + { + "epoch": 7.765117664581472, + "grad_norm": 4.42051362991333, + "learning_rate": 3.837735969210509e-05, + "loss": 0.07, + "num_input_tokens_seen": 30244520, + "step": 52135 + }, + { + "epoch": 7.76586237712243, + "grad_norm": 4.197668552398682, + "learning_rate": 3.837461449295474e-05, + "loss": 0.0083, + "num_input_tokens_seen": 30247272, + "step": 52140 + }, + { + "epoch": 7.76660708966339, + "grad_norm": 0.032926689833402634, + "learning_rate": 3.837186906785387e-05, + "loss": 0.0006, + "num_input_tokens_seen": 30250280, + "step": 52145 + }, + { + "epoch": 7.767351802204349, + "grad_norm": 1.8018959760665894, + "learning_rate": 3.836912341684886e-05, + "loss": 0.1296, + "num_input_tokens_seen": 30253224, + "step": 52150 + }, + { + "epoch": 7.7680965147453085, + "grad_norm": 0.01786087267100811, + "learning_rate": 3.83663775399861e-05, + "loss": 0.0986, + "num_input_tokens_seen": 30256296, + "step": 52155 + }, + { + "epoch": 7.768841227286267, + "grad_norm": 0.002731733024120331, + "learning_rate": 3.836363143731198e-05, + "loss": 0.0119, + "num_input_tokens_seen": 30259048, + "step": 52160 + }, + { + "epoch": 7.769585939827227, + "grad_norm": 0.03004734218120575, + "learning_rate": 3.8360885108872885e-05, + "loss": 0.2689, + "num_input_tokens_seen": 30262344, + "step": 52165 + }, + { + "epoch": 7.770330652368186, + "grad_norm": 24.556921005249023, + "learning_rate": 3.8358138554715215e-05, + "loss": 0.0273, + "num_input_tokens_seen": 30265832, + "step": 52170 + }, + { + "epoch": 7.771075364909145, + "grad_norm": 0.00978398509323597, + "learning_rate": 3.8355391774885375e-05, + "loss": 0.2005, + "num_input_tokens_seen": 30268808, + "step": 52175 + }, + { + "epoch": 7.771820077450104, + "grad_norm": 0.1086452528834343, + "learning_rate": 3.835264476942977e-05, + "loss": 0.0846, + "num_input_tokens_seen": 30271944, + "step": 52180 + }, + { + "epoch": 7.772564789991064, + "grad_norm": 0.23826049268245697, + "learning_rate": 3.834989753839479e-05, + "loss": 0.2013, + "num_input_tokens_seen": 30274952, + "step": 52185 + }, + { + "epoch": 7.7733095025320225, + "grad_norm": 6.00754976272583, + "learning_rate": 3.834715008182687e-05, + "loss": 0.1445, + "num_input_tokens_seen": 30277864, + "step": 52190 + }, + { + "epoch": 7.774054215072982, + "grad_norm": 0.08245010673999786, + "learning_rate": 3.83444023997724e-05, + "loss": 0.2906, + "num_input_tokens_seen": 30280680, + "step": 52195 + }, + { + "epoch": 7.774798927613941, + "grad_norm": 54.270626068115234, + "learning_rate": 3.834165449227782e-05, + "loss": 0.0858, + "num_input_tokens_seen": 30283144, + "step": 52200 + }, + { + "epoch": 7.7755436401549005, + "grad_norm": 93.42828369140625, + "learning_rate": 3.833890635938956e-05, + "loss": 0.4181, + "num_input_tokens_seen": 30286568, + "step": 52205 + }, + { + "epoch": 7.776288352695859, + "grad_norm": 0.16328206658363342, + "learning_rate": 3.8336158001154024e-05, + "loss": 0.0965, + "num_input_tokens_seen": 30289736, + "step": 52210 + }, + { + "epoch": 7.777033065236819, + "grad_norm": 0.07451532781124115, + "learning_rate": 3.8333409417617654e-05, + "loss": 0.0052, + "num_input_tokens_seen": 30292584, + "step": 52215 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.39324674010276794, + "learning_rate": 3.8330660608826885e-05, + "loss": 0.0616, + "num_input_tokens_seen": 30295304, + "step": 52220 + }, + { + "epoch": 7.778522490318737, + "grad_norm": 21.517641067504883, + "learning_rate": 3.832791157482815e-05, + "loss": 0.1707, + "num_input_tokens_seen": 30298184, + "step": 52225 + }, + { + "epoch": 7.779267202859696, + "grad_norm": 17.103099822998047, + "learning_rate": 3.8325162315667895e-05, + "loss": 0.1489, + "num_input_tokens_seen": 30300872, + "step": 52230 + }, + { + "epoch": 7.780011915400656, + "grad_norm": 0.013796710409224033, + "learning_rate": 3.832241283139256e-05, + "loss": 0.0201, + "num_input_tokens_seen": 30303912, + "step": 52235 + }, + { + "epoch": 7.7807566279416145, + "grad_norm": 50.018836975097656, + "learning_rate": 3.831966312204861e-05, + "loss": 0.2278, + "num_input_tokens_seen": 30306376, + "step": 52240 + }, + { + "epoch": 7.781501340482574, + "grad_norm": 12.048725128173828, + "learning_rate": 3.831691318768249e-05, + "loss": 0.1568, + "num_input_tokens_seen": 30309160, + "step": 52245 + }, + { + "epoch": 7.782246053023533, + "grad_norm": 40.41299819946289, + "learning_rate": 3.831416302834065e-05, + "loss": 0.3856, + "num_input_tokens_seen": 30312008, + "step": 52250 + }, + { + "epoch": 7.782990765564492, + "grad_norm": 3.2639617919921875, + "learning_rate": 3.831141264406957e-05, + "loss": 0.3746, + "num_input_tokens_seen": 30314792, + "step": 52255 + }, + { + "epoch": 7.783735478105451, + "grad_norm": 0.02963542379438877, + "learning_rate": 3.8308662034915685e-05, + "loss": 0.2859, + "num_input_tokens_seen": 30317960, + "step": 52260 + }, + { + "epoch": 7.784480190646411, + "grad_norm": 8.981005668640137, + "learning_rate": 3.830591120092549e-05, + "loss": 0.0068, + "num_input_tokens_seen": 30320808, + "step": 52265 + }, + { + "epoch": 7.78522490318737, + "grad_norm": 44.63077163696289, + "learning_rate": 3.8303160142145444e-05, + "loss": 0.4655, + "num_input_tokens_seen": 30323848, + "step": 52270 + }, + { + "epoch": 7.7859696157283285, + "grad_norm": 0.281690776348114, + "learning_rate": 3.830040885862204e-05, + "loss": 0.0388, + "num_input_tokens_seen": 30326888, + "step": 52275 + }, + { + "epoch": 7.786714328269288, + "grad_norm": 0.04350076988339424, + "learning_rate": 3.8297657350401735e-05, + "loss": 0.0089, + "num_input_tokens_seen": 30330024, + "step": 52280 + }, + { + "epoch": 7.787459040810247, + "grad_norm": 0.025969509035348892, + "learning_rate": 3.829490561753103e-05, + "loss": 0.1576, + "num_input_tokens_seen": 30332936, + "step": 52285 + }, + { + "epoch": 7.7882037533512065, + "grad_norm": 7.289679050445557, + "learning_rate": 3.82921536600564e-05, + "loss": 0.1887, + "num_input_tokens_seen": 30335592, + "step": 52290 + }, + { + "epoch": 7.788948465892165, + "grad_norm": 0.2883945405483246, + "learning_rate": 3.828940147802435e-05, + "loss": 0.0942, + "num_input_tokens_seen": 30338568, + "step": 52295 + }, + { + "epoch": 7.789693178433125, + "grad_norm": 0.033916372805833817, + "learning_rate": 3.828664907148137e-05, + "loss": 0.0021, + "num_input_tokens_seen": 30341448, + "step": 52300 + }, + { + "epoch": 7.790437890974084, + "grad_norm": 0.16055458784103394, + "learning_rate": 3.828389644047395e-05, + "loss": 0.2718, + "num_input_tokens_seen": 30343976, + "step": 52305 + }, + { + "epoch": 7.791182603515043, + "grad_norm": 17.823223114013672, + "learning_rate": 3.8281143585048604e-05, + "loss": 0.0307, + "num_input_tokens_seen": 30347144, + "step": 52310 + }, + { + "epoch": 7.791927316056002, + "grad_norm": 24.33970832824707, + "learning_rate": 3.8278390505251835e-05, + "loss": 0.2942, + "num_input_tokens_seen": 30349768, + "step": 52315 + }, + { + "epoch": 7.792672028596962, + "grad_norm": 27.377002716064453, + "learning_rate": 3.827563720113016e-05, + "loss": 0.4174, + "num_input_tokens_seen": 30352808, + "step": 52320 + }, + { + "epoch": 7.7934167411379205, + "grad_norm": 0.20043013989925385, + "learning_rate": 3.827288367273008e-05, + "loss": 0.0016, + "num_input_tokens_seen": 30355336, + "step": 52325 + }, + { + "epoch": 7.79416145367888, + "grad_norm": 18.628616333007812, + "learning_rate": 3.827012992009812e-05, + "loss": 0.2485, + "num_input_tokens_seen": 30358280, + "step": 52330 + }, + { + "epoch": 7.794906166219839, + "grad_norm": 0.014391669072210789, + "learning_rate": 3.826737594328082e-05, + "loss": 0.2884, + "num_input_tokens_seen": 30361160, + "step": 52335 + }, + { + "epoch": 7.7956508787607985, + "grad_norm": 2.2685322761535645, + "learning_rate": 3.826462174232467e-05, + "loss": 0.2836, + "num_input_tokens_seen": 30364232, + "step": 52340 + }, + { + "epoch": 7.796395591301757, + "grad_norm": 46.77179718017578, + "learning_rate": 3.8261867317276225e-05, + "loss": 0.2256, + "num_input_tokens_seen": 30367016, + "step": 52345 + }, + { + "epoch": 7.797140303842717, + "grad_norm": 0.4755244255065918, + "learning_rate": 3.8259112668181995e-05, + "loss": 0.1376, + "num_input_tokens_seen": 30370088, + "step": 52350 + }, + { + "epoch": 7.797885016383676, + "grad_norm": 0.045836191624403, + "learning_rate": 3.825635779508855e-05, + "loss": 0.0028, + "num_input_tokens_seen": 30373000, + "step": 52355 + }, + { + "epoch": 7.798629728924635, + "grad_norm": 1.1289316415786743, + "learning_rate": 3.82536026980424e-05, + "loss": 0.2559, + "num_input_tokens_seen": 30375848, + "step": 52360 + }, + { + "epoch": 7.799374441465594, + "grad_norm": 0.03498855605721474, + "learning_rate": 3.825084737709011e-05, + "loss": 0.0379, + "num_input_tokens_seen": 30378536, + "step": 52365 + }, + { + "epoch": 7.800119154006554, + "grad_norm": 31.033166885375977, + "learning_rate": 3.824809183227822e-05, + "loss": 0.1594, + "num_input_tokens_seen": 30381512, + "step": 52370 + }, + { + "epoch": 7.8008638665475125, + "grad_norm": 86.2347412109375, + "learning_rate": 3.824533606365329e-05, + "loss": 0.2683, + "num_input_tokens_seen": 30384584, + "step": 52375 + }, + { + "epoch": 7.801608579088472, + "grad_norm": 0.11327341943979263, + "learning_rate": 3.824258007126186e-05, + "loss": 0.2945, + "num_input_tokens_seen": 30387560, + "step": 52380 + }, + { + "epoch": 7.802353291629431, + "grad_norm": 67.11473083496094, + "learning_rate": 3.82398238551505e-05, + "loss": 0.2154, + "num_input_tokens_seen": 30390248, + "step": 52385 + }, + { + "epoch": 7.803098004170391, + "grad_norm": 6.69194221496582, + "learning_rate": 3.823706741536578e-05, + "loss": 0.311, + "num_input_tokens_seen": 30393032, + "step": 52390 + }, + { + "epoch": 7.803842716711349, + "grad_norm": 0.028140561655163765, + "learning_rate": 3.823431075195425e-05, + "loss": 0.1104, + "num_input_tokens_seen": 30395784, + "step": 52395 + }, + { + "epoch": 7.804587429252309, + "grad_norm": 40.97625732421875, + "learning_rate": 3.8231553864962486e-05, + "loss": 0.5316, + "num_input_tokens_seen": 30398600, + "step": 52400 + }, + { + "epoch": 7.805332141793268, + "grad_norm": 0.07051073759794235, + "learning_rate": 3.8228796754437086e-05, + "loss": 0.0029, + "num_input_tokens_seen": 30401480, + "step": 52405 + }, + { + "epoch": 7.806076854334227, + "grad_norm": 46.053314208984375, + "learning_rate": 3.8226039420424596e-05, + "loss": 0.3099, + "num_input_tokens_seen": 30404392, + "step": 52410 + }, + { + "epoch": 7.806821566875186, + "grad_norm": 11.908304214477539, + "learning_rate": 3.822328186297162e-05, + "loss": 0.2515, + "num_input_tokens_seen": 30407016, + "step": 52415 + }, + { + "epoch": 7.807566279416145, + "grad_norm": 4.267214298248291, + "learning_rate": 3.822052408212473e-05, + "loss": 0.0404, + "num_input_tokens_seen": 30409992, + "step": 52420 + }, + { + "epoch": 7.8083109919571045, + "grad_norm": 74.923095703125, + "learning_rate": 3.8217766077930527e-05, + "loss": 0.3284, + "num_input_tokens_seen": 30412872, + "step": 52425 + }, + { + "epoch": 7.809055704498064, + "grad_norm": 0.042382821440696716, + "learning_rate": 3.82150078504356e-05, + "loss": 0.0366, + "num_input_tokens_seen": 30415592, + "step": 52430 + }, + { + "epoch": 7.809800417039023, + "grad_norm": 0.020636629313230515, + "learning_rate": 3.821224939968654e-05, + "loss": 0.0155, + "num_input_tokens_seen": 30418312, + "step": 52435 + }, + { + "epoch": 7.810545129579982, + "grad_norm": 1.2553528547286987, + "learning_rate": 3.820949072572996e-05, + "loss": 0.0316, + "num_input_tokens_seen": 30421096, + "step": 52440 + }, + { + "epoch": 7.811289842120941, + "grad_norm": 0.013442330993711948, + "learning_rate": 3.820673182861246e-05, + "loss": 0.0029, + "num_input_tokens_seen": 30423880, + "step": 52445 + }, + { + "epoch": 7.812034554661901, + "grad_norm": 0.1082996055483818, + "learning_rate": 3.820397270838064e-05, + "loss": 0.0887, + "num_input_tokens_seen": 30426888, + "step": 52450 + }, + { + "epoch": 7.81277926720286, + "grad_norm": 0.3321762681007385, + "learning_rate": 3.820121336508113e-05, + "loss": 0.0017, + "num_input_tokens_seen": 30429864, + "step": 52455 + }, + { + "epoch": 7.8135239797438185, + "grad_norm": 40.93232727050781, + "learning_rate": 3.819845379876054e-05, + "loss": 0.029, + "num_input_tokens_seen": 30433000, + "step": 52460 + }, + { + "epoch": 7.814268692284778, + "grad_norm": 32.952030181884766, + "learning_rate": 3.8195694009465486e-05, + "loss": 0.1591, + "num_input_tokens_seen": 30435784, + "step": 52465 + }, + { + "epoch": 7.815013404825737, + "grad_norm": 117.27090454101562, + "learning_rate": 3.819293399724259e-05, + "loss": 0.3956, + "num_input_tokens_seen": 30438568, + "step": 52470 + }, + { + "epoch": 7.815758117366697, + "grad_norm": 35.0167121887207, + "learning_rate": 3.819017376213848e-05, + "loss": 0.2679, + "num_input_tokens_seen": 30441672, + "step": 52475 + }, + { + "epoch": 7.816502829907655, + "grad_norm": 0.017063846811652184, + "learning_rate": 3.8187413304199796e-05, + "loss": 0.003, + "num_input_tokens_seen": 30444488, + "step": 52480 + }, + { + "epoch": 7.817247542448615, + "grad_norm": 25.001972198486328, + "learning_rate": 3.818465262347316e-05, + "loss": 0.0121, + "num_input_tokens_seen": 30447240, + "step": 52485 + }, + { + "epoch": 7.817992254989574, + "grad_norm": 34.01313781738281, + "learning_rate": 3.818189172000522e-05, + "loss": 0.6092, + "num_input_tokens_seen": 30449992, + "step": 52490 + }, + { + "epoch": 7.818736967530533, + "grad_norm": 102.99991607666016, + "learning_rate": 3.8179130593842626e-05, + "loss": 0.3544, + "num_input_tokens_seen": 30453288, + "step": 52495 + }, + { + "epoch": 7.819481680071492, + "grad_norm": 44.703590393066406, + "learning_rate": 3.8176369245032006e-05, + "loss": 0.3474, + "num_input_tokens_seen": 30456136, + "step": 52500 + }, + { + "epoch": 7.820226392612452, + "grad_norm": 0.016114842146635056, + "learning_rate": 3.817360767362003e-05, + "loss": 0.451, + "num_input_tokens_seen": 30458952, + "step": 52505 + }, + { + "epoch": 7.8209711051534105, + "grad_norm": 0.3257465362548828, + "learning_rate": 3.817084587965333e-05, + "loss": 0.1412, + "num_input_tokens_seen": 30461608, + "step": 52510 + }, + { + "epoch": 7.82171581769437, + "grad_norm": 2.7569851875305176, + "learning_rate": 3.8168083863178586e-05, + "loss": 0.1485, + "num_input_tokens_seen": 30464808, + "step": 52515 + }, + { + "epoch": 7.822460530235329, + "grad_norm": 0.6038332581520081, + "learning_rate": 3.8165321624242434e-05, + "loss": 0.0039, + "num_input_tokens_seen": 30467944, + "step": 52520 + }, + { + "epoch": 7.823205242776289, + "grad_norm": 6.929480075836182, + "learning_rate": 3.816255916289156e-05, + "loss": 0.2885, + "num_input_tokens_seen": 30470856, + "step": 52525 + }, + { + "epoch": 7.823949955317247, + "grad_norm": 53.76498794555664, + "learning_rate": 3.8159796479172626e-05, + "loss": 0.1826, + "num_input_tokens_seen": 30473704, + "step": 52530 + }, + { + "epoch": 7.824694667858207, + "grad_norm": 19.857728958129883, + "learning_rate": 3.815703357313231e-05, + "loss": 0.175, + "num_input_tokens_seen": 30476552, + "step": 52535 + }, + { + "epoch": 7.825439380399166, + "grad_norm": 14.960413932800293, + "learning_rate": 3.8154270444817285e-05, + "loss": 0.235, + "num_input_tokens_seen": 30479560, + "step": 52540 + }, + { + "epoch": 7.826184092940125, + "grad_norm": 0.14577504992485046, + "learning_rate": 3.815150709427423e-05, + "loss": 0.004, + "num_input_tokens_seen": 30482440, + "step": 52545 + }, + { + "epoch": 7.826928805481084, + "grad_norm": 46.901275634765625, + "learning_rate": 3.8148743521549824e-05, + "loss": 0.1061, + "num_input_tokens_seen": 30485192, + "step": 52550 + }, + { + "epoch": 7.827673518022044, + "grad_norm": 13.540574073791504, + "learning_rate": 3.814597972669076e-05, + "loss": 0.3887, + "num_input_tokens_seen": 30488136, + "step": 52555 + }, + { + "epoch": 7.828418230563003, + "grad_norm": 13.901948928833008, + "learning_rate": 3.814321570974373e-05, + "loss": 0.3215, + "num_input_tokens_seen": 30490920, + "step": 52560 + }, + { + "epoch": 7.829162943103962, + "grad_norm": 55.247676849365234, + "learning_rate": 3.814045147075543e-05, + "loss": 0.0842, + "num_input_tokens_seen": 30493672, + "step": 52565 + }, + { + "epoch": 7.829907655644921, + "grad_norm": 23.129182815551758, + "learning_rate": 3.813768700977256e-05, + "loss": 0.4488, + "num_input_tokens_seen": 30496680, + "step": 52570 + }, + { + "epoch": 7.830652368185881, + "grad_norm": 0.05935756117105484, + "learning_rate": 3.813492232684182e-05, + "loss": 0.1519, + "num_input_tokens_seen": 30499560, + "step": 52575 + }, + { + "epoch": 7.831397080726839, + "grad_norm": 0.5396580100059509, + "learning_rate": 3.813215742200992e-05, + "loss": 0.1175, + "num_input_tokens_seen": 30502504, + "step": 52580 + }, + { + "epoch": 7.832141793267798, + "grad_norm": 0.1837654560804367, + "learning_rate": 3.8129392295323566e-05, + "loss": 0.092, + "num_input_tokens_seen": 30505480, + "step": 52585 + }, + { + "epoch": 7.832886505808758, + "grad_norm": 3.843257427215576, + "learning_rate": 3.812662694682946e-05, + "loss": 0.0302, + "num_input_tokens_seen": 30508264, + "step": 52590 + }, + { + "epoch": 7.833631218349717, + "grad_norm": 0.8255858421325684, + "learning_rate": 3.8123861376574344e-05, + "loss": 0.3052, + "num_input_tokens_seen": 30511016, + "step": 52595 + }, + { + "epoch": 7.834375930890676, + "grad_norm": 80.00532531738281, + "learning_rate": 3.8121095584604925e-05, + "loss": 0.2903, + "num_input_tokens_seen": 30513864, + "step": 52600 + }, + { + "epoch": 7.835120643431635, + "grad_norm": 26.434186935424805, + "learning_rate": 3.811832957096794e-05, + "loss": 0.2616, + "num_input_tokens_seen": 30516744, + "step": 52605 + }, + { + "epoch": 7.835865355972595, + "grad_norm": 36.24654006958008, + "learning_rate": 3.81155633357101e-05, + "loss": 0.4273, + "num_input_tokens_seen": 30519656, + "step": 52610 + }, + { + "epoch": 7.836610068513554, + "grad_norm": 0.2026398479938507, + "learning_rate": 3.8112796878878155e-05, + "loss": 0.0345, + "num_input_tokens_seen": 30522504, + "step": 52615 + }, + { + "epoch": 7.837354781054513, + "grad_norm": 0.044524889439344406, + "learning_rate": 3.811003020051883e-05, + "loss": 0.121, + "num_input_tokens_seen": 30525480, + "step": 52620 + }, + { + "epoch": 7.838099493595472, + "grad_norm": 0.21818722784519196, + "learning_rate": 3.8107263300678874e-05, + "loss": 0.0744, + "num_input_tokens_seen": 30528488, + "step": 52625 + }, + { + "epoch": 7.838844206136431, + "grad_norm": 16.17515754699707, + "learning_rate": 3.810449617940502e-05, + "loss": 0.2483, + "num_input_tokens_seen": 30531624, + "step": 52630 + }, + { + "epoch": 7.83958891867739, + "grad_norm": 0.02987341396510601, + "learning_rate": 3.810172883674402e-05, + "loss": 0.0615, + "num_input_tokens_seen": 30534856, + "step": 52635 + }, + { + "epoch": 7.84033363121835, + "grad_norm": 17.835044860839844, + "learning_rate": 3.809896127274264e-05, + "loss": 0.1581, + "num_input_tokens_seen": 30537960, + "step": 52640 + }, + { + "epoch": 7.841078343759309, + "grad_norm": 0.0743437334895134, + "learning_rate": 3.8096193487447604e-05, + "loss": 0.2782, + "num_input_tokens_seen": 30541000, + "step": 52645 + }, + { + "epoch": 7.841823056300268, + "grad_norm": 0.05808002129197121, + "learning_rate": 3.8093425480905706e-05, + "loss": 0.022, + "num_input_tokens_seen": 30544072, + "step": 52650 + }, + { + "epoch": 7.842567768841227, + "grad_norm": 0.23340386152267456, + "learning_rate": 3.809065725316368e-05, + "loss": 0.024, + "num_input_tokens_seen": 30546920, + "step": 52655 + }, + { + "epoch": 7.843312481382187, + "grad_norm": 0.6760352849960327, + "learning_rate": 3.808788880426831e-05, + "loss": 0.042, + "num_input_tokens_seen": 30550184, + "step": 52660 + }, + { + "epoch": 7.844057193923145, + "grad_norm": 0.05374456197023392, + "learning_rate": 3.8085120134266364e-05, + "loss": 0.0948, + "num_input_tokens_seen": 30553480, + "step": 52665 + }, + { + "epoch": 7.844801906464105, + "grad_norm": 0.22015380859375, + "learning_rate": 3.8082351243204605e-05, + "loss": 0.3529, + "num_input_tokens_seen": 30556104, + "step": 52670 + }, + { + "epoch": 7.845546619005064, + "grad_norm": 24.336538314819336, + "learning_rate": 3.8079582131129826e-05, + "loss": 0.0666, + "num_input_tokens_seen": 30559368, + "step": 52675 + }, + { + "epoch": 7.846291331546023, + "grad_norm": 19.440441131591797, + "learning_rate": 3.8076812798088796e-05, + "loss": 0.409, + "num_input_tokens_seen": 30562152, + "step": 52680 + }, + { + "epoch": 7.847036044086982, + "grad_norm": 44.38561248779297, + "learning_rate": 3.80740432441283e-05, + "loss": 0.164, + "num_input_tokens_seen": 30565288, + "step": 52685 + }, + { + "epoch": 7.847780756627942, + "grad_norm": 4.256392478942871, + "learning_rate": 3.807127346929514e-05, + "loss": 0.0422, + "num_input_tokens_seen": 30567976, + "step": 52690 + }, + { + "epoch": 7.848525469168901, + "grad_norm": 0.023590002208948135, + "learning_rate": 3.806850347363609e-05, + "loss": 0.0019, + "num_input_tokens_seen": 30570728, + "step": 52695 + }, + { + "epoch": 7.84927018170986, + "grad_norm": 0.011124872602522373, + "learning_rate": 3.8065733257197964e-05, + "loss": 0.1526, + "num_input_tokens_seen": 30573544, + "step": 52700 + }, + { + "epoch": 7.850014894250819, + "grad_norm": 0.06126686930656433, + "learning_rate": 3.806296282002756e-05, + "loss": 0.069, + "num_input_tokens_seen": 30576680, + "step": 52705 + }, + { + "epoch": 7.850759606791779, + "grad_norm": 1.6455727815628052, + "learning_rate": 3.8060192162171664e-05, + "loss": 0.0011, + "num_input_tokens_seen": 30579688, + "step": 52710 + }, + { + "epoch": 7.851504319332737, + "grad_norm": 11.309823989868164, + "learning_rate": 3.80574212836771e-05, + "loss": 0.2719, + "num_input_tokens_seen": 30582216, + "step": 52715 + }, + { + "epoch": 7.852249031873697, + "grad_norm": 0.08128667622804642, + "learning_rate": 3.805465018459067e-05, + "loss": 0.0999, + "num_input_tokens_seen": 30585192, + "step": 52720 + }, + { + "epoch": 7.852993744414656, + "grad_norm": 0.07380218803882599, + "learning_rate": 3.8051878864959194e-05, + "loss": 0.4298, + "num_input_tokens_seen": 30587944, + "step": 52725 + }, + { + "epoch": 7.8537384569556155, + "grad_norm": 0.06542999297380447, + "learning_rate": 3.804910732482949e-05, + "loss": 0.2231, + "num_input_tokens_seen": 30590792, + "step": 52730 + }, + { + "epoch": 7.854483169496574, + "grad_norm": 0.43248680233955383, + "learning_rate": 3.804633556424839e-05, + "loss": 0.1507, + "num_input_tokens_seen": 30593640, + "step": 52735 + }, + { + "epoch": 7.855227882037534, + "grad_norm": 0.1060970202088356, + "learning_rate": 3.804356358326271e-05, + "loss": 0.017, + "num_input_tokens_seen": 30596744, + "step": 52740 + }, + { + "epoch": 7.855972594578493, + "grad_norm": 24.535003662109375, + "learning_rate": 3.804079138191927e-05, + "loss": 0.137, + "num_input_tokens_seen": 30599720, + "step": 52745 + }, + { + "epoch": 7.856717307119452, + "grad_norm": 0.5392447710037231, + "learning_rate": 3.803801896026491e-05, + "loss": 0.0929, + "num_input_tokens_seen": 30602760, + "step": 52750 + }, + { + "epoch": 7.857462019660411, + "grad_norm": 0.02589552104473114, + "learning_rate": 3.803524631834648e-05, + "loss": 0.1755, + "num_input_tokens_seen": 30605544, + "step": 52755 + }, + { + "epoch": 7.858206732201371, + "grad_norm": 0.012392038479447365, + "learning_rate": 3.8032473456210805e-05, + "loss": 0.0387, + "num_input_tokens_seen": 30608328, + "step": 52760 + }, + { + "epoch": 7.858951444742329, + "grad_norm": 0.04222080111503601, + "learning_rate": 3.8029700373904744e-05, + "loss": 0.2476, + "num_input_tokens_seen": 30611112, + "step": 52765 + }, + { + "epoch": 7.859696157283288, + "grad_norm": 0.01950419694185257, + "learning_rate": 3.802692707147514e-05, + "loss": 0.0375, + "num_input_tokens_seen": 30613896, + "step": 52770 + }, + { + "epoch": 7.860440869824248, + "grad_norm": 23.78727149963379, + "learning_rate": 3.8024153548968835e-05, + "loss": 0.1655, + "num_input_tokens_seen": 30616648, + "step": 52775 + }, + { + "epoch": 7.8611855823652075, + "grad_norm": 0.029732191935181618, + "learning_rate": 3.80213798064327e-05, + "loss": 0.3992, + "num_input_tokens_seen": 30619336, + "step": 52780 + }, + { + "epoch": 7.861930294906166, + "grad_norm": 0.012170393019914627, + "learning_rate": 3.801860584391358e-05, + "loss": 0.3885, + "num_input_tokens_seen": 30622056, + "step": 52785 + }, + { + "epoch": 7.862675007447125, + "grad_norm": 0.01486385241150856, + "learning_rate": 3.801583166145835e-05, + "loss": 0.0134, + "num_input_tokens_seen": 30624904, + "step": 52790 + }, + { + "epoch": 7.863419719988085, + "grad_norm": 32.961509704589844, + "learning_rate": 3.801305725911387e-05, + "loss": 0.1629, + "num_input_tokens_seen": 30628040, + "step": 52795 + }, + { + "epoch": 7.864164432529043, + "grad_norm": 0.010941648855805397, + "learning_rate": 3.8010282636927016e-05, + "loss": 0.169, + "num_input_tokens_seen": 30631016, + "step": 52800 + }, + { + "epoch": 7.864909145070003, + "grad_norm": 52.43606185913086, + "learning_rate": 3.800750779494466e-05, + "loss": 0.3483, + "num_input_tokens_seen": 30633832, + "step": 52805 + }, + { + "epoch": 7.865653857610962, + "grad_norm": 0.0072128865867853165, + "learning_rate": 3.8004732733213674e-05, + "loss": 0.2764, + "num_input_tokens_seen": 30636456, + "step": 52810 + }, + { + "epoch": 7.8663985701519215, + "grad_norm": 13.906529426574707, + "learning_rate": 3.8001957451780956e-05, + "loss": 0.3356, + "num_input_tokens_seen": 30639336, + "step": 52815 + }, + { + "epoch": 7.86714328269288, + "grad_norm": 0.07241978496313095, + "learning_rate": 3.799918195069338e-05, + "loss": 0.1233, + "num_input_tokens_seen": 30642408, + "step": 52820 + }, + { + "epoch": 7.86788799523384, + "grad_norm": 1.0666375160217285, + "learning_rate": 3.799640622999784e-05, + "loss": 0.2392, + "num_input_tokens_seen": 30644872, + "step": 52825 + }, + { + "epoch": 7.868632707774799, + "grad_norm": 121.53217315673828, + "learning_rate": 3.799363028974121e-05, + "loss": 0.2444, + "num_input_tokens_seen": 30647752, + "step": 52830 + }, + { + "epoch": 7.869377420315758, + "grad_norm": 0.016329510137438774, + "learning_rate": 3.799085412997041e-05, + "loss": 0.0527, + "num_input_tokens_seen": 30651144, + "step": 52835 + }, + { + "epoch": 7.870122132856717, + "grad_norm": 0.03294510394334793, + "learning_rate": 3.798807775073234e-05, + "loss": 0.1199, + "num_input_tokens_seen": 30653800, + "step": 52840 + }, + { + "epoch": 7.870866845397677, + "grad_norm": 79.08231353759766, + "learning_rate": 3.7985301152073896e-05, + "loss": 0.3006, + "num_input_tokens_seen": 30656680, + "step": 52845 + }, + { + "epoch": 7.871611557938635, + "grad_norm": 0.013252796605229378, + "learning_rate": 3.798252433404198e-05, + "loss": 0.2332, + "num_input_tokens_seen": 30659528, + "step": 52850 + }, + { + "epoch": 7.872356270479595, + "grad_norm": 38.28996658325195, + "learning_rate": 3.797974729668351e-05, + "loss": 0.6211, + "num_input_tokens_seen": 30662216, + "step": 52855 + }, + { + "epoch": 7.873100983020554, + "grad_norm": 54.31232452392578, + "learning_rate": 3.7976970040045404e-05, + "loss": 0.5886, + "num_input_tokens_seen": 30665000, + "step": 52860 + }, + { + "epoch": 7.8738456955615135, + "grad_norm": 47.66282653808594, + "learning_rate": 3.797419256417458e-05, + "loss": 0.2481, + "num_input_tokens_seen": 30667912, + "step": 52865 + }, + { + "epoch": 7.874590408102472, + "grad_norm": 0.016559768468141556, + "learning_rate": 3.797141486911796e-05, + "loss": 0.0805, + "num_input_tokens_seen": 30670824, + "step": 52870 + }, + { + "epoch": 7.875335120643432, + "grad_norm": 2.648267984390259, + "learning_rate": 3.796863695492247e-05, + "loss": 0.0831, + "num_input_tokens_seen": 30673512, + "step": 52875 + }, + { + "epoch": 7.876079833184391, + "grad_norm": 26.085363388061523, + "learning_rate": 3.796585882163503e-05, + "loss": 0.1299, + "num_input_tokens_seen": 30676456, + "step": 52880 + }, + { + "epoch": 7.87682454572535, + "grad_norm": 0.03588768467307091, + "learning_rate": 3.796308046930258e-05, + "loss": 0.5468, + "num_input_tokens_seen": 30679176, + "step": 52885 + }, + { + "epoch": 7.877569258266309, + "grad_norm": 0.1125028133392334, + "learning_rate": 3.796030189797207e-05, + "loss": 0.1619, + "num_input_tokens_seen": 30682120, + "step": 52890 + }, + { + "epoch": 7.878313970807269, + "grad_norm": 34.48371887207031, + "learning_rate": 3.795752310769044e-05, + "loss": 0.3306, + "num_input_tokens_seen": 30684904, + "step": 52895 + }, + { + "epoch": 7.8790586833482275, + "grad_norm": 40.22383499145508, + "learning_rate": 3.795474409850462e-05, + "loss": 0.2371, + "num_input_tokens_seen": 30687624, + "step": 52900 + }, + { + "epoch": 7.879803395889187, + "grad_norm": 27.269786834716797, + "learning_rate": 3.795196487046157e-05, + "loss": 0.2507, + "num_input_tokens_seen": 30690504, + "step": 52905 + }, + { + "epoch": 7.880548108430146, + "grad_norm": 0.2934857904911041, + "learning_rate": 3.794918542360822e-05, + "loss": 0.1255, + "num_input_tokens_seen": 30693320, + "step": 52910 + }, + { + "epoch": 7.8812928209711055, + "grad_norm": 0.052957601845264435, + "learning_rate": 3.7946405757991556e-05, + "loss": 0.0826, + "num_input_tokens_seen": 30696264, + "step": 52915 + }, + { + "epoch": 7.882037533512064, + "grad_norm": 0.1229851022362709, + "learning_rate": 3.7943625873658515e-05, + "loss": 0.0292, + "num_input_tokens_seen": 30699240, + "step": 52920 + }, + { + "epoch": 7.882782246053024, + "grad_norm": 0.604164183139801, + "learning_rate": 3.7940845770656085e-05, + "loss": 0.0163, + "num_input_tokens_seen": 30702056, + "step": 52925 + }, + { + "epoch": 7.883526958593983, + "grad_norm": 28.89154052734375, + "learning_rate": 3.7938065449031206e-05, + "loss": 0.3889, + "num_input_tokens_seen": 30706120, + "step": 52930 + }, + { + "epoch": 7.884271671134941, + "grad_norm": 0.0614178329706192, + "learning_rate": 3.793528490883087e-05, + "loss": 0.1418, + "num_input_tokens_seen": 30709288, + "step": 52935 + }, + { + "epoch": 7.885016383675901, + "grad_norm": 0.02559073455631733, + "learning_rate": 3.7932504150102045e-05, + "loss": 0.3149, + "num_input_tokens_seen": 30711976, + "step": 52940 + }, + { + "epoch": 7.885761096216861, + "grad_norm": 112.53276824951172, + "learning_rate": 3.7929723172891696e-05, + "loss": 0.4867, + "num_input_tokens_seen": 30714952, + "step": 52945 + }, + { + "epoch": 7.8865058087578195, + "grad_norm": 16.82456398010254, + "learning_rate": 3.792694197724682e-05, + "loss": 0.4678, + "num_input_tokens_seen": 30718024, + "step": 52950 + }, + { + "epoch": 7.887250521298778, + "grad_norm": 1.8653812408447266, + "learning_rate": 3.7924160563214395e-05, + "loss": 0.3309, + "num_input_tokens_seen": 30720840, + "step": 52955 + }, + { + "epoch": 7.887995233839738, + "grad_norm": 12.949161529541016, + "learning_rate": 3.792137893084141e-05, + "loss": 0.4156, + "num_input_tokens_seen": 30723400, + "step": 52960 + }, + { + "epoch": 7.8887399463806975, + "grad_norm": 5.700454235076904, + "learning_rate": 3.791859708017486e-05, + "loss": 0.1121, + "num_input_tokens_seen": 30726152, + "step": 52965 + }, + { + "epoch": 7.889484658921656, + "grad_norm": 56.9820556640625, + "learning_rate": 3.791581501126175e-05, + "loss": 0.2129, + "num_input_tokens_seen": 30729224, + "step": 52970 + }, + { + "epoch": 7.890229371462615, + "grad_norm": 0.15039439499378204, + "learning_rate": 3.791303272414907e-05, + "loss": 0.3269, + "num_input_tokens_seen": 30732072, + "step": 52975 + }, + { + "epoch": 7.890974084003575, + "grad_norm": 7.233654975891113, + "learning_rate": 3.791025021888382e-05, + "loss": 0.0176, + "num_input_tokens_seen": 30734728, + "step": 52980 + }, + { + "epoch": 7.8917187965445335, + "grad_norm": 0.7584672570228577, + "learning_rate": 3.7907467495513026e-05, + "loss": 0.3417, + "num_input_tokens_seen": 30737640, + "step": 52985 + }, + { + "epoch": 7.892463509085493, + "grad_norm": 39.83551788330078, + "learning_rate": 3.790468455408368e-05, + "loss": 0.2791, + "num_input_tokens_seen": 30740648, + "step": 52990 + }, + { + "epoch": 7.893208221626452, + "grad_norm": 31.893333435058594, + "learning_rate": 3.79019013946428e-05, + "loss": 0.1083, + "num_input_tokens_seen": 30743336, + "step": 52995 + }, + { + "epoch": 7.8939529341674115, + "grad_norm": 0.2058742642402649, + "learning_rate": 3.789911801723742e-05, + "loss": 0.1659, + "num_input_tokens_seen": 30746344, + "step": 53000 + }, + { + "epoch": 7.89469764670837, + "grad_norm": 34.76355743408203, + "learning_rate": 3.789633442191455e-05, + "loss": 0.148, + "num_input_tokens_seen": 30749384, + "step": 53005 + }, + { + "epoch": 7.89544235924933, + "grad_norm": 14.775565147399902, + "learning_rate": 3.7893550608721206e-05, + "loss": 0.3425, + "num_input_tokens_seen": 30752456, + "step": 53010 + }, + { + "epoch": 7.896187071790289, + "grad_norm": 30.39360809326172, + "learning_rate": 3.789076657770444e-05, + "loss": 0.104, + "num_input_tokens_seen": 30755112, + "step": 53015 + }, + { + "epoch": 7.896931784331248, + "grad_norm": 9.288470268249512, + "learning_rate": 3.788798232891127e-05, + "loss": 0.0631, + "num_input_tokens_seen": 30757960, + "step": 53020 + }, + { + "epoch": 7.897676496872207, + "grad_norm": 0.07101494073867798, + "learning_rate": 3.788519786238873e-05, + "loss": 0.1699, + "num_input_tokens_seen": 30760712, + "step": 53025 + }, + { + "epoch": 7.898421209413167, + "grad_norm": 0.15484866499900818, + "learning_rate": 3.788241317818388e-05, + "loss": 0.0736, + "num_input_tokens_seen": 30763528, + "step": 53030 + }, + { + "epoch": 7.8991659219541255, + "grad_norm": 0.06250247359275818, + "learning_rate": 3.7879628276343746e-05, + "loss": 0.3711, + "num_input_tokens_seen": 30766152, + "step": 53035 + }, + { + "epoch": 7.899910634495085, + "grad_norm": 22.60474395751953, + "learning_rate": 3.787684315691539e-05, + "loss": 0.2818, + "num_input_tokens_seen": 30769352, + "step": 53040 + }, + { + "epoch": 7.900655347036044, + "grad_norm": 25.54070281982422, + "learning_rate": 3.787405781994584e-05, + "loss": 0.0728, + "num_input_tokens_seen": 30771944, + "step": 53045 + }, + { + "epoch": 7.9014000595770035, + "grad_norm": 0.021172964945435524, + "learning_rate": 3.7871272265482184e-05, + "loss": 0.1074, + "num_input_tokens_seen": 30775048, + "step": 53050 + }, + { + "epoch": 7.902144772117962, + "grad_norm": 0.8605991005897522, + "learning_rate": 3.786848649357145e-05, + "loss": 0.0075, + "num_input_tokens_seen": 30777928, + "step": 53055 + }, + { + "epoch": 7.902889484658922, + "grad_norm": 0.012272133491933346, + "learning_rate": 3.786570050426073e-05, + "loss": 0.2698, + "num_input_tokens_seen": 30780776, + "step": 53060 + }, + { + "epoch": 7.903634197199881, + "grad_norm": 0.3615952730178833, + "learning_rate": 3.7862914297597075e-05, + "loss": 0.1635, + "num_input_tokens_seen": 30783624, + "step": 53065 + }, + { + "epoch": 7.90437890974084, + "grad_norm": 0.6737632751464844, + "learning_rate": 3.7860127873627546e-05, + "loss": 0.3478, + "num_input_tokens_seen": 30786312, + "step": 53070 + }, + { + "epoch": 7.905123622281799, + "grad_norm": 0.01969926990568638, + "learning_rate": 3.785734123239924e-05, + "loss": 0.0024, + "num_input_tokens_seen": 30789320, + "step": 53075 + }, + { + "epoch": 7.905868334822759, + "grad_norm": 0.1319669485092163, + "learning_rate": 3.785455437395921e-05, + "loss": 0.1861, + "num_input_tokens_seen": 30792584, + "step": 53080 + }, + { + "epoch": 7.9066130473637175, + "grad_norm": 23.90718650817871, + "learning_rate": 3.7851767298354554e-05, + "loss": 0.1862, + "num_input_tokens_seen": 30795176, + "step": 53085 + }, + { + "epoch": 7.907357759904677, + "grad_norm": 0.05540425702929497, + "learning_rate": 3.7848980005632344e-05, + "loss": 0.0032, + "num_input_tokens_seen": 30798184, + "step": 53090 + }, + { + "epoch": 7.908102472445636, + "grad_norm": 2.239901304244995, + "learning_rate": 3.7846192495839686e-05, + "loss": 0.0085, + "num_input_tokens_seen": 30801192, + "step": 53095 + }, + { + "epoch": 7.908847184986596, + "grad_norm": 0.6305398941040039, + "learning_rate": 3.784340476902366e-05, + "loss": 0.0834, + "num_input_tokens_seen": 30804168, + "step": 53100 + }, + { + "epoch": 7.909591897527554, + "grad_norm": 22.6121768951416, + "learning_rate": 3.7840616825231365e-05, + "loss": 0.0817, + "num_input_tokens_seen": 30807112, + "step": 53105 + }, + { + "epoch": 7.910336610068514, + "grad_norm": 0.013236387632787228, + "learning_rate": 3.783782866450989e-05, + "loss": 0.2289, + "num_input_tokens_seen": 30809768, + "step": 53110 + }, + { + "epoch": 7.911081322609473, + "grad_norm": 0.9846711158752441, + "learning_rate": 3.783504028690635e-05, + "loss": 0.1258, + "num_input_tokens_seen": 30812552, + "step": 53115 + }, + { + "epoch": 7.9118260351504315, + "grad_norm": 0.006547214929014444, + "learning_rate": 3.783225169246786e-05, + "loss": 0.3102, + "num_input_tokens_seen": 30815528, + "step": 53120 + }, + { + "epoch": 7.912570747691391, + "grad_norm": 0.11664706468582153, + "learning_rate": 3.782946288124151e-05, + "loss": 0.0419, + "num_input_tokens_seen": 30818504, + "step": 53125 + }, + { + "epoch": 7.913315460232351, + "grad_norm": 5.154588222503662, + "learning_rate": 3.782667385327442e-05, + "loss": 0.1461, + "num_input_tokens_seen": 30821416, + "step": 53130 + }, + { + "epoch": 7.9140601727733095, + "grad_norm": 27.977622985839844, + "learning_rate": 3.782388460861372e-05, + "loss": 0.1035, + "num_input_tokens_seen": 30824360, + "step": 53135 + }, + { + "epoch": 7.914804885314268, + "grad_norm": 0.016396602615714073, + "learning_rate": 3.7821095147306527e-05, + "loss": 0.0955, + "num_input_tokens_seen": 30827432, + "step": 53140 + }, + { + "epoch": 7.915549597855228, + "grad_norm": 8.547431945800781, + "learning_rate": 3.781830546939996e-05, + "loss": 0.4892, + "num_input_tokens_seen": 30830280, + "step": 53145 + }, + { + "epoch": 7.916294310396187, + "grad_norm": 25.301002502441406, + "learning_rate": 3.781551557494115e-05, + "loss": 0.2667, + "num_input_tokens_seen": 30832968, + "step": 53150 + }, + { + "epoch": 7.917039022937146, + "grad_norm": 0.04786112904548645, + "learning_rate": 3.7812725463977225e-05, + "loss": 0.5828, + "num_input_tokens_seen": 30835752, + "step": 53155 + }, + { + "epoch": 7.917783735478105, + "grad_norm": 0.09906522184610367, + "learning_rate": 3.7809935136555326e-05, + "loss": 0.0065, + "num_input_tokens_seen": 30838824, + "step": 53160 + }, + { + "epoch": 7.918528448019065, + "grad_norm": 60.51155471801758, + "learning_rate": 3.780714459272259e-05, + "loss": 0.2605, + "num_input_tokens_seen": 30841864, + "step": 53165 + }, + { + "epoch": 7.9192731605600235, + "grad_norm": 0.17049500346183777, + "learning_rate": 3.780435383252617e-05, + "loss": 0.2752, + "num_input_tokens_seen": 30844872, + "step": 53170 + }, + { + "epoch": 7.920017873100983, + "grad_norm": 3.055194854736328, + "learning_rate": 3.78015628560132e-05, + "loss": 0.21, + "num_input_tokens_seen": 30847784, + "step": 53175 + }, + { + "epoch": 7.920762585641942, + "grad_norm": 72.19515991210938, + "learning_rate": 3.779877166323084e-05, + "loss": 0.1374, + "num_input_tokens_seen": 30850696, + "step": 53180 + }, + { + "epoch": 7.921507298182902, + "grad_norm": 0.09168101847171783, + "learning_rate": 3.779598025422624e-05, + "loss": 0.1834, + "num_input_tokens_seen": 30853608, + "step": 53185 + }, + { + "epoch": 7.92225201072386, + "grad_norm": 85.70696258544922, + "learning_rate": 3.779318862904656e-05, + "loss": 0.0789, + "num_input_tokens_seen": 30856648, + "step": 53190 + }, + { + "epoch": 7.92299672326482, + "grad_norm": 0.03392518684267998, + "learning_rate": 3.779039678773896e-05, + "loss": 0.214, + "num_input_tokens_seen": 30859464, + "step": 53195 + }, + { + "epoch": 7.923741435805779, + "grad_norm": 0.06262841075658798, + "learning_rate": 3.77876047303506e-05, + "loss": 0.4575, + "num_input_tokens_seen": 30862152, + "step": 53200 + }, + { + "epoch": 7.924486148346738, + "grad_norm": 0.19933778047561646, + "learning_rate": 3.778481245692866e-05, + "loss": 0.4062, + "num_input_tokens_seen": 30865128, + "step": 53205 + }, + { + "epoch": 7.925230860887697, + "grad_norm": 82.57337951660156, + "learning_rate": 3.7782019967520305e-05, + "loss": 0.2176, + "num_input_tokens_seen": 30867976, + "step": 53210 + }, + { + "epoch": 7.925975573428657, + "grad_norm": 52.87886428833008, + "learning_rate": 3.777922726217271e-05, + "loss": 0.0606, + "num_input_tokens_seen": 30870856, + "step": 53215 + }, + { + "epoch": 7.9267202859696155, + "grad_norm": 0.09157869964838028, + "learning_rate": 3.7776434340933065e-05, + "loss": 0.2839, + "num_input_tokens_seen": 30873736, + "step": 53220 + }, + { + "epoch": 7.927464998510575, + "grad_norm": 0.04377113655209541, + "learning_rate": 3.7773641203848554e-05, + "loss": 0.1156, + "num_input_tokens_seen": 30876712, + "step": 53225 + }, + { + "epoch": 7.928209711051534, + "grad_norm": 0.09539080411195755, + "learning_rate": 3.7770847850966354e-05, + "loss": 0.0062, + "num_input_tokens_seen": 30879624, + "step": 53230 + }, + { + "epoch": 7.928954423592494, + "grad_norm": 0.09567199647426605, + "learning_rate": 3.7768054282333655e-05, + "loss": 0.159, + "num_input_tokens_seen": 30882088, + "step": 53235 + }, + { + "epoch": 7.929699136133452, + "grad_norm": 1.3636020421981812, + "learning_rate": 3.776526049799765e-05, + "loss": 0.0223, + "num_input_tokens_seen": 30884680, + "step": 53240 + }, + { + "epoch": 7.930443848674412, + "grad_norm": 23.90069580078125, + "learning_rate": 3.7762466498005544e-05, + "loss": 0.1709, + "num_input_tokens_seen": 30887784, + "step": 53245 + }, + { + "epoch": 7.931188561215371, + "grad_norm": 40.51247787475586, + "learning_rate": 3.7759672282404546e-05, + "loss": 0.333, + "num_input_tokens_seen": 30890440, + "step": 53250 + }, + { + "epoch": 7.93193327375633, + "grad_norm": 0.2868245244026184, + "learning_rate": 3.775687785124185e-05, + "loss": 0.0014, + "num_input_tokens_seen": 30893192, + "step": 53255 + }, + { + "epoch": 7.932677986297289, + "grad_norm": 3.033787727355957, + "learning_rate": 3.775408320456466e-05, + "loss": 0.1964, + "num_input_tokens_seen": 30896328, + "step": 53260 + }, + { + "epoch": 7.933422698838249, + "grad_norm": 17.552553176879883, + "learning_rate": 3.775128834242021e-05, + "loss": 0.5205, + "num_input_tokens_seen": 30899336, + "step": 53265 + }, + { + "epoch": 7.934167411379208, + "grad_norm": 0.11808697134256363, + "learning_rate": 3.77484932648557e-05, + "loss": 0.0029, + "num_input_tokens_seen": 30902248, + "step": 53270 + }, + { + "epoch": 7.934912123920167, + "grad_norm": 0.030112972483038902, + "learning_rate": 3.774569797191835e-05, + "loss": 0.0292, + "num_input_tokens_seen": 30905448, + "step": 53275 + }, + { + "epoch": 7.935656836461126, + "grad_norm": 0.08476726710796356, + "learning_rate": 3.774290246365539e-05, + "loss": 0.0502, + "num_input_tokens_seen": 30908392, + "step": 53280 + }, + { + "epoch": 7.936401549002085, + "grad_norm": 38.901737213134766, + "learning_rate": 3.774010674011404e-05, + "loss": 0.5295, + "num_input_tokens_seen": 30911208, + "step": 53285 + }, + { + "epoch": 7.937146261543044, + "grad_norm": 0.14156119525432587, + "learning_rate": 3.773731080134154e-05, + "loss": 0.1305, + "num_input_tokens_seen": 30914120, + "step": 53290 + }, + { + "epoch": 7.937890974084004, + "grad_norm": 0.3493802547454834, + "learning_rate": 3.7734514647385114e-05, + "loss": 0.0411, + "num_input_tokens_seen": 30917096, + "step": 53295 + }, + { + "epoch": 7.938635686624963, + "grad_norm": 0.025693612173199654, + "learning_rate": 3.773171827829201e-05, + "loss": 0.1436, + "num_input_tokens_seen": 30920200, + "step": 53300 + }, + { + "epoch": 7.9393803991659215, + "grad_norm": 0.03840792179107666, + "learning_rate": 3.772892169410947e-05, + "loss": 0.0952, + "num_input_tokens_seen": 30923368, + "step": 53305 + }, + { + "epoch": 7.940125111706881, + "grad_norm": 0.042637743055820465, + "learning_rate": 3.772612489488473e-05, + "loss": 0.0103, + "num_input_tokens_seen": 30926056, + "step": 53310 + }, + { + "epoch": 7.940869824247841, + "grad_norm": 10.609922409057617, + "learning_rate": 3.772332788066504e-05, + "loss": 0.3037, + "num_input_tokens_seen": 30929032, + "step": 53315 + }, + { + "epoch": 7.9416145367888, + "grad_norm": 12.065690994262695, + "learning_rate": 3.772053065149766e-05, + "loss": 0.5635, + "num_input_tokens_seen": 30931784, + "step": 53320 + }, + { + "epoch": 7.942359249329758, + "grad_norm": 0.4672683775424957, + "learning_rate": 3.771773320742984e-05, + "loss": 0.1057, + "num_input_tokens_seen": 30934600, + "step": 53325 + }, + { + "epoch": 7.943103961870718, + "grad_norm": 10.782052040100098, + "learning_rate": 3.7714935548508846e-05, + "loss": 1.1184, + "num_input_tokens_seen": 30937352, + "step": 53330 + }, + { + "epoch": 7.943848674411677, + "grad_norm": 0.02583605796098709, + "learning_rate": 3.771213767478194e-05, + "loss": 0.0363, + "num_input_tokens_seen": 30940264, + "step": 53335 + }, + { + "epoch": 7.944593386952636, + "grad_norm": 0.021067429333925247, + "learning_rate": 3.770933958629639e-05, + "loss": 0.1572, + "num_input_tokens_seen": 30942888, + "step": 53340 + }, + { + "epoch": 7.945338099493595, + "grad_norm": 0.1791614592075348, + "learning_rate": 3.7706541283099466e-05, + "loss": 0.0416, + "num_input_tokens_seen": 30945672, + "step": 53345 + }, + { + "epoch": 7.946082812034555, + "grad_norm": 120.20698547363281, + "learning_rate": 3.7703742765238436e-05, + "loss": 0.2017, + "num_input_tokens_seen": 30948520, + "step": 53350 + }, + { + "epoch": 7.946827524575514, + "grad_norm": 0.03845952823758125, + "learning_rate": 3.770094403276059e-05, + "loss": 0.1678, + "num_input_tokens_seen": 30951528, + "step": 53355 + }, + { + "epoch": 7.947572237116473, + "grad_norm": 54.2266845703125, + "learning_rate": 3.7698145085713196e-05, + "loss": 0.2958, + "num_input_tokens_seen": 30954408, + "step": 53360 + }, + { + "epoch": 7.948316949657432, + "grad_norm": 1.460381269454956, + "learning_rate": 3.7695345924143555e-05, + "loss": 0.2888, + "num_input_tokens_seen": 30957352, + "step": 53365 + }, + { + "epoch": 7.949061662198392, + "grad_norm": 53.002811431884766, + "learning_rate": 3.769254654809894e-05, + "loss": 0.233, + "num_input_tokens_seen": 30960168, + "step": 53370 + }, + { + "epoch": 7.94980637473935, + "grad_norm": 4.549907207489014, + "learning_rate": 3.768974695762665e-05, + "loss": 0.0558, + "num_input_tokens_seen": 30963208, + "step": 53375 + }, + { + "epoch": 7.95055108728031, + "grad_norm": 6.592000484466553, + "learning_rate": 3.768694715277398e-05, + "loss": 0.0259, + "num_input_tokens_seen": 30965928, + "step": 53380 + }, + { + "epoch": 7.951295799821269, + "grad_norm": 41.444576263427734, + "learning_rate": 3.7684147133588245e-05, + "loss": 0.1488, + "num_input_tokens_seen": 30968936, + "step": 53385 + }, + { + "epoch": 7.952040512362228, + "grad_norm": 54.50252151489258, + "learning_rate": 3.7681346900116726e-05, + "loss": 0.1821, + "num_input_tokens_seen": 30971688, + "step": 53390 + }, + { + "epoch": 7.952785224903187, + "grad_norm": 127.34130096435547, + "learning_rate": 3.7678546452406736e-05, + "loss": 0.1485, + "num_input_tokens_seen": 30974504, + "step": 53395 + }, + { + "epoch": 7.953529937444147, + "grad_norm": 21.514535903930664, + "learning_rate": 3.76757457905056e-05, + "loss": 0.14, + "num_input_tokens_seen": 30977320, + "step": 53400 + }, + { + "epoch": 7.954274649985106, + "grad_norm": 153.25889587402344, + "learning_rate": 3.767294491446062e-05, + "loss": 0.4686, + "num_input_tokens_seen": 30980136, + "step": 53405 + }, + { + "epoch": 7.955019362526065, + "grad_norm": 23.314697265625, + "learning_rate": 3.7670143824319116e-05, + "loss": 0.2551, + "num_input_tokens_seen": 30982824, + "step": 53410 + }, + { + "epoch": 7.955764075067024, + "grad_norm": 0.09720434248447418, + "learning_rate": 3.76673425201284e-05, + "loss": 0.1312, + "num_input_tokens_seen": 30985672, + "step": 53415 + }, + { + "epoch": 7.956508787607984, + "grad_norm": 31.70758819580078, + "learning_rate": 3.766454100193581e-05, + "loss": 0.2793, + "num_input_tokens_seen": 30988552, + "step": 53420 + }, + { + "epoch": 7.957253500148942, + "grad_norm": 82.3893051147461, + "learning_rate": 3.7661739269788687e-05, + "loss": 0.1008, + "num_input_tokens_seen": 30991432, + "step": 53425 + }, + { + "epoch": 7.957998212689902, + "grad_norm": 0.3412584364414215, + "learning_rate": 3.765893732373433e-05, + "loss": 0.0773, + "num_input_tokens_seen": 30994376, + "step": 53430 + }, + { + "epoch": 7.958742925230861, + "grad_norm": 0.05726274102926254, + "learning_rate": 3.7656135163820105e-05, + "loss": 0.2087, + "num_input_tokens_seen": 30997608, + "step": 53435 + }, + { + "epoch": 7.9594876377718204, + "grad_norm": 0.7541176080703735, + "learning_rate": 3.7653332790093334e-05, + "loss": 0.0437, + "num_input_tokens_seen": 31000648, + "step": 53440 + }, + { + "epoch": 7.960232350312779, + "grad_norm": 0.36284878849983215, + "learning_rate": 3.765053020260137e-05, + "loss": 0.1737, + "num_input_tokens_seen": 31003400, + "step": 53445 + }, + { + "epoch": 7.960977062853738, + "grad_norm": 112.89175415039062, + "learning_rate": 3.764772740139154e-05, + "loss": 0.1533, + "num_input_tokens_seen": 31006248, + "step": 53450 + }, + { + "epoch": 7.961721775394698, + "grad_norm": 11.660761833190918, + "learning_rate": 3.7644924386511225e-05, + "loss": 0.1143, + "num_input_tokens_seen": 31009128, + "step": 53455 + }, + { + "epoch": 7.962466487935657, + "grad_norm": 0.5989315509796143, + "learning_rate": 3.7642121158007756e-05, + "loss": 0.1228, + "num_input_tokens_seen": 31011944, + "step": 53460 + }, + { + "epoch": 7.963211200476616, + "grad_norm": 9.99953556060791, + "learning_rate": 3.7639317715928514e-05, + "loss": 0.3614, + "num_input_tokens_seen": 31014696, + "step": 53465 + }, + { + "epoch": 7.963955913017575, + "grad_norm": 0.05979599803686142, + "learning_rate": 3.763651406032083e-05, + "loss": 0.0177, + "num_input_tokens_seen": 31017704, + "step": 53470 + }, + { + "epoch": 7.964700625558534, + "grad_norm": 0.579362690448761, + "learning_rate": 3.763371019123209e-05, + "loss": 0.2506, + "num_input_tokens_seen": 31020648, + "step": 53475 + }, + { + "epoch": 7.965445338099494, + "grad_norm": 0.019485967233777046, + "learning_rate": 3.7630906108709654e-05, + "loss": 0.2883, + "num_input_tokens_seen": 31023528, + "step": 53480 + }, + { + "epoch": 7.966190050640453, + "grad_norm": 0.058095090091228485, + "learning_rate": 3.76281018128009e-05, + "loss": 0.2591, + "num_input_tokens_seen": 31026056, + "step": 53485 + }, + { + "epoch": 7.966934763181412, + "grad_norm": 0.03037150576710701, + "learning_rate": 3.7625297303553195e-05, + "loss": 0.1095, + "num_input_tokens_seen": 31028808, + "step": 53490 + }, + { + "epoch": 7.967679475722371, + "grad_norm": 30.81161117553711, + "learning_rate": 3.762249258101392e-05, + "loss": 0.0224, + "num_input_tokens_seen": 31031688, + "step": 53495 + }, + { + "epoch": 7.96842418826333, + "grad_norm": 14.07437801361084, + "learning_rate": 3.761968764523048e-05, + "loss": 0.211, + "num_input_tokens_seen": 31034248, + "step": 53500 + }, + { + "epoch": 7.96916890080429, + "grad_norm": 0.16276465356349945, + "learning_rate": 3.761688249625024e-05, + "loss": 0.0044, + "num_input_tokens_seen": 31037320, + "step": 53505 + }, + { + "epoch": 7.969913613345248, + "grad_norm": 0.4917053282260895, + "learning_rate": 3.761407713412058e-05, + "loss": 0.1131, + "num_input_tokens_seen": 31040200, + "step": 53510 + }, + { + "epoch": 7.970658325886208, + "grad_norm": 0.1220049187541008, + "learning_rate": 3.761127155888891e-05, + "loss": 0.12, + "num_input_tokens_seen": 31042952, + "step": 53515 + }, + { + "epoch": 7.971403038427167, + "grad_norm": 52.209903717041016, + "learning_rate": 3.760846577060263e-05, + "loss": 0.1995, + "num_input_tokens_seen": 31045832, + "step": 53520 + }, + { + "epoch": 7.9721477509681264, + "grad_norm": 0.12489276379346848, + "learning_rate": 3.760565976930913e-05, + "loss": 0.3936, + "num_input_tokens_seen": 31049032, + "step": 53525 + }, + { + "epoch": 7.972892463509085, + "grad_norm": 29.00037384033203, + "learning_rate": 3.760285355505583e-05, + "loss": 0.7605, + "num_input_tokens_seen": 31051944, + "step": 53530 + }, + { + "epoch": 7.973637176050045, + "grad_norm": 0.031843654811382294, + "learning_rate": 3.760004712789012e-05, + "loss": 0.1731, + "num_input_tokens_seen": 31054536, + "step": 53535 + }, + { + "epoch": 7.974381888591004, + "grad_norm": 0.021722743287682533, + "learning_rate": 3.759724048785942e-05, + "loss": 0.2018, + "num_input_tokens_seen": 31057288, + "step": 53540 + }, + { + "epoch": 7.975126601131963, + "grad_norm": 0.20756715536117554, + "learning_rate": 3.759443363501115e-05, + "loss": 0.1291, + "num_input_tokens_seen": 31060264, + "step": 53545 + }, + { + "epoch": 7.975871313672922, + "grad_norm": 20.309860229492188, + "learning_rate": 3.759162656939271e-05, + "loss": 0.3529, + "num_input_tokens_seen": 31062920, + "step": 53550 + }, + { + "epoch": 7.976616026213882, + "grad_norm": 12.874190330505371, + "learning_rate": 3.758881929105155e-05, + "loss": 0.2584, + "num_input_tokens_seen": 31065704, + "step": 53555 + }, + { + "epoch": 7.97736073875484, + "grad_norm": 0.2582511007785797, + "learning_rate": 3.758601180003508e-05, + "loss": 0.0069, + "num_input_tokens_seen": 31069064, + "step": 53560 + }, + { + "epoch": 7.9781054512958, + "grad_norm": 0.009617382660508156, + "learning_rate": 3.758320409639074e-05, + "loss": 0.0454, + "num_input_tokens_seen": 31071912, + "step": 53565 + }, + { + "epoch": 7.978850163836759, + "grad_norm": 22.17896842956543, + "learning_rate": 3.758039618016595e-05, + "loss": 0.1236, + "num_input_tokens_seen": 31074568, + "step": 53570 + }, + { + "epoch": 7.9795948763777185, + "grad_norm": 0.015775641426444054, + "learning_rate": 3.757758805140814e-05, + "loss": 0.1669, + "num_input_tokens_seen": 31077480, + "step": 53575 + }, + { + "epoch": 7.980339588918677, + "grad_norm": 0.017431715503335, + "learning_rate": 3.757477971016478e-05, + "loss": 0.0135, + "num_input_tokens_seen": 31080296, + "step": 53580 + }, + { + "epoch": 7.981084301459637, + "grad_norm": 30.215229034423828, + "learning_rate": 3.7571971156483285e-05, + "loss": 0.4863, + "num_input_tokens_seen": 31083048, + "step": 53585 + }, + { + "epoch": 7.981829014000596, + "grad_norm": 30.305980682373047, + "learning_rate": 3.756916239041113e-05, + "loss": 0.0732, + "num_input_tokens_seen": 31085768, + "step": 53590 + }, + { + "epoch": 7.982573726541555, + "grad_norm": 20.90451431274414, + "learning_rate": 3.756635341199574e-05, + "loss": 0.1929, + "num_input_tokens_seen": 31088392, + "step": 53595 + }, + { + "epoch": 7.983318439082514, + "grad_norm": 0.5903618931770325, + "learning_rate": 3.756354422128459e-05, + "loss": 0.3882, + "num_input_tokens_seen": 31091368, + "step": 53600 + }, + { + "epoch": 7.984063151623474, + "grad_norm": 0.049939028918743134, + "learning_rate": 3.756073481832512e-05, + "loss": 0.3541, + "num_input_tokens_seen": 31094472, + "step": 53605 + }, + { + "epoch": 7.9848078641644324, + "grad_norm": 0.12829461693763733, + "learning_rate": 3.75579252031648e-05, + "loss": 0.0193, + "num_input_tokens_seen": 31097352, + "step": 53610 + }, + { + "epoch": 7.985552576705392, + "grad_norm": 19.33565902709961, + "learning_rate": 3.75551153758511e-05, + "loss": 0.3642, + "num_input_tokens_seen": 31100168, + "step": 53615 + }, + { + "epoch": 7.986297289246351, + "grad_norm": 0.028881536796689034, + "learning_rate": 3.755230533643148e-05, + "loss": 0.0089, + "num_input_tokens_seen": 31103272, + "step": 53620 + }, + { + "epoch": 7.9870420017873105, + "grad_norm": 13.422799110412598, + "learning_rate": 3.754949508495344e-05, + "loss": 0.255, + "num_input_tokens_seen": 31106216, + "step": 53625 + }, + { + "epoch": 7.987786714328269, + "grad_norm": 12.559952735900879, + "learning_rate": 3.7546684621464415e-05, + "loss": 0.0708, + "num_input_tokens_seen": 31109064, + "step": 53630 + }, + { + "epoch": 7.988531426869228, + "grad_norm": 27.954378128051758, + "learning_rate": 3.7543873946011916e-05, + "loss": 0.5733, + "num_input_tokens_seen": 31111880, + "step": 53635 + }, + { + "epoch": 7.989276139410188, + "grad_norm": 0.09587601572275162, + "learning_rate": 3.754106305864341e-05, + "loss": 0.1515, + "num_input_tokens_seen": 31114760, + "step": 53640 + }, + { + "epoch": 7.990020851951147, + "grad_norm": 13.144721984863281, + "learning_rate": 3.753825195940639e-05, + "loss": 0.1574, + "num_input_tokens_seen": 31117800, + "step": 53645 + }, + { + "epoch": 7.990765564492106, + "grad_norm": 3.405461549758911, + "learning_rate": 3.753544064834835e-05, + "loss": 0.11, + "num_input_tokens_seen": 31120616, + "step": 53650 + }, + { + "epoch": 7.991510277033065, + "grad_norm": 0.011076566763222218, + "learning_rate": 3.753262912551677e-05, + "loss": 0.0115, + "num_input_tokens_seen": 31123432, + "step": 53655 + }, + { + "epoch": 7.9922549895740245, + "grad_norm": 42.295162200927734, + "learning_rate": 3.7529817390959164e-05, + "loss": 0.4159, + "num_input_tokens_seen": 31126216, + "step": 53660 + }, + { + "epoch": 7.992999702114983, + "grad_norm": 0.5476484298706055, + "learning_rate": 3.752700544472304e-05, + "loss": 0.1308, + "num_input_tokens_seen": 31128968, + "step": 53665 + }, + { + "epoch": 7.993744414655943, + "grad_norm": 27.310781478881836, + "learning_rate": 3.752419328685588e-05, + "loss": 0.1324, + "num_input_tokens_seen": 31131880, + "step": 53670 + }, + { + "epoch": 7.994489127196902, + "grad_norm": 54.24751663208008, + "learning_rate": 3.752138091740521e-05, + "loss": 0.1445, + "num_input_tokens_seen": 31134856, + "step": 53675 + }, + { + "epoch": 7.995233839737861, + "grad_norm": 27.389551162719727, + "learning_rate": 3.7518568336418525e-05, + "loss": 0.0814, + "num_input_tokens_seen": 31137544, + "step": 53680 + }, + { + "epoch": 7.99597855227882, + "grad_norm": 0.027085689827799797, + "learning_rate": 3.751575554394336e-05, + "loss": 0.2522, + "num_input_tokens_seen": 31140296, + "step": 53685 + }, + { + "epoch": 7.99672326481978, + "grad_norm": 0.8900332450866699, + "learning_rate": 3.751294254002722e-05, + "loss": 0.0784, + "num_input_tokens_seen": 31143080, + "step": 53690 + }, + { + "epoch": 7.9974679773607384, + "grad_norm": 8.290812492370605, + "learning_rate": 3.751012932471764e-05, + "loss": 0.0613, + "num_input_tokens_seen": 31145800, + "step": 53695 + }, + { + "epoch": 7.998212689901698, + "grad_norm": 45.89237976074219, + "learning_rate": 3.7507315898062136e-05, + "loss": 0.504, + "num_input_tokens_seen": 31148456, + "step": 53700 + }, + { + "epoch": 7.998957402442657, + "grad_norm": 3.5107150077819824, + "learning_rate": 3.7504502260108245e-05, + "loss": 0.0743, + "num_input_tokens_seen": 31151336, + "step": 53705 + }, + { + "epoch": 7.9997021149836165, + "grad_norm": 0.009604169987142086, + "learning_rate": 3.750168841090349e-05, + "loss": 0.2681, + "num_input_tokens_seen": 31154216, + "step": 53710 + }, + { + "epoch": 8.0, + "eval_loss": 1.4101295471191406, + "eval_runtime": 51.2731, + "eval_samples_per_second": 58.198, + "eval_steps_per_second": 14.55, + "num_input_tokens_seen": 31154960, + "step": 53712 + }, + { + "epoch": 8.000446827524575, + "grad_norm": 94.95777893066406, + "learning_rate": 3.749887435049541e-05, + "loss": 0.2754, + "num_input_tokens_seen": 31156592, + "step": 53715 + }, + { + "epoch": 8.001191540065534, + "grad_norm": 0.09559085965156555, + "learning_rate": 3.749606007893157e-05, + "loss": 0.0121, + "num_input_tokens_seen": 31159344, + "step": 53720 + }, + { + "epoch": 8.001936252606495, + "grad_norm": 0.030723050236701965, + "learning_rate": 3.7493245596259484e-05, + "loss": 0.0569, + "num_input_tokens_seen": 31162032, + "step": 53725 + }, + { + "epoch": 8.002680965147453, + "grad_norm": 0.2777527868747711, + "learning_rate": 3.7490430902526715e-05, + "loss": 0.0026, + "num_input_tokens_seen": 31164784, + "step": 53730 + }, + { + "epoch": 8.003425677688412, + "grad_norm": 2.193249225616455, + "learning_rate": 3.7487615997780815e-05, + "loss": 0.1423, + "num_input_tokens_seen": 31167824, + "step": 53735 + }, + { + "epoch": 8.00417039022937, + "grad_norm": 0.027628587558865547, + "learning_rate": 3.7484800882069324e-05, + "loss": 0.2617, + "num_input_tokens_seen": 31170704, + "step": 53740 + }, + { + "epoch": 8.004915102770331, + "grad_norm": 8.098640441894531, + "learning_rate": 3.748198555543981e-05, + "loss": 0.0062, + "num_input_tokens_seen": 31173456, + "step": 53745 + }, + { + "epoch": 8.00565981531129, + "grad_norm": 22.434947967529297, + "learning_rate": 3.747917001793985e-05, + "loss": 0.0978, + "num_input_tokens_seen": 31176400, + "step": 53750 + }, + { + "epoch": 8.006404527852249, + "grad_norm": 0.04147302731871605, + "learning_rate": 3.7476354269616984e-05, + "loss": 0.086, + "num_input_tokens_seen": 31179536, + "step": 53755 + }, + { + "epoch": 8.007149240393208, + "grad_norm": 0.10636206716299057, + "learning_rate": 3.747353831051879e-05, + "loss": 0.1947, + "num_input_tokens_seen": 31182192, + "step": 53760 + }, + { + "epoch": 8.007893952934168, + "grad_norm": 11.836923599243164, + "learning_rate": 3.747072214069286e-05, + "loss": 0.0054, + "num_input_tokens_seen": 31185232, + "step": 53765 + }, + { + "epoch": 8.008638665475127, + "grad_norm": 0.4819736182689667, + "learning_rate": 3.746790576018674e-05, + "loss": 0.198, + "num_input_tokens_seen": 31188400, + "step": 53770 + }, + { + "epoch": 8.009383378016086, + "grad_norm": 0.017534272745251656, + "learning_rate": 3.746508916904803e-05, + "loss": 0.1423, + "num_input_tokens_seen": 31191728, + "step": 53775 + }, + { + "epoch": 8.010128090557044, + "grad_norm": 0.030598096549510956, + "learning_rate": 3.74622723673243e-05, + "loss": 0.0011, + "num_input_tokens_seen": 31194576, + "step": 53780 + }, + { + "epoch": 8.010872803098005, + "grad_norm": 29.89851188659668, + "learning_rate": 3.745945535506315e-05, + "loss": 0.2119, + "num_input_tokens_seen": 31197584, + "step": 53785 + }, + { + "epoch": 8.011617515638964, + "grad_norm": 7.415139675140381, + "learning_rate": 3.7456638132312164e-05, + "loss": 0.1983, + "num_input_tokens_seen": 31200304, + "step": 53790 + }, + { + "epoch": 8.012362228179922, + "grad_norm": 0.14340147376060486, + "learning_rate": 3.745382069911894e-05, + "loss": 0.0381, + "num_input_tokens_seen": 31203248, + "step": 53795 + }, + { + "epoch": 8.013106940720881, + "grad_norm": 0.08325626701116562, + "learning_rate": 3.745100305553107e-05, + "loss": 0.0043, + "num_input_tokens_seen": 31206320, + "step": 53800 + }, + { + "epoch": 8.013851653261842, + "grad_norm": 0.1233256608247757, + "learning_rate": 3.744818520159616e-05, + "loss": 0.0031, + "num_input_tokens_seen": 31209072, + "step": 53805 + }, + { + "epoch": 8.0145963658028, + "grad_norm": 0.014928100630640984, + "learning_rate": 3.744536713736182e-05, + "loss": 0.0067, + "num_input_tokens_seen": 31212016, + "step": 53810 + }, + { + "epoch": 8.01534107834376, + "grad_norm": 0.042454566806554794, + "learning_rate": 3.744254886287564e-05, + "loss": 0.0921, + "num_input_tokens_seen": 31214960, + "step": 53815 + }, + { + "epoch": 8.016085790884718, + "grad_norm": 0.24475102126598358, + "learning_rate": 3.743973037818524e-05, + "loss": 0.0014, + "num_input_tokens_seen": 31218160, + "step": 53820 + }, + { + "epoch": 8.016830503425677, + "grad_norm": 0.0028028457891196012, + "learning_rate": 3.7436911683338244e-05, + "loss": 0.2505, + "num_input_tokens_seen": 31221264, + "step": 53825 + }, + { + "epoch": 8.017575215966637, + "grad_norm": 0.29075708985328674, + "learning_rate": 3.743409277838227e-05, + "loss": 0.0665, + "num_input_tokens_seen": 31224272, + "step": 53830 + }, + { + "epoch": 8.018319928507596, + "grad_norm": 0.18883322179317474, + "learning_rate": 3.7431273663364926e-05, + "loss": 0.0215, + "num_input_tokens_seen": 31227440, + "step": 53835 + }, + { + "epoch": 8.019064641048555, + "grad_norm": 0.0160409864038229, + "learning_rate": 3.742845433833386e-05, + "loss": 0.0307, + "num_input_tokens_seen": 31230160, + "step": 53840 + }, + { + "epoch": 8.019809353589514, + "grad_norm": 0.009811809286475182, + "learning_rate": 3.742563480333668e-05, + "loss": 0.2579, + "num_input_tokens_seen": 31233136, + "step": 53845 + }, + { + "epoch": 8.020554066130474, + "grad_norm": 0.008026767522096634, + "learning_rate": 3.742281505842103e-05, + "loss": 0.1024, + "num_input_tokens_seen": 31235760, + "step": 53850 + }, + { + "epoch": 8.021298778671433, + "grad_norm": 0.007654301822185516, + "learning_rate": 3.7419995103634546e-05, + "loss": 0.03, + "num_input_tokens_seen": 31238416, + "step": 53855 + }, + { + "epoch": 8.022043491212392, + "grad_norm": 36.066898345947266, + "learning_rate": 3.741717493902488e-05, + "loss": 0.4422, + "num_input_tokens_seen": 31241552, + "step": 53860 + }, + { + "epoch": 8.02278820375335, + "grad_norm": 58.712608337402344, + "learning_rate": 3.741435456463965e-05, + "loss": 0.0484, + "num_input_tokens_seen": 31244208, + "step": 53865 + }, + { + "epoch": 8.023532916294311, + "grad_norm": 0.07029570639133453, + "learning_rate": 3.741153398052653e-05, + "loss": 0.0465, + "num_input_tokens_seen": 31246896, + "step": 53870 + }, + { + "epoch": 8.02427762883527, + "grad_norm": 0.00994487851858139, + "learning_rate": 3.740871318673314e-05, + "loss": 0.1208, + "num_input_tokens_seen": 31249776, + "step": 53875 + }, + { + "epoch": 8.025022341376228, + "grad_norm": 0.01807807758450508, + "learning_rate": 3.740589218330716e-05, + "loss": 0.062, + "num_input_tokens_seen": 31253072, + "step": 53880 + }, + { + "epoch": 8.025767053917187, + "grad_norm": 0.034550294280052185, + "learning_rate": 3.740307097029624e-05, + "loss": 0.1204, + "num_input_tokens_seen": 31255920, + "step": 53885 + }, + { + "epoch": 8.026511766458148, + "grad_norm": 77.36865997314453, + "learning_rate": 3.740024954774804e-05, + "loss": 0.0361, + "num_input_tokens_seen": 31258992, + "step": 53890 + }, + { + "epoch": 8.027256478999107, + "grad_norm": 0.019558798521757126, + "learning_rate": 3.739742791571023e-05, + "loss": 0.3186, + "num_input_tokens_seen": 31261680, + "step": 53895 + }, + { + "epoch": 8.028001191540065, + "grad_norm": 0.018843377009034157, + "learning_rate": 3.739460607423048e-05, + "loss": 0.0023, + "num_input_tokens_seen": 31264848, + "step": 53900 + }, + { + "epoch": 8.028745904081024, + "grad_norm": 0.033143628388643265, + "learning_rate": 3.7391784023356445e-05, + "loss": 0.1941, + "num_input_tokens_seen": 31268112, + "step": 53905 + }, + { + "epoch": 8.029490616621985, + "grad_norm": 0.3843318521976471, + "learning_rate": 3.7388961763135835e-05, + "loss": 0.0604, + "num_input_tokens_seen": 31271280, + "step": 53910 + }, + { + "epoch": 8.030235329162943, + "grad_norm": 4.74009895324707, + "learning_rate": 3.7386139293616285e-05, + "loss": 0.3667, + "num_input_tokens_seen": 31274096, + "step": 53915 + }, + { + "epoch": 8.030980041703902, + "grad_norm": 48.425804138183594, + "learning_rate": 3.738331661484551e-05, + "loss": 0.1841, + "num_input_tokens_seen": 31276880, + "step": 53920 + }, + { + "epoch": 8.03172475424486, + "grad_norm": 0.5817619562149048, + "learning_rate": 3.7380493726871186e-05, + "loss": 0.0047, + "num_input_tokens_seen": 31279792, + "step": 53925 + }, + { + "epoch": 8.032469466785821, + "grad_norm": 29.884279251098633, + "learning_rate": 3.737767062974101e-05, + "loss": 0.0752, + "num_input_tokens_seen": 31283120, + "step": 53930 + }, + { + "epoch": 8.03321417932678, + "grad_norm": 0.033782798796892166, + "learning_rate": 3.737484732350266e-05, + "loss": 0.0595, + "num_input_tokens_seen": 31286288, + "step": 53935 + }, + { + "epoch": 8.033958891867739, + "grad_norm": 0.352485716342926, + "learning_rate": 3.7372023808203836e-05, + "loss": 0.0022, + "num_input_tokens_seen": 31289040, + "step": 53940 + }, + { + "epoch": 8.034703604408698, + "grad_norm": 0.008721312507987022, + "learning_rate": 3.736920008389225e-05, + "loss": 0.153, + "num_input_tokens_seen": 31292144, + "step": 53945 + }, + { + "epoch": 8.035448316949658, + "grad_norm": 0.03547044098377228, + "learning_rate": 3.7366376150615614e-05, + "loss": 0.247, + "num_input_tokens_seen": 31294928, + "step": 53950 + }, + { + "epoch": 8.036193029490617, + "grad_norm": 0.581466019153595, + "learning_rate": 3.7363552008421606e-05, + "loss": 0.1632, + "num_input_tokens_seen": 31297712, + "step": 53955 + }, + { + "epoch": 8.036937742031576, + "grad_norm": 0.18850447237491608, + "learning_rate": 3.7360727657357954e-05, + "loss": 0.0005, + "num_input_tokens_seen": 31300528, + "step": 53960 + }, + { + "epoch": 8.037682454572534, + "grad_norm": 112.80714416503906, + "learning_rate": 3.7357903097472376e-05, + "loss": 0.213, + "num_input_tokens_seen": 31303472, + "step": 53965 + }, + { + "epoch": 8.038427167113495, + "grad_norm": 0.007669650949537754, + "learning_rate": 3.7355078328812583e-05, + "loss": 0.148, + "num_input_tokens_seen": 31306544, + "step": 53970 + }, + { + "epoch": 8.039171879654454, + "grad_norm": 1.0879204273223877, + "learning_rate": 3.7352253351426295e-05, + "loss": 0.1562, + "num_input_tokens_seen": 31309392, + "step": 53975 + }, + { + "epoch": 8.039916592195413, + "grad_norm": 68.84600830078125, + "learning_rate": 3.734942816536124e-05, + "loss": 0.5832, + "num_input_tokens_seen": 31311920, + "step": 53980 + }, + { + "epoch": 8.040661304736371, + "grad_norm": 0.9975001215934753, + "learning_rate": 3.734660277066515e-05, + "loss": 0.001, + "num_input_tokens_seen": 31314640, + "step": 53985 + }, + { + "epoch": 8.041406017277332, + "grad_norm": 0.2581571042537689, + "learning_rate": 3.734377716738576e-05, + "loss": 0.1158, + "num_input_tokens_seen": 31317424, + "step": 53990 + }, + { + "epoch": 8.04215072981829, + "grad_norm": 0.02899918146431446, + "learning_rate": 3.73409513555708e-05, + "loss": 0.1445, + "num_input_tokens_seen": 31320752, + "step": 53995 + }, + { + "epoch": 8.04289544235925, + "grad_norm": 0.40376195311546326, + "learning_rate": 3.733812533526801e-05, + "loss": 0.2167, + "num_input_tokens_seen": 31323728, + "step": 54000 + }, + { + "epoch": 8.043640154900208, + "grad_norm": 0.006201486103236675, + "learning_rate": 3.733529910652513e-05, + "loss": 0.1093, + "num_input_tokens_seen": 31326992, + "step": 54005 + }, + { + "epoch": 8.044384867441167, + "grad_norm": 0.011713535524904728, + "learning_rate": 3.73324726693899e-05, + "loss": 0.1655, + "num_input_tokens_seen": 31329872, + "step": 54010 + }, + { + "epoch": 8.045129579982127, + "grad_norm": 7.657289505004883, + "learning_rate": 3.732964602391009e-05, + "loss": 0.1403, + "num_input_tokens_seen": 31332688, + "step": 54015 + }, + { + "epoch": 8.045874292523086, + "grad_norm": 0.022442247718572617, + "learning_rate": 3.7326819170133434e-05, + "loss": 0.1176, + "num_input_tokens_seen": 31335824, + "step": 54020 + }, + { + "epoch": 8.046619005064045, + "grad_norm": 0.006417031399905682, + "learning_rate": 3.7323992108107705e-05, + "loss": 0.2017, + "num_input_tokens_seen": 31338704, + "step": 54025 + }, + { + "epoch": 8.047363717605004, + "grad_norm": 0.00942394882440567, + "learning_rate": 3.7321164837880654e-05, + "loss": 0.3658, + "num_input_tokens_seen": 31341296, + "step": 54030 + }, + { + "epoch": 8.048108430145964, + "grad_norm": 0.01977905072271824, + "learning_rate": 3.731833735950004e-05, + "loss": 0.0091, + "num_input_tokens_seen": 31343888, + "step": 54035 + }, + { + "epoch": 8.048853142686923, + "grad_norm": 46.20448684692383, + "learning_rate": 3.731550967301364e-05, + "loss": 0.0901, + "num_input_tokens_seen": 31346928, + "step": 54040 + }, + { + "epoch": 8.049597855227882, + "grad_norm": 0.026437565684318542, + "learning_rate": 3.7312681778469216e-05, + "loss": 0.0129, + "num_input_tokens_seen": 31350064, + "step": 54045 + }, + { + "epoch": 8.05034256776884, + "grad_norm": 0.013218236155807972, + "learning_rate": 3.730985367591455e-05, + "loss": 0.0676, + "num_input_tokens_seen": 31353008, + "step": 54050 + }, + { + "epoch": 8.051087280309801, + "grad_norm": 0.03408948704600334, + "learning_rate": 3.730702536539741e-05, + "loss": 0.2248, + "num_input_tokens_seen": 31355696, + "step": 54055 + }, + { + "epoch": 8.05183199285076, + "grad_norm": 0.016426322981715202, + "learning_rate": 3.73041968469656e-05, + "loss": 0.2136, + "num_input_tokens_seen": 31358576, + "step": 54060 + }, + { + "epoch": 8.052576705391719, + "grad_norm": 0.011569934897124767, + "learning_rate": 3.730136812066688e-05, + "loss": 0.0014, + "num_input_tokens_seen": 31361776, + "step": 54065 + }, + { + "epoch": 8.053321417932677, + "grad_norm": 0.04235146567225456, + "learning_rate": 3.7298539186549054e-05, + "loss": 0.001, + "num_input_tokens_seen": 31364688, + "step": 54070 + }, + { + "epoch": 8.054066130473638, + "grad_norm": 0.050323907285928726, + "learning_rate": 3.7295710044659904e-05, + "loss": 0.0434, + "num_input_tokens_seen": 31367792, + "step": 54075 + }, + { + "epoch": 8.054810843014597, + "grad_norm": 0.008520129136741161, + "learning_rate": 3.7292880695047225e-05, + "loss": 0.0003, + "num_input_tokens_seen": 31370352, + "step": 54080 + }, + { + "epoch": 8.055555555555555, + "grad_norm": 0.020326711237430573, + "learning_rate": 3.729005113775883e-05, + "loss": 0.0206, + "num_input_tokens_seen": 31373104, + "step": 54085 + }, + { + "epoch": 8.056300268096514, + "grad_norm": 16.47829818725586, + "learning_rate": 3.7287221372842506e-05, + "loss": 0.2193, + "num_input_tokens_seen": 31375760, + "step": 54090 + }, + { + "epoch": 8.057044980637475, + "grad_norm": 18.095495223999023, + "learning_rate": 3.728439140034607e-05, + "loss": 0.0636, + "num_input_tokens_seen": 31379024, + "step": 54095 + }, + { + "epoch": 8.057789693178433, + "grad_norm": 0.050385043025016785, + "learning_rate": 3.728156122031732e-05, + "loss": 0.0003, + "num_input_tokens_seen": 31381808, + "step": 54100 + }, + { + "epoch": 8.058534405719392, + "grad_norm": 0.052486274391412735, + "learning_rate": 3.7278730832804076e-05, + "loss": 0.2204, + "num_input_tokens_seen": 31384656, + "step": 54105 + }, + { + "epoch": 8.059279118260351, + "grad_norm": 0.0027254519518464804, + "learning_rate": 3.727590023785416e-05, + "loss": 0.0043, + "num_input_tokens_seen": 31387568, + "step": 54110 + }, + { + "epoch": 8.060023830801311, + "grad_norm": 0.01870599575340748, + "learning_rate": 3.727306943551538e-05, + "loss": 0.3475, + "num_input_tokens_seen": 31390416, + "step": 54115 + }, + { + "epoch": 8.06076854334227, + "grad_norm": 0.015522046945989132, + "learning_rate": 3.727023842583557e-05, + "loss": 0.09, + "num_input_tokens_seen": 31393360, + "step": 54120 + }, + { + "epoch": 8.061513255883229, + "grad_norm": 0.01292339526116848, + "learning_rate": 3.726740720886255e-05, + "loss": 0.0114, + "num_input_tokens_seen": 31395984, + "step": 54125 + }, + { + "epoch": 8.062257968424188, + "grad_norm": 0.019211387261748314, + "learning_rate": 3.726457578464416e-05, + "loss": 0.1691, + "num_input_tokens_seen": 31398704, + "step": 54130 + }, + { + "epoch": 8.063002680965148, + "grad_norm": 7.869915008544922, + "learning_rate": 3.726174415322822e-05, + "loss": 0.0183, + "num_input_tokens_seen": 31401552, + "step": 54135 + }, + { + "epoch": 8.063747393506107, + "grad_norm": 0.6283866167068481, + "learning_rate": 3.725891231466258e-05, + "loss": 0.0007, + "num_input_tokens_seen": 31404624, + "step": 54140 + }, + { + "epoch": 8.064492106047066, + "grad_norm": 0.03567155823111534, + "learning_rate": 3.725608026899507e-05, + "loss": 0.0086, + "num_input_tokens_seen": 31407760, + "step": 54145 + }, + { + "epoch": 8.065236818588025, + "grad_norm": 0.08090052753686905, + "learning_rate": 3.7253248016273545e-05, + "loss": 0.0444, + "num_input_tokens_seen": 31410736, + "step": 54150 + }, + { + "epoch": 8.065981531128985, + "grad_norm": 0.04901798069477081, + "learning_rate": 3.725041555654585e-05, + "loss": 0.0005, + "num_input_tokens_seen": 31413808, + "step": 54155 + }, + { + "epoch": 8.066726243669944, + "grad_norm": 6.024497032165527, + "learning_rate": 3.7247582889859824e-05, + "loss": 0.0061, + "num_input_tokens_seen": 31416912, + "step": 54160 + }, + { + "epoch": 8.067470956210903, + "grad_norm": 0.02578740008175373, + "learning_rate": 3.724475001626335e-05, + "loss": 0.1453, + "num_input_tokens_seen": 31419760, + "step": 54165 + }, + { + "epoch": 8.068215668751861, + "grad_norm": 168.97991943359375, + "learning_rate": 3.7241916935804254e-05, + "loss": 0.1971, + "num_input_tokens_seen": 31422544, + "step": 54170 + }, + { + "epoch": 8.06896038129282, + "grad_norm": 39.1596794128418, + "learning_rate": 3.723908364853042e-05, + "loss": 0.228, + "num_input_tokens_seen": 31425360, + "step": 54175 + }, + { + "epoch": 8.06970509383378, + "grad_norm": 0.02929009683430195, + "learning_rate": 3.723625015448971e-05, + "loss": 0.2442, + "num_input_tokens_seen": 31428112, + "step": 54180 + }, + { + "epoch": 8.07044980637474, + "grad_norm": 1.2517805099487305, + "learning_rate": 3.723341645372998e-05, + "loss": 0.0466, + "num_input_tokens_seen": 31431088, + "step": 54185 + }, + { + "epoch": 8.071194518915698, + "grad_norm": 0.03370882198214531, + "learning_rate": 3.723058254629912e-05, + "loss": 0.2691, + "num_input_tokens_seen": 31434160, + "step": 54190 + }, + { + "epoch": 8.071939231456657, + "grad_norm": 7.844191074371338, + "learning_rate": 3.7227748432245e-05, + "loss": 0.0339, + "num_input_tokens_seen": 31436784, + "step": 54195 + }, + { + "epoch": 8.072683943997617, + "grad_norm": 17.497961044311523, + "learning_rate": 3.722491411161549e-05, + "loss": 0.1545, + "num_input_tokens_seen": 31439856, + "step": 54200 + }, + { + "epoch": 8.073428656538576, + "grad_norm": 60.82069778442383, + "learning_rate": 3.722207958445849e-05, + "loss": 0.0622, + "num_input_tokens_seen": 31442448, + "step": 54205 + }, + { + "epoch": 8.074173369079535, + "grad_norm": 0.6027558445930481, + "learning_rate": 3.721924485082187e-05, + "loss": 0.0018, + "num_input_tokens_seen": 31445456, + "step": 54210 + }, + { + "epoch": 8.074918081620494, + "grad_norm": 10.867706298828125, + "learning_rate": 3.721640991075354e-05, + "loss": 0.7524, + "num_input_tokens_seen": 31448432, + "step": 54215 + }, + { + "epoch": 8.075662794161454, + "grad_norm": 3.0967724323272705, + "learning_rate": 3.7213574764301363e-05, + "loss": 0.0045, + "num_input_tokens_seen": 31451248, + "step": 54220 + }, + { + "epoch": 8.076407506702413, + "grad_norm": 0.0072206552140414715, + "learning_rate": 3.721073941151327e-05, + "loss": 0.0005, + "num_input_tokens_seen": 31454160, + "step": 54225 + }, + { + "epoch": 8.077152219243372, + "grad_norm": 0.03549949824810028, + "learning_rate": 3.7207903852437134e-05, + "loss": 0.2883, + "num_input_tokens_seen": 31456784, + "step": 54230 + }, + { + "epoch": 8.07789693178433, + "grad_norm": 0.009853445924818516, + "learning_rate": 3.7205068087120876e-05, + "loss": 0.1993, + "num_input_tokens_seen": 31459888, + "step": 54235 + }, + { + "epoch": 8.078641644325291, + "grad_norm": 0.020210541784763336, + "learning_rate": 3.7202232115612396e-05, + "loss": 0.0919, + "num_input_tokens_seen": 31462704, + "step": 54240 + }, + { + "epoch": 8.07938635686625, + "grad_norm": 0.09013552963733673, + "learning_rate": 3.7199395937959604e-05, + "loss": 0.2461, + "num_input_tokens_seen": 31465520, + "step": 54245 + }, + { + "epoch": 8.080131069407209, + "grad_norm": 0.01010578777641058, + "learning_rate": 3.7196559554210415e-05, + "loss": 0.0023, + "num_input_tokens_seen": 31468496, + "step": 54250 + }, + { + "epoch": 8.080875781948167, + "grad_norm": 0.061201997101306915, + "learning_rate": 3.719372296441275e-05, + "loss": 0.0088, + "num_input_tokens_seen": 31471248, + "step": 54255 + }, + { + "epoch": 8.081620494489128, + "grad_norm": 30.723264694213867, + "learning_rate": 3.719088616861453e-05, + "loss": 0.0512, + "num_input_tokens_seen": 31474128, + "step": 54260 + }, + { + "epoch": 8.082365207030087, + "grad_norm": 0.05138611048460007, + "learning_rate": 3.718804916686368e-05, + "loss": 0.001, + "num_input_tokens_seen": 31477040, + "step": 54265 + }, + { + "epoch": 8.083109919571045, + "grad_norm": 0.032468609511852264, + "learning_rate": 3.7185211959208124e-05, + "loss": 0.1452, + "num_input_tokens_seen": 31479920, + "step": 54270 + }, + { + "epoch": 8.083854632112004, + "grad_norm": 12.898059844970703, + "learning_rate": 3.71823745456958e-05, + "loss": 0.3212, + "num_input_tokens_seen": 31482896, + "step": 54275 + }, + { + "epoch": 8.084599344652965, + "grad_norm": 0.5669986605644226, + "learning_rate": 3.7179536926374636e-05, + "loss": 0.0036, + "num_input_tokens_seen": 31485680, + "step": 54280 + }, + { + "epoch": 8.085344057193923, + "grad_norm": 0.016409533098340034, + "learning_rate": 3.7176699101292574e-05, + "loss": 0.0005, + "num_input_tokens_seen": 31488336, + "step": 54285 + }, + { + "epoch": 8.086088769734882, + "grad_norm": 27.52035903930664, + "learning_rate": 3.7173861070497556e-05, + "loss": 0.1665, + "num_input_tokens_seen": 31491184, + "step": 54290 + }, + { + "epoch": 8.086833482275841, + "grad_norm": 0.24960541725158691, + "learning_rate": 3.717102283403753e-05, + "loss": 0.0014, + "num_input_tokens_seen": 31494128, + "step": 54295 + }, + { + "epoch": 8.087578194816802, + "grad_norm": 13.35759162902832, + "learning_rate": 3.716818439196045e-05, + "loss": 0.0279, + "num_input_tokens_seen": 31497296, + "step": 54300 + }, + { + "epoch": 8.08832290735776, + "grad_norm": 0.045131292194128036, + "learning_rate": 3.716534574431425e-05, + "loss": 0.0436, + "num_input_tokens_seen": 31500432, + "step": 54305 + }, + { + "epoch": 8.089067619898719, + "grad_norm": 0.18113909661769867, + "learning_rate": 3.7162506891146896e-05, + "loss": 0.0039, + "num_input_tokens_seen": 31503312, + "step": 54310 + }, + { + "epoch": 8.089812332439678, + "grad_norm": 7.543679714202881, + "learning_rate": 3.7159667832506365e-05, + "loss": 0.1647, + "num_input_tokens_seen": 31506320, + "step": 54315 + }, + { + "epoch": 8.090557044980638, + "grad_norm": 0.6957477927207947, + "learning_rate": 3.715682856844059e-05, + "loss": 0.0981, + "num_input_tokens_seen": 31509328, + "step": 54320 + }, + { + "epoch": 8.091301757521597, + "grad_norm": 19.21686363220215, + "learning_rate": 3.715398909899756e-05, + "loss": 0.0737, + "num_input_tokens_seen": 31512336, + "step": 54325 + }, + { + "epoch": 8.092046470062556, + "grad_norm": 54.598934173583984, + "learning_rate": 3.715114942422524e-05, + "loss": 0.0493, + "num_input_tokens_seen": 31515312, + "step": 54330 + }, + { + "epoch": 8.092791182603515, + "grad_norm": 13.966756820678711, + "learning_rate": 3.71483095441716e-05, + "loss": 0.3784, + "num_input_tokens_seen": 31517968, + "step": 54335 + }, + { + "epoch": 8.093535895144473, + "grad_norm": 9.463720321655273, + "learning_rate": 3.7145469458884606e-05, + "loss": 0.3043, + "num_input_tokens_seen": 31520848, + "step": 54340 + }, + { + "epoch": 8.094280607685434, + "grad_norm": 0.09180803596973419, + "learning_rate": 3.714262916841226e-05, + "loss": 0.0019, + "num_input_tokens_seen": 31523888, + "step": 54345 + }, + { + "epoch": 8.095025320226393, + "grad_norm": 16.752872467041016, + "learning_rate": 3.7139788672802526e-05, + "loss": 0.1552, + "num_input_tokens_seen": 31526832, + "step": 54350 + }, + { + "epoch": 8.095770032767351, + "grad_norm": 0.043088365346193314, + "learning_rate": 3.713694797210341e-05, + "loss": 0.1194, + "num_input_tokens_seen": 31529808, + "step": 54355 + }, + { + "epoch": 8.09651474530831, + "grad_norm": 47.848876953125, + "learning_rate": 3.713410706636289e-05, + "loss": 0.1161, + "num_input_tokens_seen": 31532720, + "step": 54360 + }, + { + "epoch": 8.09725945784927, + "grad_norm": 0.13006217777729034, + "learning_rate": 3.713126595562896e-05, + "loss": 0.1942, + "num_input_tokens_seen": 31535920, + "step": 54365 + }, + { + "epoch": 8.09800417039023, + "grad_norm": 0.016551369801163673, + "learning_rate": 3.712842463994963e-05, + "loss": 0.0954, + "num_input_tokens_seen": 31538800, + "step": 54370 + }, + { + "epoch": 8.098748882931188, + "grad_norm": 94.74308013916016, + "learning_rate": 3.7125583119372884e-05, + "loss": 0.2611, + "num_input_tokens_seen": 31541456, + "step": 54375 + }, + { + "epoch": 8.099493595472147, + "grad_norm": 0.3524024486541748, + "learning_rate": 3.712274139394674e-05, + "loss": 0.0011, + "num_input_tokens_seen": 31544464, + "step": 54380 + }, + { + "epoch": 8.100238308013108, + "grad_norm": 7.037570476531982, + "learning_rate": 3.71198994637192e-05, + "loss": 0.2132, + "num_input_tokens_seen": 31547440, + "step": 54385 + }, + { + "epoch": 8.100983020554066, + "grad_norm": 0.04494548961520195, + "learning_rate": 3.711705732873828e-05, + "loss": 0.1359, + "num_input_tokens_seen": 31550512, + "step": 54390 + }, + { + "epoch": 8.101727733095025, + "grad_norm": 0.03893665224313736, + "learning_rate": 3.711421498905198e-05, + "loss": 0.1594, + "num_input_tokens_seen": 31553488, + "step": 54395 + }, + { + "epoch": 8.102472445635984, + "grad_norm": 0.058284442871809006, + "learning_rate": 3.7111372444708345e-05, + "loss": 0.2348, + "num_input_tokens_seen": 31556368, + "step": 54400 + }, + { + "epoch": 8.103217158176944, + "grad_norm": 0.5923266410827637, + "learning_rate": 3.7108529695755375e-05, + "loss": 0.0016, + "num_input_tokens_seen": 31558896, + "step": 54405 + }, + { + "epoch": 8.103961870717903, + "grad_norm": 0.046444982290267944, + "learning_rate": 3.7105686742241095e-05, + "loss": 0.0925, + "num_input_tokens_seen": 31561712, + "step": 54410 + }, + { + "epoch": 8.104706583258862, + "grad_norm": 9.758895874023438, + "learning_rate": 3.7102843584213556e-05, + "loss": 0.0033, + "num_input_tokens_seen": 31564816, + "step": 54415 + }, + { + "epoch": 8.10545129579982, + "grad_norm": 0.503161609172821, + "learning_rate": 3.710000022172076e-05, + "loss": 0.0016, + "num_input_tokens_seen": 31567504, + "step": 54420 + }, + { + "epoch": 8.106196008340781, + "grad_norm": 0.011384811252355576, + "learning_rate": 3.7097156654810774e-05, + "loss": 0.005, + "num_input_tokens_seen": 31570544, + "step": 54425 + }, + { + "epoch": 8.10694072088174, + "grad_norm": 15.494332313537598, + "learning_rate": 3.709431288353161e-05, + "loss": 0.0796, + "num_input_tokens_seen": 31573040, + "step": 54430 + }, + { + "epoch": 8.107685433422699, + "grad_norm": 0.011988768354058266, + "learning_rate": 3.7091468907931324e-05, + "loss": 0.0911, + "num_input_tokens_seen": 31576112, + "step": 54435 + }, + { + "epoch": 8.108430145963657, + "grad_norm": 103.8777847290039, + "learning_rate": 3.708862472805796e-05, + "loss": 0.1041, + "num_input_tokens_seen": 31579216, + "step": 54440 + }, + { + "epoch": 8.109174858504618, + "grad_norm": 0.14298707246780396, + "learning_rate": 3.708578034395957e-05, + "loss": 0.0045, + "num_input_tokens_seen": 31582224, + "step": 54445 + }, + { + "epoch": 8.109919571045577, + "grad_norm": 66.55805206298828, + "learning_rate": 3.70829357556842e-05, + "loss": 0.2853, + "num_input_tokens_seen": 31585040, + "step": 54450 + }, + { + "epoch": 8.110664283586535, + "grad_norm": 0.006812546867877245, + "learning_rate": 3.7080090963279915e-05, + "loss": 0.114, + "num_input_tokens_seen": 31587920, + "step": 54455 + }, + { + "epoch": 8.111408996127494, + "grad_norm": 81.8471908569336, + "learning_rate": 3.7077245966794774e-05, + "loss": 0.1536, + "num_input_tokens_seen": 31590608, + "step": 54460 + }, + { + "epoch": 8.112153708668455, + "grad_norm": 22.47039794921875, + "learning_rate": 3.707440076627683e-05, + "loss": 0.1925, + "num_input_tokens_seen": 31593520, + "step": 54465 + }, + { + "epoch": 8.112898421209414, + "grad_norm": 21.328277587890625, + "learning_rate": 3.7071555361774165e-05, + "loss": 0.0948, + "num_input_tokens_seen": 31596688, + "step": 54470 + }, + { + "epoch": 8.113643133750372, + "grad_norm": 0.010442824102938175, + "learning_rate": 3.706870975333484e-05, + "loss": 0.0024, + "num_input_tokens_seen": 31599248, + "step": 54475 + }, + { + "epoch": 8.114387846291331, + "grad_norm": 22.669029235839844, + "learning_rate": 3.706586394100692e-05, + "loss": 0.0989, + "num_input_tokens_seen": 31602064, + "step": 54480 + }, + { + "epoch": 8.115132558832292, + "grad_norm": 0.015322607010602951, + "learning_rate": 3.70630179248385e-05, + "loss": 0.0924, + "num_input_tokens_seen": 31605296, + "step": 54485 + }, + { + "epoch": 8.11587727137325, + "grad_norm": 51.74611282348633, + "learning_rate": 3.706017170487765e-05, + "loss": 0.1513, + "num_input_tokens_seen": 31608432, + "step": 54490 + }, + { + "epoch": 8.116621983914209, + "grad_norm": 0.011960494332015514, + "learning_rate": 3.705732528117246e-05, + "loss": 0.1264, + "num_input_tokens_seen": 31611280, + "step": 54495 + }, + { + "epoch": 8.117366696455168, + "grad_norm": 0.016087768599390984, + "learning_rate": 3.7054478653771005e-05, + "loss": 0.0005, + "num_input_tokens_seen": 31614192, + "step": 54500 + }, + { + "epoch": 8.118111408996128, + "grad_norm": 22.504348754882812, + "learning_rate": 3.7051631822721395e-05, + "loss": 0.1543, + "num_input_tokens_seen": 31617200, + "step": 54505 + }, + { + "epoch": 8.118856121537087, + "grad_norm": 146.6582489013672, + "learning_rate": 3.704878478807171e-05, + "loss": 0.2695, + "num_input_tokens_seen": 31619984, + "step": 54510 + }, + { + "epoch": 8.119600834078046, + "grad_norm": 2.471005439758301, + "learning_rate": 3.704593754987005e-05, + "loss": 0.2002, + "num_input_tokens_seen": 31622768, + "step": 54515 + }, + { + "epoch": 8.120345546619005, + "grad_norm": 9.21908187866211, + "learning_rate": 3.704309010816452e-05, + "loss": 0.2073, + "num_input_tokens_seen": 31625488, + "step": 54520 + }, + { + "epoch": 8.121090259159963, + "grad_norm": 0.11841242760419846, + "learning_rate": 3.7040242463003225e-05, + "loss": 0.0343, + "num_input_tokens_seen": 31628688, + "step": 54525 + }, + { + "epoch": 8.121834971700924, + "grad_norm": 0.2760416269302368, + "learning_rate": 3.703739461443427e-05, + "loss": 0.0945, + "num_input_tokens_seen": 31631664, + "step": 54530 + }, + { + "epoch": 8.122579684241883, + "grad_norm": 0.020867206156253815, + "learning_rate": 3.703454656250576e-05, + "loss": 0.0229, + "num_input_tokens_seen": 31634448, + "step": 54535 + }, + { + "epoch": 8.123324396782841, + "grad_norm": 0.003367082914337516, + "learning_rate": 3.7031698307265824e-05, + "loss": 0.0032, + "num_input_tokens_seen": 31637200, + "step": 54540 + }, + { + "epoch": 8.1240691093238, + "grad_norm": 0.039866603910923004, + "learning_rate": 3.702884984876257e-05, + "loss": 0.169, + "num_input_tokens_seen": 31640016, + "step": 54545 + }, + { + "epoch": 8.12481382186476, + "grad_norm": 0.09154272079467773, + "learning_rate": 3.702600118704412e-05, + "loss": 0.0515, + "num_input_tokens_seen": 31642992, + "step": 54550 + }, + { + "epoch": 8.12555853440572, + "grad_norm": 69.23269653320312, + "learning_rate": 3.702315232215862e-05, + "loss": 0.3264, + "num_input_tokens_seen": 31645840, + "step": 54555 + }, + { + "epoch": 8.126303246946678, + "grad_norm": 0.11670873314142227, + "learning_rate": 3.7020303254154164e-05, + "loss": 0.007, + "num_input_tokens_seen": 31648752, + "step": 54560 + }, + { + "epoch": 8.127047959487637, + "grad_norm": 0.02043302170932293, + "learning_rate": 3.701745398307891e-05, + "loss": 0.0037, + "num_input_tokens_seen": 31651600, + "step": 54565 + }, + { + "epoch": 8.127792672028598, + "grad_norm": 0.009112721309065819, + "learning_rate": 3.701460450898098e-05, + "loss": 0.0836, + "num_input_tokens_seen": 31654448, + "step": 54570 + }, + { + "epoch": 8.128537384569556, + "grad_norm": 0.06000986695289612, + "learning_rate": 3.701175483190852e-05, + "loss": 0.011, + "num_input_tokens_seen": 31657296, + "step": 54575 + }, + { + "epoch": 8.129282097110515, + "grad_norm": 0.0015306586865335703, + "learning_rate": 3.700890495190967e-05, + "loss": 0.0003, + "num_input_tokens_seen": 31660144, + "step": 54580 + }, + { + "epoch": 8.130026809651474, + "grad_norm": 0.00328715774230659, + "learning_rate": 3.7006054869032574e-05, + "loss": 0.121, + "num_input_tokens_seen": 31663216, + "step": 54585 + }, + { + "epoch": 8.130771522192434, + "grad_norm": 0.004397719632834196, + "learning_rate": 3.700320458332539e-05, + "loss": 0.0003, + "num_input_tokens_seen": 31666256, + "step": 54590 + }, + { + "epoch": 8.131516234733393, + "grad_norm": 151.5927734375, + "learning_rate": 3.700035409483626e-05, + "loss": 0.2074, + "num_input_tokens_seen": 31669008, + "step": 54595 + }, + { + "epoch": 8.132260947274352, + "grad_norm": 29.756914138793945, + "learning_rate": 3.699750340361334e-05, + "loss": 0.4831, + "num_input_tokens_seen": 31672144, + "step": 54600 + }, + { + "epoch": 8.13300565981531, + "grad_norm": 0.002734603127464652, + "learning_rate": 3.69946525097048e-05, + "loss": 0.0004, + "num_input_tokens_seen": 31674864, + "step": 54605 + }, + { + "epoch": 8.133750372356271, + "grad_norm": 0.009110786952078342, + "learning_rate": 3.6991801413158795e-05, + "loss": 0.0404, + "num_input_tokens_seen": 31677744, + "step": 54610 + }, + { + "epoch": 8.13449508489723, + "grad_norm": 5.913848876953125, + "learning_rate": 3.6988950114023494e-05, + "loss": 0.0009, + "num_input_tokens_seen": 31680560, + "step": 54615 + }, + { + "epoch": 8.135239797438189, + "grad_norm": 0.016714734956622124, + "learning_rate": 3.6986098612347056e-05, + "loss": 0.0639, + "num_input_tokens_seen": 31683728, + "step": 54620 + }, + { + "epoch": 8.135984509979147, + "grad_norm": 0.04249628633260727, + "learning_rate": 3.6983246908177675e-05, + "loss": 0.0912, + "num_input_tokens_seen": 31686512, + "step": 54625 + }, + { + "epoch": 8.136729222520108, + "grad_norm": 0.004923922009766102, + "learning_rate": 3.698039500156352e-05, + "loss": 0.1882, + "num_input_tokens_seen": 31689264, + "step": 54630 + }, + { + "epoch": 8.137473935061067, + "grad_norm": 0.0018571199616417289, + "learning_rate": 3.697754289255277e-05, + "loss": 0.1819, + "num_input_tokens_seen": 31692208, + "step": 54635 + }, + { + "epoch": 8.138218647602026, + "grad_norm": 0.40826213359832764, + "learning_rate": 3.697469058119359e-05, + "loss": 0.0767, + "num_input_tokens_seen": 31695024, + "step": 54640 + }, + { + "epoch": 8.138963360142984, + "grad_norm": 0.004508041311055422, + "learning_rate": 3.697183806753419e-05, + "loss": 0.1514, + "num_input_tokens_seen": 31698192, + "step": 54645 + }, + { + "epoch": 8.139708072683945, + "grad_norm": 0.04492957517504692, + "learning_rate": 3.696898535162275e-05, + "loss": 0.1582, + "num_input_tokens_seen": 31700944, + "step": 54650 + }, + { + "epoch": 8.140452785224904, + "grad_norm": 62.61597442626953, + "learning_rate": 3.696613243350747e-05, + "loss": 0.0832, + "num_input_tokens_seen": 31704016, + "step": 54655 + }, + { + "epoch": 8.141197497765862, + "grad_norm": 49.02940368652344, + "learning_rate": 3.696327931323655e-05, + "loss": 0.7739, + "num_input_tokens_seen": 31707024, + "step": 54660 + }, + { + "epoch": 8.141942210306821, + "grad_norm": 0.049931369721889496, + "learning_rate": 3.696042599085818e-05, + "loss": 0.005, + "num_input_tokens_seen": 31709840, + "step": 54665 + }, + { + "epoch": 8.142686922847782, + "grad_norm": 136.3589630126953, + "learning_rate": 3.695757246642057e-05, + "loss": 0.1941, + "num_input_tokens_seen": 31712816, + "step": 54670 + }, + { + "epoch": 8.14343163538874, + "grad_norm": 39.14421463012695, + "learning_rate": 3.695471873997193e-05, + "loss": 0.0457, + "num_input_tokens_seen": 31715440, + "step": 54675 + }, + { + "epoch": 8.1441763479297, + "grad_norm": 0.002028627786785364, + "learning_rate": 3.6951864811560464e-05, + "loss": 0.2118, + "num_input_tokens_seen": 31718352, + "step": 54680 + }, + { + "epoch": 8.144921060470658, + "grad_norm": 27.13225555419922, + "learning_rate": 3.694901068123439e-05, + "loss": 0.4387, + "num_input_tokens_seen": 31721424, + "step": 54685 + }, + { + "epoch": 8.145665773011617, + "grad_norm": 0.6188777685165405, + "learning_rate": 3.694615634904192e-05, + "loss": 0.0992, + "num_input_tokens_seen": 31724496, + "step": 54690 + }, + { + "epoch": 8.146410485552577, + "grad_norm": 0.08326289057731628, + "learning_rate": 3.694330181503128e-05, + "loss": 0.0013, + "num_input_tokens_seen": 31727344, + "step": 54695 + }, + { + "epoch": 8.147155198093536, + "grad_norm": 7.381552696228027, + "learning_rate": 3.69404470792507e-05, + "loss": 0.1482, + "num_input_tokens_seen": 31730320, + "step": 54700 + }, + { + "epoch": 8.147899910634495, + "grad_norm": 0.0027796137146651745, + "learning_rate": 3.69375921417484e-05, + "loss": 0.0004, + "num_input_tokens_seen": 31733360, + "step": 54705 + }, + { + "epoch": 8.148644623175453, + "grad_norm": 0.026042090728878975, + "learning_rate": 3.6934737002572614e-05, + "loss": 0.0076, + "num_input_tokens_seen": 31736112, + "step": 54710 + }, + { + "epoch": 8.149389335716414, + "grad_norm": 33.677398681640625, + "learning_rate": 3.693188166177158e-05, + "loss": 0.2245, + "num_input_tokens_seen": 31738992, + "step": 54715 + }, + { + "epoch": 8.150134048257373, + "grad_norm": 0.07106567174196243, + "learning_rate": 3.6929026119393525e-05, + "loss": 0.2355, + "num_input_tokens_seen": 31742000, + "step": 54720 + }, + { + "epoch": 8.150878760798332, + "grad_norm": 0.028136853128671646, + "learning_rate": 3.69261703754867e-05, + "loss": 0.1537, + "num_input_tokens_seen": 31744912, + "step": 54725 + }, + { + "epoch": 8.15162347333929, + "grad_norm": 17.64044952392578, + "learning_rate": 3.6923314430099354e-05, + "loss": 0.0088, + "num_input_tokens_seen": 31747856, + "step": 54730 + }, + { + "epoch": 8.15236818588025, + "grad_norm": 66.43356323242188, + "learning_rate": 3.6920458283279725e-05, + "loss": 0.1405, + "num_input_tokens_seen": 31750928, + "step": 54735 + }, + { + "epoch": 8.15311289842121, + "grad_norm": 2.033353805541992, + "learning_rate": 3.691760193507607e-05, + "loss": 0.0083, + "num_input_tokens_seen": 31754032, + "step": 54740 + }, + { + "epoch": 8.153857610962168, + "grad_norm": 72.94454956054688, + "learning_rate": 3.691474538553664e-05, + "loss": 0.1641, + "num_input_tokens_seen": 31756816, + "step": 54745 + }, + { + "epoch": 8.154602323503127, + "grad_norm": 2.500020742416382, + "learning_rate": 3.691188863470969e-05, + "loss": 0.2017, + "num_input_tokens_seen": 31759792, + "step": 54750 + }, + { + "epoch": 8.155347036044088, + "grad_norm": 14.299756050109863, + "learning_rate": 3.69090316826435e-05, + "loss": 0.1323, + "num_input_tokens_seen": 31762480, + "step": 54755 + }, + { + "epoch": 8.156091748585046, + "grad_norm": 4.871379852294922, + "learning_rate": 3.690617452938632e-05, + "loss": 0.011, + "num_input_tokens_seen": 31765264, + "step": 54760 + }, + { + "epoch": 8.156836461126005, + "grad_norm": 0.006764878053218126, + "learning_rate": 3.6903317174986425e-05, + "loss": 0.1227, + "num_input_tokens_seen": 31768176, + "step": 54765 + }, + { + "epoch": 8.157581173666964, + "grad_norm": 31.904449462890625, + "learning_rate": 3.690045961949208e-05, + "loss": 0.1407, + "num_input_tokens_seen": 31770960, + "step": 54770 + }, + { + "epoch": 8.158325886207924, + "grad_norm": 0.04384719952940941, + "learning_rate": 3.689760186295156e-05, + "loss": 0.131, + "num_input_tokens_seen": 31773584, + "step": 54775 + }, + { + "epoch": 8.159070598748883, + "grad_norm": 0.00521966814994812, + "learning_rate": 3.689474390541316e-05, + "loss": 0.0005, + "num_input_tokens_seen": 31776432, + "step": 54780 + }, + { + "epoch": 8.159815311289842, + "grad_norm": 0.007491006515920162, + "learning_rate": 3.6891885746925136e-05, + "loss": 0.0196, + "num_input_tokens_seen": 31779408, + "step": 54785 + }, + { + "epoch": 8.1605600238308, + "grad_norm": 71.52557373046875, + "learning_rate": 3.68890273875358e-05, + "loss": 0.3138, + "num_input_tokens_seen": 31782640, + "step": 54790 + }, + { + "epoch": 8.161304736371761, + "grad_norm": 0.010523357428610325, + "learning_rate": 3.688616882729343e-05, + "loss": 0.0012, + "num_input_tokens_seen": 31785616, + "step": 54795 + }, + { + "epoch": 8.16204944891272, + "grad_norm": 0.016448592767119408, + "learning_rate": 3.688331006624632e-05, + "loss": 0.1599, + "num_input_tokens_seen": 31788496, + "step": 54800 + }, + { + "epoch": 8.162794161453679, + "grad_norm": 0.2594979703426361, + "learning_rate": 3.688045110444276e-05, + "loss": 0.0005, + "num_input_tokens_seen": 31791760, + "step": 54805 + }, + { + "epoch": 8.163538873994638, + "grad_norm": 10.197343826293945, + "learning_rate": 3.687759194193105e-05, + "loss": 0.1936, + "num_input_tokens_seen": 31794576, + "step": 54810 + }, + { + "epoch": 8.164283586535598, + "grad_norm": 20.557903289794922, + "learning_rate": 3.6874732578759495e-05, + "loss": 0.0059, + "num_input_tokens_seen": 31797552, + "step": 54815 + }, + { + "epoch": 8.165028299076557, + "grad_norm": 30.7203311920166, + "learning_rate": 3.687187301497641e-05, + "loss": 0.3937, + "num_input_tokens_seen": 31800912, + "step": 54820 + }, + { + "epoch": 8.165773011617516, + "grad_norm": 0.005506809335201979, + "learning_rate": 3.6869013250630094e-05, + "loss": 0.1848, + "num_input_tokens_seen": 31803824, + "step": 54825 + }, + { + "epoch": 8.166517724158474, + "grad_norm": 0.06306101381778717, + "learning_rate": 3.686615328576886e-05, + "loss": 0.0012, + "num_input_tokens_seen": 31807504, + "step": 54830 + }, + { + "epoch": 8.167262436699435, + "grad_norm": 17.934980392456055, + "learning_rate": 3.686329312044102e-05, + "loss": 0.3043, + "num_input_tokens_seen": 31810576, + "step": 54835 + }, + { + "epoch": 8.168007149240394, + "grad_norm": 0.03371331840753555, + "learning_rate": 3.6860432754694915e-05, + "loss": 0.0011, + "num_input_tokens_seen": 31813648, + "step": 54840 + }, + { + "epoch": 8.168751861781352, + "grad_norm": 19.089466094970703, + "learning_rate": 3.685757218857885e-05, + "loss": 0.4162, + "num_input_tokens_seen": 31816240, + "step": 54845 + }, + { + "epoch": 8.169496574322311, + "grad_norm": 0.02155791036784649, + "learning_rate": 3.6854711422141144e-05, + "loss": 0.0005, + "num_input_tokens_seen": 31818800, + "step": 54850 + }, + { + "epoch": 8.17024128686327, + "grad_norm": 54.935569763183594, + "learning_rate": 3.685185045543014e-05, + "loss": 0.2056, + "num_input_tokens_seen": 31821520, + "step": 54855 + }, + { + "epoch": 8.17098599940423, + "grad_norm": 0.008333339355885983, + "learning_rate": 3.684898928849417e-05, + "loss": 0.4303, + "num_input_tokens_seen": 31824560, + "step": 54860 + }, + { + "epoch": 8.17173071194519, + "grad_norm": 0.08363735675811768, + "learning_rate": 3.6846127921381576e-05, + "loss": 0.0042, + "num_input_tokens_seen": 31827408, + "step": 54865 + }, + { + "epoch": 8.172475424486148, + "grad_norm": 0.7060031890869141, + "learning_rate": 3.684326635414068e-05, + "loss": 0.1422, + "num_input_tokens_seen": 31830160, + "step": 54870 + }, + { + "epoch": 8.173220137027107, + "grad_norm": 7.136815071105957, + "learning_rate": 3.684040458681984e-05, + "loss": 0.0571, + "num_input_tokens_seen": 31832848, + "step": 54875 + }, + { + "epoch": 8.173964849568067, + "grad_norm": 0.02613663487136364, + "learning_rate": 3.6837542619467404e-05, + "loss": 0.0036, + "num_input_tokens_seen": 31836080, + "step": 54880 + }, + { + "epoch": 8.174709562109026, + "grad_norm": 0.051565926522016525, + "learning_rate": 3.6834680452131707e-05, + "loss": 0.0009, + "num_input_tokens_seen": 31838928, + "step": 54885 + }, + { + "epoch": 8.175454274649985, + "grad_norm": 0.11163985729217529, + "learning_rate": 3.683181808486112e-05, + "loss": 0.4139, + "num_input_tokens_seen": 31841584, + "step": 54890 + }, + { + "epoch": 8.176198987190944, + "grad_norm": 0.30237993597984314, + "learning_rate": 3.682895551770399e-05, + "loss": 0.1668, + "num_input_tokens_seen": 31844496, + "step": 54895 + }, + { + "epoch": 8.176943699731904, + "grad_norm": 11.048300743103027, + "learning_rate": 3.682609275070867e-05, + "loss": 0.1218, + "num_input_tokens_seen": 31847184, + "step": 54900 + }, + { + "epoch": 8.177688412272863, + "grad_norm": 0.035785011947155, + "learning_rate": 3.682322978392354e-05, + "loss": 0.002, + "num_input_tokens_seen": 31850128, + "step": 54905 + }, + { + "epoch": 8.178433124813822, + "grad_norm": 0.8730336427688599, + "learning_rate": 3.682036661739696e-05, + "loss": 0.123, + "num_input_tokens_seen": 31852912, + "step": 54910 + }, + { + "epoch": 8.17917783735478, + "grad_norm": 0.014982189051806927, + "learning_rate": 3.68175032511773e-05, + "loss": 0.0094, + "num_input_tokens_seen": 31855920, + "step": 54915 + }, + { + "epoch": 8.17992254989574, + "grad_norm": 0.054575350135564804, + "learning_rate": 3.6814639685312936e-05, + "loss": 0.0027, + "num_input_tokens_seen": 31859088, + "step": 54920 + }, + { + "epoch": 8.1806672624367, + "grad_norm": 0.0015902145532891154, + "learning_rate": 3.6811775919852245e-05, + "loss": 0.0003, + "num_input_tokens_seen": 31861936, + "step": 54925 + }, + { + "epoch": 8.181411974977658, + "grad_norm": 0.2711132764816284, + "learning_rate": 3.6808911954843595e-05, + "loss": 0.0003, + "num_input_tokens_seen": 31864816, + "step": 54930 + }, + { + "epoch": 8.182156687518617, + "grad_norm": 0.07323487848043442, + "learning_rate": 3.680604779033538e-05, + "loss": 0.0015, + "num_input_tokens_seen": 31867728, + "step": 54935 + }, + { + "epoch": 8.182901400059578, + "grad_norm": 19.622974395751953, + "learning_rate": 3.680318342637599e-05, + "loss": 0.1081, + "num_input_tokens_seen": 31870704, + "step": 54940 + }, + { + "epoch": 8.183646112600536, + "grad_norm": 1.178238034248352, + "learning_rate": 3.6800318863013806e-05, + "loss": 0.0786, + "num_input_tokens_seen": 31873616, + "step": 54945 + }, + { + "epoch": 8.184390825141495, + "grad_norm": 0.04144580662250519, + "learning_rate": 3.6797454100297234e-05, + "loss": 0.2702, + "num_input_tokens_seen": 31876528, + "step": 54950 + }, + { + "epoch": 8.185135537682454, + "grad_norm": 0.057233311235904694, + "learning_rate": 3.679458913827467e-05, + "loss": 0.1192, + "num_input_tokens_seen": 31879088, + "step": 54955 + }, + { + "epoch": 8.185880250223414, + "grad_norm": 0.008672280237078667, + "learning_rate": 3.6791723976994505e-05, + "loss": 0.1746, + "num_input_tokens_seen": 31881840, + "step": 54960 + }, + { + "epoch": 8.186624962764373, + "grad_norm": 0.006883152760565281, + "learning_rate": 3.678885861650515e-05, + "loss": 0.1758, + "num_input_tokens_seen": 31885008, + "step": 54965 + }, + { + "epoch": 8.187369675305332, + "grad_norm": 94.39617919921875, + "learning_rate": 3.6785993056855004e-05, + "loss": 0.1214, + "num_input_tokens_seen": 31887952, + "step": 54970 + }, + { + "epoch": 8.18811438784629, + "grad_norm": 0.3833865523338318, + "learning_rate": 3.678312729809249e-05, + "loss": 0.1827, + "num_input_tokens_seen": 31891088, + "step": 54975 + }, + { + "epoch": 8.188859100387251, + "grad_norm": 0.49550697207450867, + "learning_rate": 3.6780261340266014e-05, + "loss": 0.4733, + "num_input_tokens_seen": 31893872, + "step": 54980 + }, + { + "epoch": 8.18960381292821, + "grad_norm": 0.15237584710121155, + "learning_rate": 3.677739518342399e-05, + "loss": 0.1634, + "num_input_tokens_seen": 31896688, + "step": 54985 + }, + { + "epoch": 8.190348525469169, + "grad_norm": 0.009633997455239296, + "learning_rate": 3.677452882761486e-05, + "loss": 0.0023, + "num_input_tokens_seen": 31899536, + "step": 54990 + }, + { + "epoch": 8.191093238010128, + "grad_norm": 0.008144867606461048, + "learning_rate": 3.677166227288702e-05, + "loss": 0.0008, + "num_input_tokens_seen": 31902672, + "step": 54995 + }, + { + "epoch": 8.191837950551088, + "grad_norm": 0.01618686504662037, + "learning_rate": 3.6768795519288916e-05, + "loss": 0.2825, + "num_input_tokens_seen": 31905456, + "step": 55000 + }, + { + "epoch": 8.192582663092047, + "grad_norm": 88.68907165527344, + "learning_rate": 3.6765928566868976e-05, + "loss": 0.1053, + "num_input_tokens_seen": 31908272, + "step": 55005 + }, + { + "epoch": 8.193327375633006, + "grad_norm": 25.323532104492188, + "learning_rate": 3.676306141567562e-05, + "loss": 0.0625, + "num_input_tokens_seen": 31911216, + "step": 55010 + }, + { + "epoch": 8.194072088173964, + "grad_norm": 0.37281015515327454, + "learning_rate": 3.67601940657573e-05, + "loss": 0.0036, + "num_input_tokens_seen": 31914096, + "step": 55015 + }, + { + "epoch": 8.194816800714925, + "grad_norm": 15.846817016601562, + "learning_rate": 3.675732651716246e-05, + "loss": 0.1627, + "num_input_tokens_seen": 31916848, + "step": 55020 + }, + { + "epoch": 8.195561513255884, + "grad_norm": 0.07317587733268738, + "learning_rate": 3.675445876993953e-05, + "loss": 0.0004, + "num_input_tokens_seen": 31919792, + "step": 55025 + }, + { + "epoch": 8.196306225796842, + "grad_norm": 0.0245959535241127, + "learning_rate": 3.675159082413697e-05, + "loss": 0.0003, + "num_input_tokens_seen": 31922704, + "step": 55030 + }, + { + "epoch": 8.197050938337801, + "grad_norm": 0.1418396234512329, + "learning_rate": 3.674872267980323e-05, + "loss": 0.0002, + "num_input_tokens_seen": 31925744, + "step": 55035 + }, + { + "epoch": 8.19779565087876, + "grad_norm": 0.7673089504241943, + "learning_rate": 3.674585433698676e-05, + "loss": 0.0766, + "num_input_tokens_seen": 31928560, + "step": 55040 + }, + { + "epoch": 8.19854036341972, + "grad_norm": 0.01228575874119997, + "learning_rate": 3.674298579573602e-05, + "loss": 0.1903, + "num_input_tokens_seen": 31931472, + "step": 55045 + }, + { + "epoch": 8.19928507596068, + "grad_norm": 0.005640655290335417, + "learning_rate": 3.674011705609946e-05, + "loss": 0.1105, + "num_input_tokens_seen": 31935152, + "step": 55050 + }, + { + "epoch": 8.200029788501638, + "grad_norm": 21.289752960205078, + "learning_rate": 3.6737248118125564e-05, + "loss": 0.4005, + "num_input_tokens_seen": 31938032, + "step": 55055 + }, + { + "epoch": 8.200774501042597, + "grad_norm": 0.041236285120248795, + "learning_rate": 3.673437898186279e-05, + "loss": 0.0184, + "num_input_tokens_seen": 31940656, + "step": 55060 + }, + { + "epoch": 8.201519213583557, + "grad_norm": 0.0006025466718710959, + "learning_rate": 3.6731509647359604e-05, + "loss": 0.1816, + "num_input_tokens_seen": 31943568, + "step": 55065 + }, + { + "epoch": 8.202263926124516, + "grad_norm": 0.007086377125233412, + "learning_rate": 3.6728640114664485e-05, + "loss": 0.0975, + "num_input_tokens_seen": 31946352, + "step": 55070 + }, + { + "epoch": 8.203008638665475, + "grad_norm": 0.04061255604028702, + "learning_rate": 3.672577038382592e-05, + "loss": 0.1823, + "num_input_tokens_seen": 31948976, + "step": 55075 + }, + { + "epoch": 8.203753351206434, + "grad_norm": 0.014906907454133034, + "learning_rate": 3.672290045489238e-05, + "loss": 0.0796, + "num_input_tokens_seen": 31951824, + "step": 55080 + }, + { + "epoch": 8.204498063747394, + "grad_norm": 0.008317752741277218, + "learning_rate": 3.672003032791235e-05, + "loss": 0.3266, + "num_input_tokens_seen": 31954800, + "step": 55085 + }, + { + "epoch": 8.205242776288353, + "grad_norm": 0.016468971967697144, + "learning_rate": 3.671716000293432e-05, + "loss": 0.055, + "num_input_tokens_seen": 31957520, + "step": 55090 + }, + { + "epoch": 8.205987488829312, + "grad_norm": 0.19178162515163422, + "learning_rate": 3.671428948000677e-05, + "loss": 0.1233, + "num_input_tokens_seen": 31960208, + "step": 55095 + }, + { + "epoch": 8.20673220137027, + "grad_norm": 19.412479400634766, + "learning_rate": 3.671141875917822e-05, + "loss": 0.0557, + "num_input_tokens_seen": 31963056, + "step": 55100 + }, + { + "epoch": 8.207476913911231, + "grad_norm": 0.003878913586959243, + "learning_rate": 3.6708547840497144e-05, + "loss": 0.2316, + "num_input_tokens_seen": 31965616, + "step": 55105 + }, + { + "epoch": 8.20822162645219, + "grad_norm": 1.184955358505249, + "learning_rate": 3.6705676724012055e-05, + "loss": 0.0362, + "num_input_tokens_seen": 31968336, + "step": 55110 + }, + { + "epoch": 8.208966338993148, + "grad_norm": 44.26652145385742, + "learning_rate": 3.670280540977145e-05, + "loss": 0.4263, + "num_input_tokens_seen": 31971152, + "step": 55115 + }, + { + "epoch": 8.209711051534107, + "grad_norm": 0.013783569447696209, + "learning_rate": 3.669993389782385e-05, + "loss": 0.0648, + "num_input_tokens_seen": 31973904, + "step": 55120 + }, + { + "epoch": 8.210455764075068, + "grad_norm": 6.391139984130859, + "learning_rate": 3.669706218821776e-05, + "loss": 0.1597, + "num_input_tokens_seen": 31976784, + "step": 55125 + }, + { + "epoch": 8.211200476616026, + "grad_norm": 87.33403015136719, + "learning_rate": 3.669419028100169e-05, + "loss": 0.1805, + "num_input_tokens_seen": 31980016, + "step": 55130 + }, + { + "epoch": 8.211945189156985, + "grad_norm": 0.03743574023246765, + "learning_rate": 3.6691318176224156e-05, + "loss": 0.0011, + "num_input_tokens_seen": 31982896, + "step": 55135 + }, + { + "epoch": 8.212689901697944, + "grad_norm": 0.022142639383673668, + "learning_rate": 3.6688445873933686e-05, + "loss": 0.0211, + "num_input_tokens_seen": 31985872, + "step": 55140 + }, + { + "epoch": 8.213434614238905, + "grad_norm": 0.00904888566583395, + "learning_rate": 3.66855733741788e-05, + "loss": 0.2701, + "num_input_tokens_seen": 31988624, + "step": 55145 + }, + { + "epoch": 8.214179326779863, + "grad_norm": 3.1558921337127686, + "learning_rate": 3.668270067700803e-05, + "loss": 0.0019, + "num_input_tokens_seen": 31991664, + "step": 55150 + }, + { + "epoch": 8.214924039320822, + "grad_norm": 0.24491336941719055, + "learning_rate": 3.667982778246991e-05, + "loss": 0.1629, + "num_input_tokens_seen": 31994768, + "step": 55155 + }, + { + "epoch": 8.21566875186178, + "grad_norm": 0.27382758259773254, + "learning_rate": 3.6676954690612974e-05, + "loss": 0.017, + "num_input_tokens_seen": 31997744, + "step": 55160 + }, + { + "epoch": 8.216413464402741, + "grad_norm": 64.032470703125, + "learning_rate": 3.6674081401485746e-05, + "loss": 0.1908, + "num_input_tokens_seen": 32000720, + "step": 55165 + }, + { + "epoch": 8.2171581769437, + "grad_norm": 0.0670265182852745, + "learning_rate": 3.667120791513678e-05, + "loss": 0.0006, + "num_input_tokens_seen": 32003536, + "step": 55170 + }, + { + "epoch": 8.217902889484659, + "grad_norm": 0.0546509325504303, + "learning_rate": 3.666833423161462e-05, + "loss": 0.0094, + "num_input_tokens_seen": 32006640, + "step": 55175 + }, + { + "epoch": 8.218647602025618, + "grad_norm": 38.373050689697266, + "learning_rate": 3.666546035096781e-05, + "loss": 0.4546, + "num_input_tokens_seen": 32009232, + "step": 55180 + }, + { + "epoch": 8.219392314566578, + "grad_norm": 81.65835571289062, + "learning_rate": 3.6662586273244906e-05, + "loss": 0.1595, + "num_input_tokens_seen": 32012112, + "step": 55185 + }, + { + "epoch": 8.220137027107537, + "grad_norm": 0.0035510745365172625, + "learning_rate": 3.665971199849447e-05, + "loss": 0.0018, + "num_input_tokens_seen": 32014768, + "step": 55190 + }, + { + "epoch": 8.220881739648496, + "grad_norm": 0.006262022070586681, + "learning_rate": 3.665683752676503e-05, + "loss": 0.1471, + "num_input_tokens_seen": 32017776, + "step": 55195 + }, + { + "epoch": 8.221626452189454, + "grad_norm": 25.341148376464844, + "learning_rate": 3.665396285810519e-05, + "loss": 0.1082, + "num_input_tokens_seen": 32020304, + "step": 55200 + }, + { + "epoch": 8.222371164730415, + "grad_norm": 0.0030182411428540945, + "learning_rate": 3.665108799256348e-05, + "loss": 0.0952, + "num_input_tokens_seen": 32023152, + "step": 55205 + }, + { + "epoch": 8.223115877271374, + "grad_norm": 0.03174074366688728, + "learning_rate": 3.6648212930188474e-05, + "loss": 0.0003, + "num_input_tokens_seen": 32026032, + "step": 55210 + }, + { + "epoch": 8.223860589812332, + "grad_norm": 0.004162325989454985, + "learning_rate": 3.664533767102876e-05, + "loss": 0.0071, + "num_input_tokens_seen": 32029168, + "step": 55215 + }, + { + "epoch": 8.224605302353291, + "grad_norm": 0.4912889301776886, + "learning_rate": 3.664246221513289e-05, + "loss": 0.0325, + "num_input_tokens_seen": 32031952, + "step": 55220 + }, + { + "epoch": 8.22535001489425, + "grad_norm": 5.678673267364502, + "learning_rate": 3.663958656254947e-05, + "loss": 0.1033, + "num_input_tokens_seen": 32034640, + "step": 55225 + }, + { + "epoch": 8.22609472743521, + "grad_norm": 0.08650379627943039, + "learning_rate": 3.663671071332705e-05, + "loss": 0.0514, + "num_input_tokens_seen": 32037360, + "step": 55230 + }, + { + "epoch": 8.22683943997617, + "grad_norm": 0.5244291424751282, + "learning_rate": 3.6633834667514236e-05, + "loss": 0.0209, + "num_input_tokens_seen": 32040112, + "step": 55235 + }, + { + "epoch": 8.227584152517128, + "grad_norm": 36.018516540527344, + "learning_rate": 3.663095842515961e-05, + "loss": 0.0781, + "num_input_tokens_seen": 32043152, + "step": 55240 + }, + { + "epoch": 8.228328865058087, + "grad_norm": 0.004480695817619562, + "learning_rate": 3.662808198631176e-05, + "loss": 0.1702, + "num_input_tokens_seen": 32045808, + "step": 55245 + }, + { + "epoch": 8.229073577599047, + "grad_norm": 9.748363494873047, + "learning_rate": 3.662520535101928e-05, + "loss": 0.1094, + "num_input_tokens_seen": 32048656, + "step": 55250 + }, + { + "epoch": 8.229818290140006, + "grad_norm": 0.018791144713759422, + "learning_rate": 3.662232851933079e-05, + "loss": 0.0003, + "num_input_tokens_seen": 32051696, + "step": 55255 + }, + { + "epoch": 8.230563002680965, + "grad_norm": 64.72943115234375, + "learning_rate": 3.661945149129485e-05, + "loss": 0.2036, + "num_input_tokens_seen": 32054768, + "step": 55260 + }, + { + "epoch": 8.231307715221924, + "grad_norm": 8.88622760772705, + "learning_rate": 3.661657426696009e-05, + "loss": 0.0345, + "num_input_tokens_seen": 32057392, + "step": 55265 + }, + { + "epoch": 8.232052427762884, + "grad_norm": 0.011714734137058258, + "learning_rate": 3.6613696846375115e-05, + "loss": 0.0003, + "num_input_tokens_seen": 32060112, + "step": 55270 + }, + { + "epoch": 8.232797140303843, + "grad_norm": 0.17562484741210938, + "learning_rate": 3.661081922958854e-05, + "loss": 0.0007, + "num_input_tokens_seen": 32062832, + "step": 55275 + }, + { + "epoch": 8.233541852844802, + "grad_norm": 33.07556915283203, + "learning_rate": 3.660794141664898e-05, + "loss": 0.086, + "num_input_tokens_seen": 32065776, + "step": 55280 + }, + { + "epoch": 8.23428656538576, + "grad_norm": 0.024532431736588478, + "learning_rate": 3.660506340760504e-05, + "loss": 0.0906, + "num_input_tokens_seen": 32068848, + "step": 55285 + }, + { + "epoch": 8.235031277926721, + "grad_norm": 0.0019363396568223834, + "learning_rate": 3.660218520250535e-05, + "loss": 0.0445, + "num_input_tokens_seen": 32071440, + "step": 55290 + }, + { + "epoch": 8.23577599046768, + "grad_norm": 0.3814353942871094, + "learning_rate": 3.659930680139853e-05, + "loss": 0.1164, + "num_input_tokens_seen": 32074032, + "step": 55295 + }, + { + "epoch": 8.236520703008638, + "grad_norm": 1.0304235219955444, + "learning_rate": 3.659642820433322e-05, + "loss": 0.1921, + "num_input_tokens_seen": 32076880, + "step": 55300 + }, + { + "epoch": 8.237265415549597, + "grad_norm": 55.102298736572266, + "learning_rate": 3.659354941135803e-05, + "loss": 0.3527, + "num_input_tokens_seen": 32079984, + "step": 55305 + }, + { + "epoch": 8.238010128090558, + "grad_norm": 0.011635987088084221, + "learning_rate": 3.65906704225216e-05, + "loss": 0.2251, + "num_input_tokens_seen": 32083152, + "step": 55310 + }, + { + "epoch": 8.238754840631517, + "grad_norm": 0.021734070032835007, + "learning_rate": 3.658779123787259e-05, + "loss": 0.2174, + "num_input_tokens_seen": 32086064, + "step": 55315 + }, + { + "epoch": 8.239499553172475, + "grad_norm": 1.4219635725021362, + "learning_rate": 3.6584911857459624e-05, + "loss": 0.1286, + "num_input_tokens_seen": 32088880, + "step": 55320 + }, + { + "epoch": 8.240244265713434, + "grad_norm": 3.21394944190979, + "learning_rate": 3.6582032281331345e-05, + "loss": 0.0985, + "num_input_tokens_seen": 32091792, + "step": 55325 + }, + { + "epoch": 8.240988978254395, + "grad_norm": 0.6803685426712036, + "learning_rate": 3.6579152509536395e-05, + "loss": 0.0007, + "num_input_tokens_seen": 32094544, + "step": 55330 + }, + { + "epoch": 8.241733690795353, + "grad_norm": 0.013325229287147522, + "learning_rate": 3.6576272542123435e-05, + "loss": 0.0098, + "num_input_tokens_seen": 32097392, + "step": 55335 + }, + { + "epoch": 8.242478403336312, + "grad_norm": 0.08031900227069855, + "learning_rate": 3.657339237914111e-05, + "loss": 0.0005, + "num_input_tokens_seen": 32100400, + "step": 55340 + }, + { + "epoch": 8.24322311587727, + "grad_norm": 149.40249633789062, + "learning_rate": 3.657051202063809e-05, + "loss": 0.0952, + "num_input_tokens_seen": 32103344, + "step": 55345 + }, + { + "epoch": 8.243967828418231, + "grad_norm": 115.95681762695312, + "learning_rate": 3.656763146666303e-05, + "loss": 0.3013, + "num_input_tokens_seen": 32106576, + "step": 55350 + }, + { + "epoch": 8.24471254095919, + "grad_norm": 0.0045804050751030445, + "learning_rate": 3.6564750717264595e-05, + "loss": 0.0307, + "num_input_tokens_seen": 32109552, + "step": 55355 + }, + { + "epoch": 8.245457253500149, + "grad_norm": 0.06155640631914139, + "learning_rate": 3.656186977249145e-05, + "loss": 0.1358, + "num_input_tokens_seen": 32112400, + "step": 55360 + }, + { + "epoch": 8.246201966041108, + "grad_norm": 0.0026920491363853216, + "learning_rate": 3.655898863239226e-05, + "loss": 0.0004, + "num_input_tokens_seen": 32115312, + "step": 55365 + }, + { + "epoch": 8.246946678582066, + "grad_norm": 0.6918419599533081, + "learning_rate": 3.655610729701571e-05, + "loss": 0.0568, + "num_input_tokens_seen": 32118224, + "step": 55370 + }, + { + "epoch": 8.247691391123027, + "grad_norm": 0.005777292884886265, + "learning_rate": 3.655322576641047e-05, + "loss": 0.2307, + "num_input_tokens_seen": 32121072, + "step": 55375 + }, + { + "epoch": 8.248436103663986, + "grad_norm": 26.4240665435791, + "learning_rate": 3.655034404062522e-05, + "loss": 0.142, + "num_input_tokens_seen": 32124112, + "step": 55380 + }, + { + "epoch": 8.249180816204944, + "grad_norm": 0.006763457786291838, + "learning_rate": 3.654746211970865e-05, + "loss": 0.0002, + "num_input_tokens_seen": 32126960, + "step": 55385 + }, + { + "epoch": 8.249925528745903, + "grad_norm": 0.8492786884307861, + "learning_rate": 3.654458000370945e-05, + "loss": 0.0009, + "num_input_tokens_seen": 32129936, + "step": 55390 + }, + { + "epoch": 8.250670241286864, + "grad_norm": 0.0022293380461633205, + "learning_rate": 3.65416976926763e-05, + "loss": 0.2294, + "num_input_tokens_seen": 32132560, + "step": 55395 + }, + { + "epoch": 8.251414953827823, + "grad_norm": 0.026681289076805115, + "learning_rate": 3.653881518665789e-05, + "loss": 0.0002, + "num_input_tokens_seen": 32135600, + "step": 55400 + }, + { + "epoch": 8.252159666368781, + "grad_norm": 0.15114015340805054, + "learning_rate": 3.6535932485702933e-05, + "loss": 0.0355, + "num_input_tokens_seen": 32138512, + "step": 55405 + }, + { + "epoch": 8.25290437890974, + "grad_norm": 0.006741710938513279, + "learning_rate": 3.6533049589860115e-05, + "loss": 0.0003, + "num_input_tokens_seen": 32141488, + "step": 55410 + }, + { + "epoch": 8.2536490914507, + "grad_norm": 30.510601043701172, + "learning_rate": 3.653016649917815e-05, + "loss": 0.2792, + "num_input_tokens_seen": 32144464, + "step": 55415 + }, + { + "epoch": 8.25439380399166, + "grad_norm": 89.98735046386719, + "learning_rate": 3.652728321370575e-05, + "loss": 0.1962, + "num_input_tokens_seen": 32147472, + "step": 55420 + }, + { + "epoch": 8.255138516532618, + "grad_norm": 0.06179715692996979, + "learning_rate": 3.6524399733491605e-05, + "loss": 0.0006, + "num_input_tokens_seen": 32150256, + "step": 55425 + }, + { + "epoch": 8.255883229073577, + "grad_norm": 58.135894775390625, + "learning_rate": 3.652151605858444e-05, + "loss": 0.0453, + "num_input_tokens_seen": 32153296, + "step": 55430 + }, + { + "epoch": 8.256627941614537, + "grad_norm": 0.6179900765419006, + "learning_rate": 3.651863218903297e-05, + "loss": 0.1691, + "num_input_tokens_seen": 32156304, + "step": 55435 + }, + { + "epoch": 8.257372654155496, + "grad_norm": 19.220905303955078, + "learning_rate": 3.651574812488592e-05, + "loss": 0.0657, + "num_input_tokens_seen": 32158896, + "step": 55440 + }, + { + "epoch": 8.258117366696455, + "grad_norm": 0.006688289809972048, + "learning_rate": 3.651286386619201e-05, + "loss": 0.6252, + "num_input_tokens_seen": 32161680, + "step": 55445 + }, + { + "epoch": 8.258862079237414, + "grad_norm": 57.695133209228516, + "learning_rate": 3.650997941299996e-05, + "loss": 0.422, + "num_input_tokens_seen": 32164752, + "step": 55450 + }, + { + "epoch": 8.259606791778374, + "grad_norm": 0.047141578048467636, + "learning_rate": 3.650709476535852e-05, + "loss": 0.3177, + "num_input_tokens_seen": 32167600, + "step": 55455 + }, + { + "epoch": 8.260351504319333, + "grad_norm": 0.011652559973299503, + "learning_rate": 3.6504209923316396e-05, + "loss": 0.0399, + "num_input_tokens_seen": 32170384, + "step": 55460 + }, + { + "epoch": 8.261096216860292, + "grad_norm": 0.0016313799424096942, + "learning_rate": 3.650132488692234e-05, + "loss": 0.078, + "num_input_tokens_seen": 32173392, + "step": 55465 + }, + { + "epoch": 8.26184092940125, + "grad_norm": 58.543033599853516, + "learning_rate": 3.649843965622509e-05, + "loss": 0.2121, + "num_input_tokens_seen": 32176464, + "step": 55470 + }, + { + "epoch": 8.262585641942211, + "grad_norm": 0.011355343274772167, + "learning_rate": 3.6495554231273386e-05, + "loss": 0.0017, + "num_input_tokens_seen": 32179632, + "step": 55475 + }, + { + "epoch": 8.26333035448317, + "grad_norm": 2.931547164916992, + "learning_rate": 3.6492668612115986e-05, + "loss": 0.0532, + "num_input_tokens_seen": 32182512, + "step": 55480 + }, + { + "epoch": 8.264075067024129, + "grad_norm": 6.096297740936279, + "learning_rate": 3.648978279880162e-05, + "loss": 0.0323, + "num_input_tokens_seen": 32185264, + "step": 55485 + }, + { + "epoch": 8.264819779565087, + "grad_norm": 0.021722303703427315, + "learning_rate": 3.648689679137906e-05, + "loss": 0.1779, + "num_input_tokens_seen": 32188272, + "step": 55490 + }, + { + "epoch": 8.265564492106048, + "grad_norm": 0.03340922296047211, + "learning_rate": 3.648401058989705e-05, + "loss": 0.3429, + "num_input_tokens_seen": 32191248, + "step": 55495 + }, + { + "epoch": 8.266309204647007, + "grad_norm": 0.34879258275032043, + "learning_rate": 3.6481124194404345e-05, + "loss": 0.0028, + "num_input_tokens_seen": 32193840, + "step": 55500 + }, + { + "epoch": 8.267053917187965, + "grad_norm": 0.2586355209350586, + "learning_rate": 3.6478237604949725e-05, + "loss": 0.1921, + "num_input_tokens_seen": 32196848, + "step": 55505 + }, + { + "epoch": 8.267798629728924, + "grad_norm": 0.016072094440460205, + "learning_rate": 3.647535082158194e-05, + "loss": 0.0007, + "num_input_tokens_seen": 32199568, + "step": 55510 + }, + { + "epoch": 8.268543342269885, + "grad_norm": 0.005770782474428415, + "learning_rate": 3.6472463844349776e-05, + "loss": 0.006, + "num_input_tokens_seen": 32202288, + "step": 55515 + }, + { + "epoch": 8.269288054810843, + "grad_norm": 0.015740659087896347, + "learning_rate": 3.646957667330199e-05, + "loss": 0.1741, + "num_input_tokens_seen": 32205136, + "step": 55520 + }, + { + "epoch": 8.270032767351802, + "grad_norm": 0.007379376795142889, + "learning_rate": 3.6466689308487364e-05, + "loss": 0.2149, + "num_input_tokens_seen": 32207952, + "step": 55525 + }, + { + "epoch": 8.270777479892761, + "grad_norm": 11.928914070129395, + "learning_rate": 3.646380174995468e-05, + "loss": 0.0488, + "num_input_tokens_seen": 32210704, + "step": 55530 + }, + { + "epoch": 8.271522192433721, + "grad_norm": 0.013834839686751366, + "learning_rate": 3.646091399775271e-05, + "loss": 0.0014, + "num_input_tokens_seen": 32213392, + "step": 55535 + }, + { + "epoch": 8.27226690497468, + "grad_norm": 0.010599211789667606, + "learning_rate": 3.645802605193025e-05, + "loss": 0.1696, + "num_input_tokens_seen": 32216336, + "step": 55540 + }, + { + "epoch": 8.273011617515639, + "grad_norm": 28.753347396850586, + "learning_rate": 3.645513791253608e-05, + "loss": 0.2387, + "num_input_tokens_seen": 32219536, + "step": 55545 + }, + { + "epoch": 8.273756330056598, + "grad_norm": 0.0631803423166275, + "learning_rate": 3.6452249579619005e-05, + "loss": 0.0736, + "num_input_tokens_seen": 32222224, + "step": 55550 + }, + { + "epoch": 8.274501042597556, + "grad_norm": 0.1025458574295044, + "learning_rate": 3.644936105322781e-05, + "loss": 0.0472, + "num_input_tokens_seen": 32225072, + "step": 55555 + }, + { + "epoch": 8.275245755138517, + "grad_norm": 37.06667709350586, + "learning_rate": 3.644647233341129e-05, + "loss": 0.0168, + "num_input_tokens_seen": 32227760, + "step": 55560 + }, + { + "epoch": 8.275990467679476, + "grad_norm": 0.015675906091928482, + "learning_rate": 3.644358342021826e-05, + "loss": 0.1718, + "num_input_tokens_seen": 32230704, + "step": 55565 + }, + { + "epoch": 8.276735180220435, + "grad_norm": 1.4381413459777832, + "learning_rate": 3.644069431369752e-05, + "loss": 0.1091, + "num_input_tokens_seen": 32233744, + "step": 55570 + }, + { + "epoch": 8.277479892761393, + "grad_norm": 47.22489547729492, + "learning_rate": 3.643780501389787e-05, + "loss": 0.119, + "num_input_tokens_seen": 32236656, + "step": 55575 + }, + { + "epoch": 8.278224605302354, + "grad_norm": 0.02308829501271248, + "learning_rate": 3.643491552086814e-05, + "loss": 0.0003, + "num_input_tokens_seen": 32239344, + "step": 55580 + }, + { + "epoch": 8.278969317843313, + "grad_norm": 9.803462982177734, + "learning_rate": 3.643202583465713e-05, + "loss": 0.0036, + "num_input_tokens_seen": 32242352, + "step": 55585 + }, + { + "epoch": 8.279714030384271, + "grad_norm": 126.25794219970703, + "learning_rate": 3.6429135955313664e-05, + "loss": 0.3117, + "num_input_tokens_seen": 32245392, + "step": 55590 + }, + { + "epoch": 8.28045874292523, + "grad_norm": 5.546287536621094, + "learning_rate": 3.6426245882886554e-05, + "loss": 0.0139, + "num_input_tokens_seen": 32248272, + "step": 55595 + }, + { + "epoch": 8.28120345546619, + "grad_norm": 0.14377658069133759, + "learning_rate": 3.6423355617424634e-05, + "loss": 0.0008, + "num_input_tokens_seen": 32250928, + "step": 55600 + }, + { + "epoch": 8.28194816800715, + "grad_norm": 0.005545423366129398, + "learning_rate": 3.6420465158976746e-05, + "loss": 0.1399, + "num_input_tokens_seen": 32254000, + "step": 55605 + }, + { + "epoch": 8.282692880548108, + "grad_norm": 0.20851825177669525, + "learning_rate": 3.6417574507591694e-05, + "loss": 0.0003, + "num_input_tokens_seen": 32257008, + "step": 55610 + }, + { + "epoch": 8.283437593089067, + "grad_norm": 0.019014732912182808, + "learning_rate": 3.641468366331833e-05, + "loss": 0.0195, + "num_input_tokens_seen": 32259792, + "step": 55615 + }, + { + "epoch": 8.284182305630027, + "grad_norm": 0.00379466125741601, + "learning_rate": 3.641179262620548e-05, + "loss": 0.0173, + "num_input_tokens_seen": 32262928, + "step": 55620 + }, + { + "epoch": 8.284927018170986, + "grad_norm": 0.9898366928100586, + "learning_rate": 3.6408901396301995e-05, + "loss": 0.0917, + "num_input_tokens_seen": 32265840, + "step": 55625 + }, + { + "epoch": 8.285671730711945, + "grad_norm": 0.4229317903518677, + "learning_rate": 3.640600997365672e-05, + "loss": 0.1093, + "num_input_tokens_seen": 32268752, + "step": 55630 + }, + { + "epoch": 8.286416443252904, + "grad_norm": 67.01350402832031, + "learning_rate": 3.64031183583185e-05, + "loss": 0.2947, + "num_input_tokens_seen": 32271312, + "step": 55635 + }, + { + "epoch": 8.287161155793864, + "grad_norm": 32.14003372192383, + "learning_rate": 3.6400226550336166e-05, + "loss": 0.169, + "num_input_tokens_seen": 32274352, + "step": 55640 + }, + { + "epoch": 8.287905868334823, + "grad_norm": 0.007516045588999987, + "learning_rate": 3.6397334549758614e-05, + "loss": 0.1724, + "num_input_tokens_seen": 32277104, + "step": 55645 + }, + { + "epoch": 8.288650580875782, + "grad_norm": 0.10305006802082062, + "learning_rate": 3.639444235663467e-05, + "loss": 0.2537, + "num_input_tokens_seen": 32279824, + "step": 55650 + }, + { + "epoch": 8.28939529341674, + "grad_norm": 109.53150939941406, + "learning_rate": 3.6391549971013213e-05, + "loss": 0.1146, + "num_input_tokens_seen": 32282576, + "step": 55655 + }, + { + "epoch": 8.290140005957701, + "grad_norm": 6.230197429656982, + "learning_rate": 3.638865739294308e-05, + "loss": 0.0029, + "num_input_tokens_seen": 32285520, + "step": 55660 + }, + { + "epoch": 8.29088471849866, + "grad_norm": 46.065250396728516, + "learning_rate": 3.6385764622473164e-05, + "loss": 0.2888, + "num_input_tokens_seen": 32288464, + "step": 55665 + }, + { + "epoch": 8.291629431039619, + "grad_norm": 25.081315994262695, + "learning_rate": 3.6382871659652326e-05, + "loss": 0.279, + "num_input_tokens_seen": 32291248, + "step": 55670 + }, + { + "epoch": 8.292374143580577, + "grad_norm": 5.08182954788208, + "learning_rate": 3.6379978504529436e-05, + "loss": 0.0008, + "num_input_tokens_seen": 32294352, + "step": 55675 + }, + { + "epoch": 8.293118856121538, + "grad_norm": 0.039707981050014496, + "learning_rate": 3.637708515715338e-05, + "loss": 0.0292, + "num_input_tokens_seen": 32297200, + "step": 55680 + }, + { + "epoch": 8.293863568662497, + "grad_norm": 0.003064636141061783, + "learning_rate": 3.637419161757304e-05, + "loss": 0.1147, + "num_input_tokens_seen": 32299888, + "step": 55685 + }, + { + "epoch": 8.294608281203455, + "grad_norm": 0.00664798729121685, + "learning_rate": 3.637129788583729e-05, + "loss": 0.0011, + "num_input_tokens_seen": 32302544, + "step": 55690 + }, + { + "epoch": 8.295352993744414, + "grad_norm": 32.92739486694336, + "learning_rate": 3.6368403961995014e-05, + "loss": 0.2026, + "num_input_tokens_seen": 32305008, + "step": 55695 + }, + { + "epoch": 8.296097706285375, + "grad_norm": 11.457490921020508, + "learning_rate": 3.63655098460951e-05, + "loss": 0.3262, + "num_input_tokens_seen": 32308304, + "step": 55700 + }, + { + "epoch": 8.296842418826333, + "grad_norm": 224.7195281982422, + "learning_rate": 3.636261553818646e-05, + "loss": 0.3468, + "num_input_tokens_seen": 32311120, + "step": 55705 + }, + { + "epoch": 8.297587131367292, + "grad_norm": 5.894731044769287, + "learning_rate": 3.6359721038317976e-05, + "loss": 0.1405, + "num_input_tokens_seen": 32313968, + "step": 55710 + }, + { + "epoch": 8.298331843908251, + "grad_norm": 0.049024272710084915, + "learning_rate": 3.6356826346538555e-05, + "loss": 0.24, + "num_input_tokens_seen": 32316912, + "step": 55715 + }, + { + "epoch": 8.299076556449211, + "grad_norm": 0.8397465348243713, + "learning_rate": 3.635393146289709e-05, + "loss": 0.2613, + "num_input_tokens_seen": 32319888, + "step": 55720 + }, + { + "epoch": 8.29982126899017, + "grad_norm": 0.07055122405290604, + "learning_rate": 3.6351036387442496e-05, + "loss": 0.3934, + "num_input_tokens_seen": 32322768, + "step": 55725 + }, + { + "epoch": 8.300565981531129, + "grad_norm": 0.16600577533245087, + "learning_rate": 3.634814112022368e-05, + "loss": 0.0014, + "num_input_tokens_seen": 32325552, + "step": 55730 + }, + { + "epoch": 8.301310694072088, + "grad_norm": 0.10845518857240677, + "learning_rate": 3.634524566128955e-05, + "loss": 0.0657, + "num_input_tokens_seen": 32328528, + "step": 55735 + }, + { + "epoch": 8.302055406613047, + "grad_norm": 118.92909240722656, + "learning_rate": 3.6342350010689017e-05, + "loss": 0.1826, + "num_input_tokens_seen": 32331312, + "step": 55740 + }, + { + "epoch": 8.302800119154007, + "grad_norm": 0.008346455171704292, + "learning_rate": 3.633945416847102e-05, + "loss": 0.0004, + "num_input_tokens_seen": 32334160, + "step": 55745 + }, + { + "epoch": 8.303544831694966, + "grad_norm": 12.804290771484375, + "learning_rate": 3.6336558134684465e-05, + "loss": 0.2267, + "num_input_tokens_seen": 32336944, + "step": 55750 + }, + { + "epoch": 8.304289544235925, + "grad_norm": 0.15663623809814453, + "learning_rate": 3.6333661909378286e-05, + "loss": 0.1062, + "num_input_tokens_seen": 32339952, + "step": 55755 + }, + { + "epoch": 8.305034256776883, + "grad_norm": 0.0783262848854065, + "learning_rate": 3.63307654926014e-05, + "loss": 0.0021, + "num_input_tokens_seen": 32342704, + "step": 55760 + }, + { + "epoch": 8.305778969317844, + "grad_norm": 32.09528732299805, + "learning_rate": 3.632786888440276e-05, + "loss": 0.239, + "num_input_tokens_seen": 32345616, + "step": 55765 + }, + { + "epoch": 8.306523681858803, + "grad_norm": 0.02399246208369732, + "learning_rate": 3.6324972084831284e-05, + "loss": 0.0625, + "num_input_tokens_seen": 32348432, + "step": 55770 + }, + { + "epoch": 8.307268394399761, + "grad_norm": 4.31131649017334, + "learning_rate": 3.632207509393591e-05, + "loss": 0.1667, + "num_input_tokens_seen": 32351440, + "step": 55775 + }, + { + "epoch": 8.30801310694072, + "grad_norm": 0.0048417868092656136, + "learning_rate": 3.6319177911765583e-05, + "loss": 0.0002, + "num_input_tokens_seen": 32354352, + "step": 55780 + }, + { + "epoch": 8.30875781948168, + "grad_norm": 30.840042114257812, + "learning_rate": 3.631628053836926e-05, + "loss": 0.221, + "num_input_tokens_seen": 32357264, + "step": 55785 + }, + { + "epoch": 8.30950253202264, + "grad_norm": 0.6942218542098999, + "learning_rate": 3.631338297379587e-05, + "loss": 0.0362, + "num_input_tokens_seen": 32360016, + "step": 55790 + }, + { + "epoch": 8.310247244563598, + "grad_norm": 0.02174099162220955, + "learning_rate": 3.631048521809437e-05, + "loss": 0.22, + "num_input_tokens_seen": 32363216, + "step": 55795 + }, + { + "epoch": 8.310991957104557, + "grad_norm": 66.2894287109375, + "learning_rate": 3.630758727131373e-05, + "loss": 0.4986, + "num_input_tokens_seen": 32366000, + "step": 55800 + }, + { + "epoch": 8.311736669645517, + "grad_norm": 17.63193702697754, + "learning_rate": 3.6304689133502884e-05, + "loss": 0.171, + "num_input_tokens_seen": 32369040, + "step": 55805 + }, + { + "epoch": 8.312481382186476, + "grad_norm": 0.007713749073445797, + "learning_rate": 3.630179080471081e-05, + "loss": 0.0691, + "num_input_tokens_seen": 32372208, + "step": 55810 + }, + { + "epoch": 8.313226094727435, + "grad_norm": 0.2002287060022354, + "learning_rate": 3.629889228498646e-05, + "loss": 0.0009, + "num_input_tokens_seen": 32375344, + "step": 55815 + }, + { + "epoch": 8.313970807268394, + "grad_norm": 54.261417388916016, + "learning_rate": 3.629599357437882e-05, + "loss": 0.2506, + "num_input_tokens_seen": 32378480, + "step": 55820 + }, + { + "epoch": 8.314715519809354, + "grad_norm": 0.028300439938902855, + "learning_rate": 3.629309467293685e-05, + "loss": 0.0091, + "num_input_tokens_seen": 32381328, + "step": 55825 + }, + { + "epoch": 8.315460232350313, + "grad_norm": 0.07510162144899368, + "learning_rate": 3.6290195580709505e-05, + "loss": 0.2226, + "num_input_tokens_seen": 32384240, + "step": 55830 + }, + { + "epoch": 8.316204944891272, + "grad_norm": 0.00301935407333076, + "learning_rate": 3.62872962977458e-05, + "loss": 0.0013, + "num_input_tokens_seen": 32387280, + "step": 55835 + }, + { + "epoch": 8.31694965743223, + "grad_norm": 0.05981629341840744, + "learning_rate": 3.628439682409468e-05, + "loss": 0.1422, + "num_input_tokens_seen": 32389936, + "step": 55840 + }, + { + "epoch": 8.317694369973191, + "grad_norm": 35.34733581542969, + "learning_rate": 3.628149715980516e-05, + "loss": 0.3361, + "num_input_tokens_seen": 32393008, + "step": 55845 + }, + { + "epoch": 8.31843908251415, + "grad_norm": 104.73714447021484, + "learning_rate": 3.62785973049262e-05, + "loss": 0.1603, + "num_input_tokens_seen": 32395792, + "step": 55850 + }, + { + "epoch": 8.319183795055109, + "grad_norm": 0.021680915728211403, + "learning_rate": 3.627569725950681e-05, + "loss": 0.0023, + "num_input_tokens_seen": 32398800, + "step": 55855 + }, + { + "epoch": 8.319928507596067, + "grad_norm": 48.65449523925781, + "learning_rate": 3.6272797023595974e-05, + "loss": 0.1359, + "num_input_tokens_seen": 32401328, + "step": 55860 + }, + { + "epoch": 8.320673220137028, + "grad_norm": 0.00898469053208828, + "learning_rate": 3.626989659724268e-05, + "loss": 0.1395, + "num_input_tokens_seen": 32404560, + "step": 55865 + }, + { + "epoch": 8.321417932677987, + "grad_norm": 0.03050653263926506, + "learning_rate": 3.626699598049594e-05, + "loss": 0.1696, + "num_input_tokens_seen": 32407184, + "step": 55870 + }, + { + "epoch": 8.322162645218945, + "grad_norm": 0.024977460503578186, + "learning_rate": 3.626409517340476e-05, + "loss": 0.2031, + "num_input_tokens_seen": 32410416, + "step": 55875 + }, + { + "epoch": 8.322907357759904, + "grad_norm": 0.03214678913354874, + "learning_rate": 3.626119417601814e-05, + "loss": 0.0551, + "num_input_tokens_seen": 32413648, + "step": 55880 + }, + { + "epoch": 8.323652070300863, + "grad_norm": 0.014343329705297947, + "learning_rate": 3.625829298838509e-05, + "loss": 0.0046, + "num_input_tokens_seen": 32416368, + "step": 55885 + }, + { + "epoch": 8.324396782841823, + "grad_norm": 0.010035539045929909, + "learning_rate": 3.6255391610554624e-05, + "loss": 0.1318, + "num_input_tokens_seen": 32419248, + "step": 55890 + }, + { + "epoch": 8.325141495382782, + "grad_norm": 0.010575794614851475, + "learning_rate": 3.625249004257575e-05, + "loss": 0.0337, + "num_input_tokens_seen": 32422384, + "step": 55895 + }, + { + "epoch": 8.325886207923741, + "grad_norm": 0.011512159369885921, + "learning_rate": 3.6249588284497496e-05, + "loss": 0.0163, + "num_input_tokens_seen": 32425232, + "step": 55900 + }, + { + "epoch": 8.3266309204647, + "grad_norm": 6.5294294357299805, + "learning_rate": 3.624668633636888e-05, + "loss": 0.3195, + "num_input_tokens_seen": 32428240, + "step": 55905 + }, + { + "epoch": 8.32737563300566, + "grad_norm": 2.5257110595703125, + "learning_rate": 3.624378419823893e-05, + "loss": 0.1604, + "num_input_tokens_seen": 32431088, + "step": 55910 + }, + { + "epoch": 8.328120345546619, + "grad_norm": 8.061113357543945, + "learning_rate": 3.624088187015668e-05, + "loss": 0.2106, + "num_input_tokens_seen": 32433744, + "step": 55915 + }, + { + "epoch": 8.328865058087578, + "grad_norm": 0.07221288979053497, + "learning_rate": 3.623797935217115e-05, + "loss": 0.0199, + "num_input_tokens_seen": 32436976, + "step": 55920 + }, + { + "epoch": 8.329609770628537, + "grad_norm": 70.389404296875, + "learning_rate": 3.623507664433138e-05, + "loss": 0.188, + "num_input_tokens_seen": 32439696, + "step": 55925 + }, + { + "epoch": 8.330354483169497, + "grad_norm": 0.08125228434801102, + "learning_rate": 3.6232173746686405e-05, + "loss": 0.0673, + "num_input_tokens_seen": 32442288, + "step": 55930 + }, + { + "epoch": 8.331099195710456, + "grad_norm": 0.030869994312524796, + "learning_rate": 3.6229270659285276e-05, + "loss": 0.139, + "num_input_tokens_seen": 32445136, + "step": 55935 + }, + { + "epoch": 8.331843908251415, + "grad_norm": 0.0906967967748642, + "learning_rate": 3.622636738217703e-05, + "loss": 0.0952, + "num_input_tokens_seen": 32448080, + "step": 55940 + }, + { + "epoch": 8.332588620792373, + "grad_norm": 273.6839904785156, + "learning_rate": 3.6223463915410714e-05, + "loss": 0.2155, + "num_input_tokens_seen": 32450928, + "step": 55945 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 10.40923023223877, + "learning_rate": 3.622056025903539e-05, + "loss": 0.3233, + "num_input_tokens_seen": 32453808, + "step": 55950 + }, + { + "epoch": 8.334078045874293, + "grad_norm": 0.015688536688685417, + "learning_rate": 3.62176564131001e-05, + "loss": 0.0521, + "num_input_tokens_seen": 32456816, + "step": 55955 + }, + { + "epoch": 8.334822758415251, + "grad_norm": 0.25590193271636963, + "learning_rate": 3.62147523776539e-05, + "loss": 0.0006, + "num_input_tokens_seen": 32459760, + "step": 55960 + }, + { + "epoch": 8.33556747095621, + "grad_norm": 0.08165722340345383, + "learning_rate": 3.621184815274587e-05, + "loss": 0.4565, + "num_input_tokens_seen": 32462672, + "step": 55965 + }, + { + "epoch": 8.33631218349717, + "grad_norm": 12.298406600952148, + "learning_rate": 3.620894373842505e-05, + "loss": 0.2219, + "num_input_tokens_seen": 32465648, + "step": 55970 + }, + { + "epoch": 8.33705689603813, + "grad_norm": 1.2112613916397095, + "learning_rate": 3.6206039134740525e-05, + "loss": 0.0626, + "num_input_tokens_seen": 32468496, + "step": 55975 + }, + { + "epoch": 8.337801608579088, + "grad_norm": 11.62504768371582, + "learning_rate": 3.620313434174135e-05, + "loss": 0.004, + "num_input_tokens_seen": 32471440, + "step": 55980 + }, + { + "epoch": 8.338546321120047, + "grad_norm": 32.863990783691406, + "learning_rate": 3.6200229359476614e-05, + "loss": 0.226, + "num_input_tokens_seen": 32474384, + "step": 55985 + }, + { + "epoch": 8.339291033661008, + "grad_norm": 0.15374743938446045, + "learning_rate": 3.6197324187995384e-05, + "loss": 0.0138, + "num_input_tokens_seen": 32477360, + "step": 55990 + }, + { + "epoch": 8.340035746201966, + "grad_norm": 0.013765600509941578, + "learning_rate": 3.6194418827346746e-05, + "loss": 0.0005, + "num_input_tokens_seen": 32480528, + "step": 55995 + }, + { + "epoch": 8.340780458742925, + "grad_norm": 2.147087812423706, + "learning_rate": 3.619151327757977e-05, + "loss": 0.1328, + "num_input_tokens_seen": 32483152, + "step": 56000 + }, + { + "epoch": 8.341525171283884, + "grad_norm": 1.261767029762268, + "learning_rate": 3.6188607538743556e-05, + "loss": 0.0018, + "num_input_tokens_seen": 32485744, + "step": 56005 + }, + { + "epoch": 8.342269883824844, + "grad_norm": 1.054177165031433, + "learning_rate": 3.618570161088719e-05, + "loss": 0.0012, + "num_input_tokens_seen": 32488784, + "step": 56010 + }, + { + "epoch": 8.343014596365803, + "grad_norm": 0.7713714838027954, + "learning_rate": 3.6182795494059764e-05, + "loss": 0.0009, + "num_input_tokens_seen": 32491856, + "step": 56015 + }, + { + "epoch": 8.343759308906762, + "grad_norm": 2.593179225921631, + "learning_rate": 3.617988918831038e-05, + "loss": 0.3173, + "num_input_tokens_seen": 32494672, + "step": 56020 + }, + { + "epoch": 8.34450402144772, + "grad_norm": 0.2543911337852478, + "learning_rate": 3.617698269368812e-05, + "loss": 0.0932, + "num_input_tokens_seen": 32497392, + "step": 56025 + }, + { + "epoch": 8.345248733988681, + "grad_norm": 58.89563751220703, + "learning_rate": 3.61740760102421e-05, + "loss": 0.2183, + "num_input_tokens_seen": 32500528, + "step": 56030 + }, + { + "epoch": 8.34599344652964, + "grad_norm": 0.029943717643618584, + "learning_rate": 3.617116913802143e-05, + "loss": 0.1351, + "num_input_tokens_seen": 32503536, + "step": 56035 + }, + { + "epoch": 8.346738159070599, + "grad_norm": 0.03778783231973648, + "learning_rate": 3.61682620770752e-05, + "loss": 0.3094, + "num_input_tokens_seen": 32506928, + "step": 56040 + }, + { + "epoch": 8.347482871611557, + "grad_norm": 0.014402241446077824, + "learning_rate": 3.616535482745254e-05, + "loss": 0.1791, + "num_input_tokens_seen": 32509648, + "step": 56045 + }, + { + "epoch": 8.348227584152518, + "grad_norm": 0.5520220398902893, + "learning_rate": 3.616244738920256e-05, + "loss": 0.2923, + "num_input_tokens_seen": 32512720, + "step": 56050 + }, + { + "epoch": 8.348972296693477, + "grad_norm": 0.03148435428738594, + "learning_rate": 3.615953976237438e-05, + "loss": 0.0656, + "num_input_tokens_seen": 32515504, + "step": 56055 + }, + { + "epoch": 8.349717009234435, + "grad_norm": 0.04626039043068886, + "learning_rate": 3.6156631947017106e-05, + "loss": 0.0179, + "num_input_tokens_seen": 32518256, + "step": 56060 + }, + { + "epoch": 8.350461721775394, + "grad_norm": 2.2332868576049805, + "learning_rate": 3.6153723943179876e-05, + "loss": 0.3923, + "num_input_tokens_seen": 32520816, + "step": 56065 + }, + { + "epoch": 8.351206434316353, + "grad_norm": 56.64291763305664, + "learning_rate": 3.6150815750911825e-05, + "loss": 0.2831, + "num_input_tokens_seen": 32523920, + "step": 56070 + }, + { + "epoch": 8.351951146857314, + "grad_norm": 0.006555766332894564, + "learning_rate": 3.614790737026207e-05, + "loss": 0.0335, + "num_input_tokens_seen": 32526960, + "step": 56075 + }, + { + "epoch": 8.352695859398272, + "grad_norm": 5.262070178985596, + "learning_rate": 3.614499880127975e-05, + "loss": 0.045, + "num_input_tokens_seen": 32529936, + "step": 56080 + }, + { + "epoch": 8.353440571939231, + "grad_norm": 9.477934837341309, + "learning_rate": 3.6142090044014e-05, + "loss": 0.3189, + "num_input_tokens_seen": 32533072, + "step": 56085 + }, + { + "epoch": 8.35418528448019, + "grad_norm": 0.018291911110281944, + "learning_rate": 3.613918109851397e-05, + "loss": 0.0006, + "num_input_tokens_seen": 32536080, + "step": 56090 + }, + { + "epoch": 8.35492999702115, + "grad_norm": 0.008909954689443111, + "learning_rate": 3.613627196482879e-05, + "loss": 0.1154, + "num_input_tokens_seen": 32539056, + "step": 56095 + }, + { + "epoch": 8.35567470956211, + "grad_norm": 0.045275893062353134, + "learning_rate": 3.613336264300762e-05, + "loss": 0.0125, + "num_input_tokens_seen": 32541872, + "step": 56100 + }, + { + "epoch": 8.356419422103068, + "grad_norm": 35.128387451171875, + "learning_rate": 3.613045313309959e-05, + "loss": 0.0985, + "num_input_tokens_seen": 32545872, + "step": 56105 + }, + { + "epoch": 8.357164134644027, + "grad_norm": 0.007056334521621466, + "learning_rate": 3.612754343515388e-05, + "loss": 0.2078, + "num_input_tokens_seen": 32548592, + "step": 56110 + }, + { + "epoch": 8.357908847184987, + "grad_norm": 0.012116661295294762, + "learning_rate": 3.612463354921963e-05, + "loss": 0.3114, + "num_input_tokens_seen": 32551472, + "step": 56115 + }, + { + "epoch": 8.358653559725946, + "grad_norm": 0.06079076603055, + "learning_rate": 3.6121723475346006e-05, + "loss": 0.1542, + "num_input_tokens_seen": 32554320, + "step": 56120 + }, + { + "epoch": 8.359398272266905, + "grad_norm": 0.06390266865491867, + "learning_rate": 3.6118813213582156e-05, + "loss": 0.3186, + "num_input_tokens_seen": 32557200, + "step": 56125 + }, + { + "epoch": 8.360142984807863, + "grad_norm": 8.950922012329102, + "learning_rate": 3.611590276397727e-05, + "loss": 0.0136, + "num_input_tokens_seen": 32560048, + "step": 56130 + }, + { + "epoch": 8.360887697348824, + "grad_norm": 4.053642749786377, + "learning_rate": 3.6112992126580505e-05, + "loss": 0.317, + "num_input_tokens_seen": 32563056, + "step": 56135 + }, + { + "epoch": 8.361632409889783, + "grad_norm": 85.42271423339844, + "learning_rate": 3.611008130144102e-05, + "loss": 0.0995, + "num_input_tokens_seen": 32565808, + "step": 56140 + }, + { + "epoch": 8.362377122430741, + "grad_norm": 0.23671555519104004, + "learning_rate": 3.610717028860801e-05, + "loss": 0.3091, + "num_input_tokens_seen": 32568560, + "step": 56145 + }, + { + "epoch": 8.3631218349717, + "grad_norm": 96.55757141113281, + "learning_rate": 3.6104259088130655e-05, + "loss": 0.3095, + "num_input_tokens_seen": 32571216, + "step": 56150 + }, + { + "epoch": 8.36386654751266, + "grad_norm": 36.33070755004883, + "learning_rate": 3.6101347700058116e-05, + "loss": 0.0623, + "num_input_tokens_seen": 32574160, + "step": 56155 + }, + { + "epoch": 8.36461126005362, + "grad_norm": 0.00920292641967535, + "learning_rate": 3.6098436124439594e-05, + "loss": 0.0009, + "num_input_tokens_seen": 32577168, + "step": 56160 + }, + { + "epoch": 8.365355972594578, + "grad_norm": 0.012933938764035702, + "learning_rate": 3.609552436132427e-05, + "loss": 0.0924, + "num_input_tokens_seen": 32579888, + "step": 56165 + }, + { + "epoch": 8.366100685135537, + "grad_norm": 329.5790710449219, + "learning_rate": 3.609261241076136e-05, + "loss": 0.0286, + "num_input_tokens_seen": 32582544, + "step": 56170 + }, + { + "epoch": 8.366845397676498, + "grad_norm": 0.02384745329618454, + "learning_rate": 3.608970027280001e-05, + "loss": 0.1104, + "num_input_tokens_seen": 32585200, + "step": 56175 + }, + { + "epoch": 8.367590110217456, + "grad_norm": 0.013550106436014175, + "learning_rate": 3.608678794748946e-05, + "loss": 0.0358, + "num_input_tokens_seen": 32587984, + "step": 56180 + }, + { + "epoch": 8.368334822758415, + "grad_norm": 61.7808723449707, + "learning_rate": 3.608387543487889e-05, + "loss": 0.0938, + "num_input_tokens_seen": 32590736, + "step": 56185 + }, + { + "epoch": 8.369079535299374, + "grad_norm": 6.0528717041015625, + "learning_rate": 3.6080962735017514e-05, + "loss": 0.1741, + "num_input_tokens_seen": 32593552, + "step": 56190 + }, + { + "epoch": 8.369824247840334, + "grad_norm": 35.9870719909668, + "learning_rate": 3.607804984795453e-05, + "loss": 0.1285, + "num_input_tokens_seen": 32596400, + "step": 56195 + }, + { + "epoch": 8.370568960381293, + "grad_norm": 0.037974558770656586, + "learning_rate": 3.607513677373916e-05, + "loss": 0.1305, + "num_input_tokens_seen": 32599216, + "step": 56200 + }, + { + "epoch": 8.371313672922252, + "grad_norm": 48.092918395996094, + "learning_rate": 3.60722235124206e-05, + "loss": 0.2749, + "num_input_tokens_seen": 32602096, + "step": 56205 + }, + { + "epoch": 8.37205838546321, + "grad_norm": 16.32544708251953, + "learning_rate": 3.606931006404809e-05, + "loss": 0.2856, + "num_input_tokens_seen": 32604912, + "step": 56210 + }, + { + "epoch": 8.372803098004171, + "grad_norm": 0.022154364734888077, + "learning_rate": 3.606639642867083e-05, + "loss": 0.1692, + "num_input_tokens_seen": 32607920, + "step": 56215 + }, + { + "epoch": 8.37354781054513, + "grad_norm": 6.501697540283203, + "learning_rate": 3.606348260633805e-05, + "loss": 0.1365, + "num_input_tokens_seen": 32610832, + "step": 56220 + }, + { + "epoch": 8.374292523086089, + "grad_norm": 0.20280471444129944, + "learning_rate": 3.6060568597098974e-05, + "loss": 0.2485, + "num_input_tokens_seen": 32613552, + "step": 56225 + }, + { + "epoch": 8.375037235627047, + "grad_norm": 0.010092329233884811, + "learning_rate": 3.605765440100283e-05, + "loss": 0.0753, + "num_input_tokens_seen": 32616400, + "step": 56230 + }, + { + "epoch": 8.375781948168008, + "grad_norm": 11.543680191040039, + "learning_rate": 3.605474001809886e-05, + "loss": 0.3463, + "num_input_tokens_seen": 32619504, + "step": 56235 + }, + { + "epoch": 8.376526660708967, + "grad_norm": 0.012939376756548882, + "learning_rate": 3.6051825448436286e-05, + "loss": 0.0978, + "num_input_tokens_seen": 32622416, + "step": 56240 + }, + { + "epoch": 8.377271373249926, + "grad_norm": 3.412729024887085, + "learning_rate": 3.604891069206437e-05, + "loss": 0.0064, + "num_input_tokens_seen": 32625392, + "step": 56245 + }, + { + "epoch": 8.378016085790884, + "grad_norm": 49.423797607421875, + "learning_rate": 3.6045995749032326e-05, + "loss": 0.2983, + "num_input_tokens_seen": 32628048, + "step": 56250 + }, + { + "epoch": 8.378760798331843, + "grad_norm": 0.1378365457057953, + "learning_rate": 3.6043080619389406e-05, + "loss": 0.1991, + "num_input_tokens_seen": 32630800, + "step": 56255 + }, + { + "epoch": 8.379505510872804, + "grad_norm": 51.59907531738281, + "learning_rate": 3.604016530318487e-05, + "loss": 0.1652, + "num_input_tokens_seen": 32634064, + "step": 56260 + }, + { + "epoch": 8.380250223413762, + "grad_norm": 0.04938500002026558, + "learning_rate": 3.6037249800467957e-05, + "loss": 0.0061, + "num_input_tokens_seen": 32636816, + "step": 56265 + }, + { + "epoch": 8.380994935954721, + "grad_norm": 0.006678703241050243, + "learning_rate": 3.6034334111287926e-05, + "loss": 0.0576, + "num_input_tokens_seen": 32639792, + "step": 56270 + }, + { + "epoch": 8.38173964849568, + "grad_norm": 2.8421761989593506, + "learning_rate": 3.603141823569404e-05, + "loss": 0.2776, + "num_input_tokens_seen": 32642608, + "step": 56275 + }, + { + "epoch": 8.38248436103664, + "grad_norm": 5.343212604522705, + "learning_rate": 3.602850217373555e-05, + "loss": 0.1005, + "num_input_tokens_seen": 32645520, + "step": 56280 + }, + { + "epoch": 8.3832290735776, + "grad_norm": 0.05901473015546799, + "learning_rate": 3.602558592546172e-05, + "loss": 0.0074, + "num_input_tokens_seen": 32648528, + "step": 56285 + }, + { + "epoch": 8.383973786118558, + "grad_norm": 61.091209411621094, + "learning_rate": 3.602266949092184e-05, + "loss": 0.2997, + "num_input_tokens_seen": 32651728, + "step": 56290 + }, + { + "epoch": 8.384718498659517, + "grad_norm": 7.338757038116455, + "learning_rate": 3.6019752870165145e-05, + "loss": 0.004, + "num_input_tokens_seen": 32654448, + "step": 56295 + }, + { + "epoch": 8.385463211200477, + "grad_norm": 42.54311752319336, + "learning_rate": 3.601683606324093e-05, + "loss": 0.2332, + "num_input_tokens_seen": 32657360, + "step": 56300 + }, + { + "epoch": 8.386207923741436, + "grad_norm": 0.1741250604391098, + "learning_rate": 3.601391907019847e-05, + "loss": 0.2816, + "num_input_tokens_seen": 32660432, + "step": 56305 + }, + { + "epoch": 8.386952636282395, + "grad_norm": 2.539052963256836, + "learning_rate": 3.601100189108704e-05, + "loss": 0.0833, + "num_input_tokens_seen": 32663536, + "step": 56310 + }, + { + "epoch": 8.387697348823353, + "grad_norm": 27.95963478088379, + "learning_rate": 3.600808452595592e-05, + "loss": 0.0293, + "num_input_tokens_seen": 32666576, + "step": 56315 + }, + { + "epoch": 8.388442061364314, + "grad_norm": 0.04741689935326576, + "learning_rate": 3.6005166974854406e-05, + "loss": 0.0005, + "num_input_tokens_seen": 32669392, + "step": 56320 + }, + { + "epoch": 8.389186773905273, + "grad_norm": 0.051765840500593185, + "learning_rate": 3.6002249237831774e-05, + "loss": 0.0003, + "num_input_tokens_seen": 32672080, + "step": 56325 + }, + { + "epoch": 8.389931486446232, + "grad_norm": 0.00207300903275609, + "learning_rate": 3.599933131493733e-05, + "loss": 0.0089, + "num_input_tokens_seen": 32674864, + "step": 56330 + }, + { + "epoch": 8.39067619898719, + "grad_norm": 0.49754294753074646, + "learning_rate": 3.599641320622036e-05, + "loss": 0.001, + "num_input_tokens_seen": 32677968, + "step": 56335 + }, + { + "epoch": 8.39142091152815, + "grad_norm": 29.558496475219727, + "learning_rate": 3.599349491173016e-05, + "loss": 0.1607, + "num_input_tokens_seen": 32680848, + "step": 56340 + }, + { + "epoch": 8.39216562406911, + "grad_norm": 0.019643502309918404, + "learning_rate": 3.5990576431516044e-05, + "loss": 0.142, + "num_input_tokens_seen": 32683536, + "step": 56345 + }, + { + "epoch": 8.392910336610068, + "grad_norm": 0.12280087918043137, + "learning_rate": 3.598765776562731e-05, + "loss": 0.002, + "num_input_tokens_seen": 32686320, + "step": 56350 + }, + { + "epoch": 8.393655049151027, + "grad_norm": 0.035870034247636795, + "learning_rate": 3.598473891411326e-05, + "loss": 0.2327, + "num_input_tokens_seen": 32689136, + "step": 56355 + }, + { + "epoch": 8.394399761691988, + "grad_norm": 0.025863490998744965, + "learning_rate": 3.598181987702321e-05, + "loss": 0.0004, + "num_input_tokens_seen": 32691728, + "step": 56360 + }, + { + "epoch": 8.395144474232946, + "grad_norm": 0.8209325075149536, + "learning_rate": 3.5978900654406476e-05, + "loss": 0.3302, + "num_input_tokens_seen": 32694608, + "step": 56365 + }, + { + "epoch": 8.395889186773905, + "grad_norm": 0.04218900576233864, + "learning_rate": 3.597598124631239e-05, + "loss": 0.0013, + "num_input_tokens_seen": 32697424, + "step": 56370 + }, + { + "epoch": 8.396633899314864, + "grad_norm": 0.021856458857655525, + "learning_rate": 3.5973061652790237e-05, + "loss": 0.0003, + "num_input_tokens_seen": 32700208, + "step": 56375 + }, + { + "epoch": 8.397378611855824, + "grad_norm": 0.07526987791061401, + "learning_rate": 3.597014187388936e-05, + "loss": 0.2666, + "num_input_tokens_seen": 32703216, + "step": 56380 + }, + { + "epoch": 8.398123324396783, + "grad_norm": 96.97686767578125, + "learning_rate": 3.5967221909659095e-05, + "loss": 0.3703, + "num_input_tokens_seen": 32705968, + "step": 56385 + }, + { + "epoch": 8.398868036937742, + "grad_norm": 22.908531188964844, + "learning_rate": 3.596430176014875e-05, + "loss": 0.3428, + "num_input_tokens_seen": 32709040, + "step": 56390 + }, + { + "epoch": 8.3996127494787, + "grad_norm": 0.028981242328882217, + "learning_rate": 3.596138142540768e-05, + "loss": 0.0008, + "num_input_tokens_seen": 32711984, + "step": 56395 + }, + { + "epoch": 8.400357462019661, + "grad_norm": 3.340461492538452, + "learning_rate": 3.5958460905485216e-05, + "loss": 0.0021, + "num_input_tokens_seen": 32714704, + "step": 56400 + }, + { + "epoch": 8.40110217456062, + "grad_norm": 0.07259073108434677, + "learning_rate": 3.595554020043068e-05, + "loss": 0.2984, + "num_input_tokens_seen": 32717328, + "step": 56405 + }, + { + "epoch": 8.401846887101579, + "grad_norm": 17.493921279907227, + "learning_rate": 3.5952619310293435e-05, + "loss": 0.0343, + "num_input_tokens_seen": 32720528, + "step": 56410 + }, + { + "epoch": 8.402591599642538, + "grad_norm": 0.006170653272420168, + "learning_rate": 3.594969823512282e-05, + "loss": 0.599, + "num_input_tokens_seen": 32723248, + "step": 56415 + }, + { + "epoch": 8.403336312183498, + "grad_norm": 0.011718488298356533, + "learning_rate": 3.5946776974968174e-05, + "loss": 0.1546, + "num_input_tokens_seen": 32726416, + "step": 56420 + }, + { + "epoch": 8.404081024724457, + "grad_norm": 0.009285466745495796, + "learning_rate": 3.5943855529878865e-05, + "loss": 0.201, + "num_input_tokens_seen": 32729520, + "step": 56425 + }, + { + "epoch": 8.404825737265416, + "grad_norm": 13.52450942993164, + "learning_rate": 3.594093389990424e-05, + "loss": 0.2338, + "num_input_tokens_seen": 32732496, + "step": 56430 + }, + { + "epoch": 8.405570449806374, + "grad_norm": 5.304790496826172, + "learning_rate": 3.593801208509365e-05, + "loss": 0.1079, + "num_input_tokens_seen": 32735184, + "step": 56435 + }, + { + "epoch": 8.406315162347333, + "grad_norm": 53.26766586303711, + "learning_rate": 3.593509008549646e-05, + "loss": 0.1683, + "num_input_tokens_seen": 32737776, + "step": 56440 + }, + { + "epoch": 8.407059874888294, + "grad_norm": 7.482316017150879, + "learning_rate": 3.593216790116205e-05, + "loss": 0.1234, + "num_input_tokens_seen": 32740528, + "step": 56445 + }, + { + "epoch": 8.407804587429252, + "grad_norm": 21.240205764770508, + "learning_rate": 3.5929245532139773e-05, + "loss": 0.3305, + "num_input_tokens_seen": 32743504, + "step": 56450 + }, + { + "epoch": 8.408549299970211, + "grad_norm": 1.3814444541931152, + "learning_rate": 3.5926322978478985e-05, + "loss": 0.1289, + "num_input_tokens_seen": 32746480, + "step": 56455 + }, + { + "epoch": 8.40929401251117, + "grad_norm": 0.056055083870887756, + "learning_rate": 3.592340024022909e-05, + "loss": 0.1304, + "num_input_tokens_seen": 32749296, + "step": 56460 + }, + { + "epoch": 8.41003872505213, + "grad_norm": 0.10325392335653305, + "learning_rate": 3.592047731743944e-05, + "loss": 0.1773, + "num_input_tokens_seen": 32752368, + "step": 56465 + }, + { + "epoch": 8.41078343759309, + "grad_norm": 10.944153785705566, + "learning_rate": 3.591755421015943e-05, + "loss": 0.2538, + "num_input_tokens_seen": 32755216, + "step": 56470 + }, + { + "epoch": 8.411528150134048, + "grad_norm": 0.12722885608673096, + "learning_rate": 3.591463091843844e-05, + "loss": 0.0853, + "num_input_tokens_seen": 32758064, + "step": 56475 + }, + { + "epoch": 8.412272862675007, + "grad_norm": 0.012364804744720459, + "learning_rate": 3.591170744232585e-05, + "loss": 0.1267, + "num_input_tokens_seen": 32761168, + "step": 56480 + }, + { + "epoch": 8.413017575215967, + "grad_norm": 0.007705143187195063, + "learning_rate": 3.590878378187106e-05, + "loss": 0.0203, + "num_input_tokens_seen": 32764112, + "step": 56485 + }, + { + "epoch": 8.413762287756926, + "grad_norm": 0.0939694195985794, + "learning_rate": 3.5905859937123445e-05, + "loss": 0.1918, + "num_input_tokens_seen": 32767248, + "step": 56490 + }, + { + "epoch": 8.414507000297885, + "grad_norm": 12.417545318603516, + "learning_rate": 3.5902935908132416e-05, + "loss": 0.3757, + "num_input_tokens_seen": 32770320, + "step": 56495 + }, + { + "epoch": 8.415251712838844, + "grad_norm": 0.0036373953334987164, + "learning_rate": 3.5900011694947364e-05, + "loss": 0.0022, + "num_input_tokens_seen": 32773264, + "step": 56500 + }, + { + "epoch": 8.415996425379804, + "grad_norm": 0.029467960819602013, + "learning_rate": 3.5897087297617694e-05, + "loss": 0.2009, + "num_input_tokens_seen": 32776464, + "step": 56505 + }, + { + "epoch": 8.416741137920763, + "grad_norm": 0.035964470356702805, + "learning_rate": 3.589416271619281e-05, + "loss": 0.0002, + "num_input_tokens_seen": 32779440, + "step": 56510 + }, + { + "epoch": 8.417485850461722, + "grad_norm": 0.03351482003927231, + "learning_rate": 3.589123795072212e-05, + "loss": 0.1741, + "num_input_tokens_seen": 32782576, + "step": 56515 + }, + { + "epoch": 8.41823056300268, + "grad_norm": 0.05023947358131409, + "learning_rate": 3.5888313001255034e-05, + "loss": 0.0036, + "num_input_tokens_seen": 32785392, + "step": 56520 + }, + { + "epoch": 8.418975275543641, + "grad_norm": 0.01327473670244217, + "learning_rate": 3.588538786784096e-05, + "loss": 0.3686, + "num_input_tokens_seen": 32788496, + "step": 56525 + }, + { + "epoch": 8.4197199880846, + "grad_norm": 5.213807582855225, + "learning_rate": 3.5882462550529325e-05, + "loss": 0.0019, + "num_input_tokens_seen": 32791280, + "step": 56530 + }, + { + "epoch": 8.420464700625558, + "grad_norm": 0.01391566451638937, + "learning_rate": 3.587953704936955e-05, + "loss": 0.199, + "num_input_tokens_seen": 32793904, + "step": 56535 + }, + { + "epoch": 8.421209413166517, + "grad_norm": 0.01922360062599182, + "learning_rate": 3.587661136441105e-05, + "loss": 0.0169, + "num_input_tokens_seen": 32796848, + "step": 56540 + }, + { + "epoch": 8.421954125707478, + "grad_norm": 6.667757987976074, + "learning_rate": 3.587368549570326e-05, + "loss": 0.0908, + "num_input_tokens_seen": 32799856, + "step": 56545 + }, + { + "epoch": 8.422698838248436, + "grad_norm": 0.01183977723121643, + "learning_rate": 3.58707594432956e-05, + "loss": 0.0009, + "num_input_tokens_seen": 32802864, + "step": 56550 + }, + { + "epoch": 8.423443550789395, + "grad_norm": 0.63404381275177, + "learning_rate": 3.586783320723751e-05, + "loss": 0.0009, + "num_input_tokens_seen": 32805840, + "step": 56555 + }, + { + "epoch": 8.424188263330354, + "grad_norm": 0.024309584870934486, + "learning_rate": 3.586490678757842e-05, + "loss": 0.2324, + "num_input_tokens_seen": 32808976, + "step": 56560 + }, + { + "epoch": 8.424932975871315, + "grad_norm": 8.286070823669434, + "learning_rate": 3.5861980184367775e-05, + "loss": 0.002, + "num_input_tokens_seen": 32812176, + "step": 56565 + }, + { + "epoch": 8.425677688412273, + "grad_norm": 194.83766174316406, + "learning_rate": 3.5859053397655014e-05, + "loss": 0.5488, + "num_input_tokens_seen": 32815344, + "step": 56570 + }, + { + "epoch": 8.426422400953232, + "grad_norm": 0.5503921508789062, + "learning_rate": 3.585612642748958e-05, + "loss": 0.0619, + "num_input_tokens_seen": 32818384, + "step": 56575 + }, + { + "epoch": 8.42716711349419, + "grad_norm": 0.021471310406923294, + "learning_rate": 3.585319927392093e-05, + "loss": 0.0515, + "num_input_tokens_seen": 32821488, + "step": 56580 + }, + { + "epoch": 8.42791182603515, + "grad_norm": 16.044574737548828, + "learning_rate": 3.585027193699851e-05, + "loss": 0.0054, + "num_input_tokens_seen": 32824208, + "step": 56585 + }, + { + "epoch": 8.42865653857611, + "grad_norm": 1.2867101430892944, + "learning_rate": 3.5847344416771766e-05, + "loss": 0.0063, + "num_input_tokens_seen": 32826832, + "step": 56590 + }, + { + "epoch": 8.429401251117069, + "grad_norm": 26.237285614013672, + "learning_rate": 3.584441671329016e-05, + "loss": 0.2526, + "num_input_tokens_seen": 32829744, + "step": 56595 + }, + { + "epoch": 8.430145963658028, + "grad_norm": 2.8038060665130615, + "learning_rate": 3.584148882660316e-05, + "loss": 0.5254, + "num_input_tokens_seen": 32832528, + "step": 56600 + }, + { + "epoch": 8.430890676198986, + "grad_norm": 0.007761875167489052, + "learning_rate": 3.583856075676023e-05, + "loss": 0.1066, + "num_input_tokens_seen": 32835184, + "step": 56605 + }, + { + "epoch": 8.431635388739947, + "grad_norm": 0.022533243522047997, + "learning_rate": 3.5835632503810834e-05, + "loss": 0.0009, + "num_input_tokens_seen": 32838000, + "step": 56610 + }, + { + "epoch": 8.432380101280906, + "grad_norm": 0.8977166414260864, + "learning_rate": 3.5832704067804436e-05, + "loss": 0.1268, + "num_input_tokens_seen": 32840912, + "step": 56615 + }, + { + "epoch": 8.433124813821864, + "grad_norm": 0.20003800094127655, + "learning_rate": 3.582977544879051e-05, + "loss": 0.1733, + "num_input_tokens_seen": 32843760, + "step": 56620 + }, + { + "epoch": 8.433869526362823, + "grad_norm": 15.147982597351074, + "learning_rate": 3.5826846646818536e-05, + "loss": 0.2013, + "num_input_tokens_seen": 32846800, + "step": 56625 + }, + { + "epoch": 8.434614238903784, + "grad_norm": 0.013453326188027859, + "learning_rate": 3.582391766193799e-05, + "loss": 0.0968, + "num_input_tokens_seen": 32849616, + "step": 56630 + }, + { + "epoch": 8.435358951444742, + "grad_norm": 0.042942605912685394, + "learning_rate": 3.582098849419835e-05, + "loss": 0.3919, + "num_input_tokens_seen": 32852752, + "step": 56635 + }, + { + "epoch": 8.436103663985701, + "grad_norm": 47.83561706542969, + "learning_rate": 3.581805914364912e-05, + "loss": 0.3462, + "num_input_tokens_seen": 32855600, + "step": 56640 + }, + { + "epoch": 8.43684837652666, + "grad_norm": 0.019306836649775505, + "learning_rate": 3.581512961033977e-05, + "loss": 0.1884, + "num_input_tokens_seen": 32858512, + "step": 56645 + }, + { + "epoch": 8.43759308906762, + "grad_norm": 1.5158259868621826, + "learning_rate": 3.5812199894319795e-05, + "loss": 0.1602, + "num_input_tokens_seen": 32861680, + "step": 56650 + }, + { + "epoch": 8.43833780160858, + "grad_norm": 4.2005462646484375, + "learning_rate": 3.58092699956387e-05, + "loss": 0.127, + "num_input_tokens_seen": 32864656, + "step": 56655 + }, + { + "epoch": 8.439082514149538, + "grad_norm": 24.51205062866211, + "learning_rate": 3.580633991434597e-05, + "loss": 0.096, + "num_input_tokens_seen": 32867952, + "step": 56660 + }, + { + "epoch": 8.439827226690497, + "grad_norm": 0.00548480125144124, + "learning_rate": 3.58034096504911e-05, + "loss": 0.2917, + "num_input_tokens_seen": 32870992, + "step": 56665 + }, + { + "epoch": 8.440571939231457, + "grad_norm": 2.916442632675171, + "learning_rate": 3.580047920412362e-05, + "loss": 0.2603, + "num_input_tokens_seen": 32873744, + "step": 56670 + }, + { + "epoch": 8.441316651772416, + "grad_norm": 0.36451441049575806, + "learning_rate": 3.579754857529301e-05, + "loss": 0.0014, + "num_input_tokens_seen": 32876688, + "step": 56675 + }, + { + "epoch": 8.442061364313375, + "grad_norm": 87.2187271118164, + "learning_rate": 3.57946177640488e-05, + "loss": 0.0773, + "num_input_tokens_seen": 32879728, + "step": 56680 + }, + { + "epoch": 8.442806076854334, + "grad_norm": 3.096790313720703, + "learning_rate": 3.579168677044049e-05, + "loss": 0.0286, + "num_input_tokens_seen": 32882448, + "step": 56685 + }, + { + "epoch": 8.443550789395294, + "grad_norm": 0.4264828860759735, + "learning_rate": 3.5788755594517595e-05, + "loss": 0.078, + "num_input_tokens_seen": 32885680, + "step": 56690 + }, + { + "epoch": 8.444295501936253, + "grad_norm": 0.047243889421224594, + "learning_rate": 3.5785824236329644e-05, + "loss": 0.2622, + "num_input_tokens_seen": 32888592, + "step": 56695 + }, + { + "epoch": 8.445040214477212, + "grad_norm": 25.586191177368164, + "learning_rate": 3.578289269592615e-05, + "loss": 0.1855, + "num_input_tokens_seen": 32891536, + "step": 56700 + }, + { + "epoch": 8.44578492701817, + "grad_norm": 8.93272876739502, + "learning_rate": 3.577996097335665e-05, + "loss": 0.1023, + "num_input_tokens_seen": 32894544, + "step": 56705 + }, + { + "epoch": 8.446529639559131, + "grad_norm": 7.932480812072754, + "learning_rate": 3.577702906867066e-05, + "loss": 0.0078, + "num_input_tokens_seen": 32897520, + "step": 56710 + }, + { + "epoch": 8.44727435210009, + "grad_norm": 0.04839529097080231, + "learning_rate": 3.577409698191773e-05, + "loss": 0.0006, + "num_input_tokens_seen": 32900400, + "step": 56715 + }, + { + "epoch": 8.448019064641048, + "grad_norm": 0.07956106215715408, + "learning_rate": 3.5771164713147364e-05, + "loss": 0.0003, + "num_input_tokens_seen": 32903024, + "step": 56720 + }, + { + "epoch": 8.448763777182007, + "grad_norm": 12.875699996948242, + "learning_rate": 3.576823226240913e-05, + "loss": 0.2159, + "num_input_tokens_seen": 32906288, + "step": 56725 + }, + { + "epoch": 8.449508489722968, + "grad_norm": 0.0015307610156014562, + "learning_rate": 3.576529962975255e-05, + "loss": 0.246, + "num_input_tokens_seen": 32909072, + "step": 56730 + }, + { + "epoch": 8.450253202263927, + "grad_norm": 86.26351165771484, + "learning_rate": 3.576236681522718e-05, + "loss": 0.0419, + "num_input_tokens_seen": 32911792, + "step": 56735 + }, + { + "epoch": 8.450997914804885, + "grad_norm": 0.023721111938357353, + "learning_rate": 3.575943381888255e-05, + "loss": 0.1017, + "num_input_tokens_seen": 32914672, + "step": 56740 + }, + { + "epoch": 8.451742627345844, + "grad_norm": 23.196298599243164, + "learning_rate": 3.575650064076823e-05, + "loss": 0.2192, + "num_input_tokens_seen": 32917424, + "step": 56745 + }, + { + "epoch": 8.452487339886805, + "grad_norm": 19.686336517333984, + "learning_rate": 3.575356728093376e-05, + "loss": 0.0284, + "num_input_tokens_seen": 32920240, + "step": 56750 + }, + { + "epoch": 8.453232052427763, + "grad_norm": 12.204832077026367, + "learning_rate": 3.575063373942871e-05, + "loss": 0.2601, + "num_input_tokens_seen": 32923056, + "step": 56755 + }, + { + "epoch": 8.453976764968722, + "grad_norm": 27.627304077148438, + "learning_rate": 3.5747700016302616e-05, + "loss": 0.1763, + "num_input_tokens_seen": 32926000, + "step": 56760 + }, + { + "epoch": 8.45472147750968, + "grad_norm": 0.016034729778766632, + "learning_rate": 3.574476611160506e-05, + "loss": 0.0006, + "num_input_tokens_seen": 32928592, + "step": 56765 + }, + { + "epoch": 8.45546619005064, + "grad_norm": 17.46991729736328, + "learning_rate": 3.5741832025385596e-05, + "loss": 0.327, + "num_input_tokens_seen": 32931728, + "step": 56770 + }, + { + "epoch": 8.4562109025916, + "grad_norm": 14.719318389892578, + "learning_rate": 3.57388977576938e-05, + "loss": 0.3293, + "num_input_tokens_seen": 32934640, + "step": 56775 + }, + { + "epoch": 8.456955615132559, + "grad_norm": 0.03280527889728546, + "learning_rate": 3.5735963308579256e-05, + "loss": 0.2108, + "num_input_tokens_seen": 32937520, + "step": 56780 + }, + { + "epoch": 8.457700327673518, + "grad_norm": 32.31098556518555, + "learning_rate": 3.573302867809151e-05, + "loss": 0.1015, + "num_input_tokens_seen": 32940624, + "step": 56785 + }, + { + "epoch": 8.458445040214476, + "grad_norm": 0.03355070576071739, + "learning_rate": 3.573009386628015e-05, + "loss": 0.0036, + "num_input_tokens_seen": 32943600, + "step": 56790 + }, + { + "epoch": 8.459189752755437, + "grad_norm": 0.05320638045668602, + "learning_rate": 3.5727158873194763e-05, + "loss": 0.2657, + "num_input_tokens_seen": 32946352, + "step": 56795 + }, + { + "epoch": 8.459934465296396, + "grad_norm": 0.03320559859275818, + "learning_rate": 3.572422369888493e-05, + "loss": 0.0362, + "num_input_tokens_seen": 32949392, + "step": 56800 + }, + { + "epoch": 8.460679177837354, + "grad_norm": 0.08040518313646317, + "learning_rate": 3.5721288343400235e-05, + "loss": 0.0223, + "num_input_tokens_seen": 32952240, + "step": 56805 + }, + { + "epoch": 8.461423890378313, + "grad_norm": 0.06018607318401337, + "learning_rate": 3.571835280679027e-05, + "loss": 0.0516, + "num_input_tokens_seen": 32955216, + "step": 56810 + }, + { + "epoch": 8.462168602919274, + "grad_norm": 0.0755753293633461, + "learning_rate": 3.5715417089104634e-05, + "loss": 0.0891, + "num_input_tokens_seen": 32958128, + "step": 56815 + }, + { + "epoch": 8.462913315460233, + "grad_norm": 0.387020468711853, + "learning_rate": 3.571248119039291e-05, + "loss": 0.1681, + "num_input_tokens_seen": 32961136, + "step": 56820 + }, + { + "epoch": 8.463658028001191, + "grad_norm": 0.04205517843365669, + "learning_rate": 3.570954511070471e-05, + "loss": 0.0006, + "num_input_tokens_seen": 32963920, + "step": 56825 + }, + { + "epoch": 8.46440274054215, + "grad_norm": 0.07376248389482498, + "learning_rate": 3.570660885008962e-05, + "loss": 0.0089, + "num_input_tokens_seen": 32966864, + "step": 56830 + }, + { + "epoch": 8.46514745308311, + "grad_norm": 0.11153364926576614, + "learning_rate": 3.570367240859727e-05, + "loss": 0.1156, + "num_input_tokens_seen": 32969520, + "step": 56835 + }, + { + "epoch": 8.46589216562407, + "grad_norm": 0.022584406659007072, + "learning_rate": 3.570073578627724e-05, + "loss": 0.0009, + "num_input_tokens_seen": 32972432, + "step": 56840 + }, + { + "epoch": 8.466636878165028, + "grad_norm": 12.580001831054688, + "learning_rate": 3.5697798983179165e-05, + "loss": 0.3751, + "num_input_tokens_seen": 32975312, + "step": 56845 + }, + { + "epoch": 8.467381590705987, + "grad_norm": 5.307372570037842, + "learning_rate": 3.569486199935264e-05, + "loss": 0.1864, + "num_input_tokens_seen": 32978224, + "step": 56850 + }, + { + "epoch": 8.468126303246947, + "grad_norm": 5.385730743408203, + "learning_rate": 3.56919248348473e-05, + "loss": 0.1665, + "num_input_tokens_seen": 32981008, + "step": 56855 + }, + { + "epoch": 8.468871015787906, + "grad_norm": 0.011356149800121784, + "learning_rate": 3.568898748971275e-05, + "loss": 0.0739, + "num_input_tokens_seen": 32984304, + "step": 56860 + }, + { + "epoch": 8.469615728328865, + "grad_norm": 0.028676694259047508, + "learning_rate": 3.568604996399862e-05, + "loss": 0.0111, + "num_input_tokens_seen": 32987248, + "step": 56865 + }, + { + "epoch": 8.470360440869824, + "grad_norm": 144.7747802734375, + "learning_rate": 3.5683112257754535e-05, + "loss": 0.1927, + "num_input_tokens_seen": 32990256, + "step": 56870 + }, + { + "epoch": 8.471105153410784, + "grad_norm": 0.636313259601593, + "learning_rate": 3.568017437103013e-05, + "loss": 0.2046, + "num_input_tokens_seen": 32993104, + "step": 56875 + }, + { + "epoch": 8.471849865951743, + "grad_norm": 1.2189815044403076, + "learning_rate": 3.567723630387504e-05, + "loss": 0.0155, + "num_input_tokens_seen": 32995696, + "step": 56880 + }, + { + "epoch": 8.472594578492702, + "grad_norm": 0.02653980441391468, + "learning_rate": 3.5674298056338885e-05, + "loss": 0.0024, + "num_input_tokens_seen": 32998416, + "step": 56885 + }, + { + "epoch": 8.47333929103366, + "grad_norm": 0.018786994740366936, + "learning_rate": 3.5671359628471315e-05, + "loss": 0.0009, + "num_input_tokens_seen": 33001872, + "step": 56890 + }, + { + "epoch": 8.474084003574621, + "grad_norm": 6.398202896118164, + "learning_rate": 3.566842102032198e-05, + "loss": 0.2262, + "num_input_tokens_seen": 33004528, + "step": 56895 + }, + { + "epoch": 8.47482871611558, + "grad_norm": 72.43417358398438, + "learning_rate": 3.56654822319405e-05, + "loss": 0.3269, + "num_input_tokens_seen": 33007632, + "step": 56900 + }, + { + "epoch": 8.475573428656539, + "grad_norm": 0.011596380732953548, + "learning_rate": 3.5662543263376544e-05, + "loss": 0.3303, + "num_input_tokens_seen": 33010640, + "step": 56905 + }, + { + "epoch": 8.476318141197497, + "grad_norm": 0.01189597137272358, + "learning_rate": 3.5659604114679754e-05, + "loss": 0.175, + "num_input_tokens_seen": 33013936, + "step": 56910 + }, + { + "epoch": 8.477062853738458, + "grad_norm": 5.164111614227295, + "learning_rate": 3.565666478589979e-05, + "loss": 0.4248, + "num_input_tokens_seen": 33017104, + "step": 56915 + }, + { + "epoch": 8.477807566279417, + "grad_norm": 4.640973091125488, + "learning_rate": 3.5653725277086306e-05, + "loss": 0.1587, + "num_input_tokens_seen": 33019888, + "step": 56920 + }, + { + "epoch": 8.478552278820375, + "grad_norm": 4.633573532104492, + "learning_rate": 3.565078558828896e-05, + "loss": 0.0076, + "num_input_tokens_seen": 33022640, + "step": 56925 + }, + { + "epoch": 8.479296991361334, + "grad_norm": 6.745461463928223, + "learning_rate": 3.564784571955741e-05, + "loss": 0.1436, + "num_input_tokens_seen": 33025648, + "step": 56930 + }, + { + "epoch": 8.480041703902295, + "grad_norm": 4.699532508850098, + "learning_rate": 3.5644905670941345e-05, + "loss": 0.0457, + "num_input_tokens_seen": 33028336, + "step": 56935 + }, + { + "epoch": 8.480786416443253, + "grad_norm": 0.10476863384246826, + "learning_rate": 3.56419654424904e-05, + "loss": 0.2646, + "num_input_tokens_seen": 33031248, + "step": 56940 + }, + { + "epoch": 8.481531128984212, + "grad_norm": 10.094320297241211, + "learning_rate": 3.5639025034254274e-05, + "loss": 0.1148, + "num_input_tokens_seen": 33034416, + "step": 56945 + }, + { + "epoch": 8.482275841525171, + "grad_norm": 0.019543996080756187, + "learning_rate": 3.563608444628264e-05, + "loss": 0.2409, + "num_input_tokens_seen": 33037328, + "step": 56950 + }, + { + "epoch": 8.48302055406613, + "grad_norm": 69.70411682128906, + "learning_rate": 3.563314367862515e-05, + "loss": 0.147, + "num_input_tokens_seen": 33040016, + "step": 56955 + }, + { + "epoch": 8.48376526660709, + "grad_norm": 0.058045562356710434, + "learning_rate": 3.5630202731331515e-05, + "loss": 0.071, + "num_input_tokens_seen": 33042704, + "step": 56960 + }, + { + "epoch": 8.484509979148049, + "grad_norm": 99.96322631835938, + "learning_rate": 3.562726160445141e-05, + "loss": 0.1891, + "num_input_tokens_seen": 33045520, + "step": 56965 + }, + { + "epoch": 8.485254691689008, + "grad_norm": 1.6182173490524292, + "learning_rate": 3.562432029803452e-05, + "loss": 0.2878, + "num_input_tokens_seen": 33048432, + "step": 56970 + }, + { + "epoch": 8.485999404229966, + "grad_norm": 97.43841552734375, + "learning_rate": 3.562137881213053e-05, + "loss": 0.189, + "num_input_tokens_seen": 33051408, + "step": 56975 + }, + { + "epoch": 8.486744116770927, + "grad_norm": 3.9514548778533936, + "learning_rate": 3.5618437146789155e-05, + "loss": 0.0485, + "num_input_tokens_seen": 33054288, + "step": 56980 + }, + { + "epoch": 8.487488829311886, + "grad_norm": 0.052827369421720505, + "learning_rate": 3.561549530206007e-05, + "loss": 0.07, + "num_input_tokens_seen": 33057456, + "step": 56985 + }, + { + "epoch": 8.488233541852845, + "grad_norm": 0.003749899100512266, + "learning_rate": 3.561255327799298e-05, + "loss": 0.0021, + "num_input_tokens_seen": 33060400, + "step": 56990 + }, + { + "epoch": 8.488978254393803, + "grad_norm": 0.016186842694878578, + "learning_rate": 3.5609611074637584e-05, + "loss": 0.1704, + "num_input_tokens_seen": 33063504, + "step": 56995 + }, + { + "epoch": 8.489722966934764, + "grad_norm": 0.8573358654975891, + "learning_rate": 3.5606668692043595e-05, + "loss": 0.0019, + "num_input_tokens_seen": 33066192, + "step": 57000 + }, + { + "epoch": 8.490467679475723, + "grad_norm": 26.15671730041504, + "learning_rate": 3.5603726130260715e-05, + "loss": 0.0869, + "num_input_tokens_seen": 33069104, + "step": 57005 + }, + { + "epoch": 8.491212392016681, + "grad_norm": 53.520381927490234, + "learning_rate": 3.5600783389338674e-05, + "loss": 0.0154, + "num_input_tokens_seen": 33071920, + "step": 57010 + }, + { + "epoch": 8.49195710455764, + "grad_norm": 53.64654541015625, + "learning_rate": 3.559784046932716e-05, + "loss": 0.0195, + "num_input_tokens_seen": 33074736, + "step": 57015 + }, + { + "epoch": 8.4927018170986, + "grad_norm": 0.07549524307250977, + "learning_rate": 3.5594897370275905e-05, + "loss": 0.0885, + "num_input_tokens_seen": 33077712, + "step": 57020 + }, + { + "epoch": 8.49344652963956, + "grad_norm": 0.035899050533771515, + "learning_rate": 3.5591954092234625e-05, + "loss": 0.3004, + "num_input_tokens_seen": 33080592, + "step": 57025 + }, + { + "epoch": 8.494191242180518, + "grad_norm": 0.005045606754720211, + "learning_rate": 3.558901063525305e-05, + "loss": 0.1354, + "num_input_tokens_seen": 33083856, + "step": 57030 + }, + { + "epoch": 8.494935954721477, + "grad_norm": 0.022587383165955544, + "learning_rate": 3.55860669993809e-05, + "loss": 0.0501, + "num_input_tokens_seen": 33086608, + "step": 57035 + }, + { + "epoch": 8.495680667262437, + "grad_norm": 112.2607421875, + "learning_rate": 3.55831231846679e-05, + "loss": 0.0572, + "num_input_tokens_seen": 33089392, + "step": 57040 + }, + { + "epoch": 8.496425379803396, + "grad_norm": 0.056675564497709274, + "learning_rate": 3.55801791911638e-05, + "loss": 0.1884, + "num_input_tokens_seen": 33092432, + "step": 57045 + }, + { + "epoch": 8.497170092344355, + "grad_norm": 71.87210083007812, + "learning_rate": 3.557723501891832e-05, + "loss": 0.3078, + "num_input_tokens_seen": 33095536, + "step": 57050 + }, + { + "epoch": 8.497914804885314, + "grad_norm": 0.038428328931331635, + "learning_rate": 3.557429066798121e-05, + "loss": 0.4575, + "num_input_tokens_seen": 33098480, + "step": 57055 + }, + { + "epoch": 8.498659517426274, + "grad_norm": 0.24569518864154816, + "learning_rate": 3.5571346138402204e-05, + "loss": 0.267, + "num_input_tokens_seen": 33101456, + "step": 57060 + }, + { + "epoch": 8.499404229967233, + "grad_norm": 0.02245914936065674, + "learning_rate": 3.5568401430231045e-05, + "loss": 0.317, + "num_input_tokens_seen": 33104272, + "step": 57065 + }, + { + "epoch": 8.500148942508192, + "grad_norm": 0.06826116144657135, + "learning_rate": 3.556545654351749e-05, + "loss": 0.1606, + "num_input_tokens_seen": 33107088, + "step": 57070 + }, + { + "epoch": 8.50089365504915, + "grad_norm": 0.02244681306183338, + "learning_rate": 3.556251147831128e-05, + "loss": 0.0432, + "num_input_tokens_seen": 33109808, + "step": 57075 + }, + { + "epoch": 8.501638367590111, + "grad_norm": 0.15929986536502838, + "learning_rate": 3.5559566234662175e-05, + "loss": 0.2585, + "num_input_tokens_seen": 33112688, + "step": 57080 + }, + { + "epoch": 8.50238308013107, + "grad_norm": 81.42227172851562, + "learning_rate": 3.555662081261994e-05, + "loss": 0.0452, + "num_input_tokens_seen": 33115984, + "step": 57085 + }, + { + "epoch": 8.503127792672029, + "grad_norm": 0.028322596102952957, + "learning_rate": 3.555367521223431e-05, + "loss": 0.0042, + "num_input_tokens_seen": 33119024, + "step": 57090 + }, + { + "epoch": 8.503872505212987, + "grad_norm": 0.11786390841007233, + "learning_rate": 3.555072943355508e-05, + "loss": 0.1857, + "num_input_tokens_seen": 33122064, + "step": 57095 + }, + { + "epoch": 8.504617217753946, + "grad_norm": 2.9070112705230713, + "learning_rate": 3.554778347663199e-05, + "loss": 0.0085, + "num_input_tokens_seen": 33125296, + "step": 57100 + }, + { + "epoch": 8.505361930294907, + "grad_norm": 0.026144949719309807, + "learning_rate": 3.554483734151482e-05, + "loss": 0.0045, + "num_input_tokens_seen": 33128080, + "step": 57105 + }, + { + "epoch": 8.506106642835865, + "grad_norm": 0.01255470234900713, + "learning_rate": 3.554189102825334e-05, + "loss": 0.0009, + "num_input_tokens_seen": 33131056, + "step": 57110 + }, + { + "epoch": 8.506851355376824, + "grad_norm": 0.13093970715999603, + "learning_rate": 3.553894453689733e-05, + "loss": 0.0117, + "num_input_tokens_seen": 33133840, + "step": 57115 + }, + { + "epoch": 8.507596067917785, + "grad_norm": 77.10431671142578, + "learning_rate": 3.553599786749656e-05, + "loss": 0.1469, + "num_input_tokens_seen": 33136752, + "step": 57120 + }, + { + "epoch": 8.508340780458743, + "grad_norm": 0.13293404877185822, + "learning_rate": 3.553305102010081e-05, + "loss": 0.1622, + "num_input_tokens_seen": 33139664, + "step": 57125 + }, + { + "epoch": 8.509085492999702, + "grad_norm": 0.0840025469660759, + "learning_rate": 3.553010399475987e-05, + "loss": 0.2882, + "num_input_tokens_seen": 33142416, + "step": 57130 + }, + { + "epoch": 8.509830205540661, + "grad_norm": 1.8904578685760498, + "learning_rate": 3.552715679152353e-05, + "loss": 0.1377, + "num_input_tokens_seen": 33145584, + "step": 57135 + }, + { + "epoch": 8.51057491808162, + "grad_norm": 42.79971694946289, + "learning_rate": 3.552420941044157e-05, + "loss": 0.101, + "num_input_tokens_seen": 33148464, + "step": 57140 + }, + { + "epoch": 8.51131963062258, + "grad_norm": 0.018637454137206078, + "learning_rate": 3.5521261851563796e-05, + "loss": 0.0135, + "num_input_tokens_seen": 33151440, + "step": 57145 + }, + { + "epoch": 8.512064343163539, + "grad_norm": 169.36666870117188, + "learning_rate": 3.551831411493999e-05, + "loss": 0.2728, + "num_input_tokens_seen": 33154352, + "step": 57150 + }, + { + "epoch": 8.512809055704498, + "grad_norm": 0.08309180289506912, + "learning_rate": 3.551536620061996e-05, + "loss": 0.2942, + "num_input_tokens_seen": 33157424, + "step": 57155 + }, + { + "epoch": 8.513553768245457, + "grad_norm": 0.2454727292060852, + "learning_rate": 3.55124181086535e-05, + "loss": 0.0023, + "num_input_tokens_seen": 33160208, + "step": 57160 + }, + { + "epoch": 8.514298480786417, + "grad_norm": 81.48548126220703, + "learning_rate": 3.5509469839090426e-05, + "loss": 0.1511, + "num_input_tokens_seen": 33162672, + "step": 57165 + }, + { + "epoch": 8.515043193327376, + "grad_norm": 22.33339500427246, + "learning_rate": 3.550652139198054e-05, + "loss": 0.3207, + "num_input_tokens_seen": 33165584, + "step": 57170 + }, + { + "epoch": 8.515787905868335, + "grad_norm": 17.475831985473633, + "learning_rate": 3.550357276737365e-05, + "loss": 0.2372, + "num_input_tokens_seen": 33168400, + "step": 57175 + }, + { + "epoch": 8.516532618409293, + "grad_norm": 9.632411003112793, + "learning_rate": 3.550062396531959e-05, + "loss": 0.2096, + "num_input_tokens_seen": 33171312, + "step": 57180 + }, + { + "epoch": 8.517277330950254, + "grad_norm": 0.20070476830005646, + "learning_rate": 3.549767498586814e-05, + "loss": 0.219, + "num_input_tokens_seen": 33174224, + "step": 57185 + }, + { + "epoch": 8.518022043491213, + "grad_norm": 0.011234208010137081, + "learning_rate": 3.549472582906914e-05, + "loss": 0.0503, + "num_input_tokens_seen": 33176880, + "step": 57190 + }, + { + "epoch": 8.518766756032171, + "grad_norm": 1.2656584978103638, + "learning_rate": 3.549177649497242e-05, + "loss": 0.0418, + "num_input_tokens_seen": 33179824, + "step": 57195 + }, + { + "epoch": 8.51951146857313, + "grad_norm": 0.03798915445804596, + "learning_rate": 3.54888269836278e-05, + "loss": 0.3205, + "num_input_tokens_seen": 33182864, + "step": 57200 + }, + { + "epoch": 8.52025618111409, + "grad_norm": 48.65483474731445, + "learning_rate": 3.5485877295085105e-05, + "loss": 0.2291, + "num_input_tokens_seen": 33185488, + "step": 57205 + }, + { + "epoch": 8.52100089365505, + "grad_norm": 0.04661623388528824, + "learning_rate": 3.5482927429394184e-05, + "loss": 0.1695, + "num_input_tokens_seen": 33188656, + "step": 57210 + }, + { + "epoch": 8.521745606196008, + "grad_norm": 3.500321865081787, + "learning_rate": 3.547997738660485e-05, + "loss": 0.0059, + "num_input_tokens_seen": 33191664, + "step": 57215 + }, + { + "epoch": 8.522490318736967, + "grad_norm": 54.064598083496094, + "learning_rate": 3.547702716676694e-05, + "loss": 0.3466, + "num_input_tokens_seen": 33194320, + "step": 57220 + }, + { + "epoch": 8.523235031277927, + "grad_norm": 33.139625549316406, + "learning_rate": 3.547407676993032e-05, + "loss": 0.0746, + "num_input_tokens_seen": 33197072, + "step": 57225 + }, + { + "epoch": 8.523979743818886, + "grad_norm": 0.00585112115368247, + "learning_rate": 3.54711261961448e-05, + "loss": 0.1602, + "num_input_tokens_seen": 33199856, + "step": 57230 + }, + { + "epoch": 8.524724456359845, + "grad_norm": 0.017397025600075722, + "learning_rate": 3.5468175445460263e-05, + "loss": 0.0183, + "num_input_tokens_seen": 33202672, + "step": 57235 + }, + { + "epoch": 8.525469168900804, + "grad_norm": 357.7966613769531, + "learning_rate": 3.546522451792653e-05, + "loss": 0.0817, + "num_input_tokens_seen": 33205520, + "step": 57240 + }, + { + "epoch": 8.526213881441764, + "grad_norm": 0.019687246531248093, + "learning_rate": 3.546227341359347e-05, + "loss": 0.0181, + "num_input_tokens_seen": 33208048, + "step": 57245 + }, + { + "epoch": 8.526958593982723, + "grad_norm": 0.0034757351968437433, + "learning_rate": 3.545932213251093e-05, + "loss": 0.0021, + "num_input_tokens_seen": 33211088, + "step": 57250 + }, + { + "epoch": 8.527703306523682, + "grad_norm": 0.05925394594669342, + "learning_rate": 3.545637067472878e-05, + "loss": 0.299, + "num_input_tokens_seen": 33213808, + "step": 57255 + }, + { + "epoch": 8.52844801906464, + "grad_norm": 5.340341091156006, + "learning_rate": 3.545341904029687e-05, + "loss": 0.0008, + "num_input_tokens_seen": 33216656, + "step": 57260 + }, + { + "epoch": 8.529192731605601, + "grad_norm": 0.008143446408212185, + "learning_rate": 3.545046722926507e-05, + "loss": 0.0092, + "num_input_tokens_seen": 33219632, + "step": 57265 + }, + { + "epoch": 8.52993744414656, + "grad_norm": 0.015502313151955605, + "learning_rate": 3.544751524168325e-05, + "loss": 0.002, + "num_input_tokens_seen": 33222352, + "step": 57270 + }, + { + "epoch": 8.530682156687519, + "grad_norm": 37.857933044433594, + "learning_rate": 3.544456307760128e-05, + "loss": 0.1497, + "num_input_tokens_seen": 33225136, + "step": 57275 + }, + { + "epoch": 8.531426869228477, + "grad_norm": 0.008053197525441647, + "learning_rate": 3.5441610737069026e-05, + "loss": 0.0002, + "num_input_tokens_seen": 33227984, + "step": 57280 + }, + { + "epoch": 8.532171581769436, + "grad_norm": 0.029039019718766212, + "learning_rate": 3.543865822013637e-05, + "loss": 0.0005, + "num_input_tokens_seen": 33230800, + "step": 57285 + }, + { + "epoch": 8.532916294310397, + "grad_norm": 0.2579662501811981, + "learning_rate": 3.5435705526853196e-05, + "loss": 0.2795, + "num_input_tokens_seen": 33233584, + "step": 57290 + }, + { + "epoch": 8.533661006851355, + "grad_norm": 0.040404658764600754, + "learning_rate": 3.5432752657269384e-05, + "loss": 0.1509, + "num_input_tokens_seen": 33236336, + "step": 57295 + }, + { + "epoch": 8.534405719392314, + "grad_norm": 2.236560344696045, + "learning_rate": 3.542979961143482e-05, + "loss": 0.0011, + "num_input_tokens_seen": 33239664, + "step": 57300 + }, + { + "epoch": 8.535150431933273, + "grad_norm": 23.959123611450195, + "learning_rate": 3.542684638939939e-05, + "loss": 0.3144, + "num_input_tokens_seen": 33243248, + "step": 57305 + }, + { + "epoch": 8.535895144474233, + "grad_norm": 61.49596405029297, + "learning_rate": 3.5423892991212994e-05, + "loss": 0.3075, + "num_input_tokens_seen": 33246192, + "step": 57310 + }, + { + "epoch": 8.536639857015192, + "grad_norm": 0.02656135894358158, + "learning_rate": 3.542093941692551e-05, + "loss": 0.0003, + "num_input_tokens_seen": 33249264, + "step": 57315 + }, + { + "epoch": 8.537384569556151, + "grad_norm": 58.847354888916016, + "learning_rate": 3.541798566658685e-05, + "loss": 0.0088, + "num_input_tokens_seen": 33252272, + "step": 57320 + }, + { + "epoch": 8.53812928209711, + "grad_norm": 26.271621704101562, + "learning_rate": 3.541503174024691e-05, + "loss": 0.0139, + "num_input_tokens_seen": 33255056, + "step": 57325 + }, + { + "epoch": 8.53887399463807, + "grad_norm": 0.0030151107348501682, + "learning_rate": 3.54120776379556e-05, + "loss": 0.033, + "num_input_tokens_seen": 33257840, + "step": 57330 + }, + { + "epoch": 8.539618707179029, + "grad_norm": 22.896438598632812, + "learning_rate": 3.540912335976281e-05, + "loss": 0.2124, + "num_input_tokens_seen": 33260624, + "step": 57335 + }, + { + "epoch": 8.540363419719988, + "grad_norm": 14.413724899291992, + "learning_rate": 3.540616890571847e-05, + "loss": 0.7626, + "num_input_tokens_seen": 33263696, + "step": 57340 + }, + { + "epoch": 8.541108132260947, + "grad_norm": 0.22579997777938843, + "learning_rate": 3.540321427587249e-05, + "loss": 0.6397, + "num_input_tokens_seen": 33267024, + "step": 57345 + }, + { + "epoch": 8.541852844801907, + "grad_norm": 12.50374984741211, + "learning_rate": 3.540025947027476e-05, + "loss": 0.1129, + "num_input_tokens_seen": 33269488, + "step": 57350 + }, + { + "epoch": 8.542597557342866, + "grad_norm": 0.02712075226008892, + "learning_rate": 3.5397304488975226e-05, + "loss": 0.151, + "num_input_tokens_seen": 33272528, + "step": 57355 + }, + { + "epoch": 8.543342269883825, + "grad_norm": 9.939598083496094, + "learning_rate": 3.53943493320238e-05, + "loss": 0.1973, + "num_input_tokens_seen": 33275888, + "step": 57360 + }, + { + "epoch": 8.544086982424783, + "grad_norm": 0.17597001791000366, + "learning_rate": 3.539139399947039e-05, + "loss": 0.0032, + "num_input_tokens_seen": 33278672, + "step": 57365 + }, + { + "epoch": 8.544831694965744, + "grad_norm": 0.31767117977142334, + "learning_rate": 3.5388438491364963e-05, + "loss": 0.0804, + "num_input_tokens_seen": 33281488, + "step": 57370 + }, + { + "epoch": 8.545576407506703, + "grad_norm": 2.495445728302002, + "learning_rate": 3.538548280775742e-05, + "loss": 0.183, + "num_input_tokens_seen": 33284432, + "step": 57375 + }, + { + "epoch": 8.546321120047661, + "grad_norm": 0.023395337164402008, + "learning_rate": 3.53825269486977e-05, + "loss": 0.143, + "num_input_tokens_seen": 33287408, + "step": 57380 + }, + { + "epoch": 8.54706583258862, + "grad_norm": 44.820899963378906, + "learning_rate": 3.5379570914235735e-05, + "loss": 0.1608, + "num_input_tokens_seen": 33290320, + "step": 57385 + }, + { + "epoch": 8.54781054512958, + "grad_norm": 0.015430313535034657, + "learning_rate": 3.537661470442147e-05, + "loss": 0.1138, + "num_input_tokens_seen": 33293232, + "step": 57390 + }, + { + "epoch": 8.54855525767054, + "grad_norm": 0.016979621723294258, + "learning_rate": 3.537365831930484e-05, + "loss": 0.0062, + "num_input_tokens_seen": 33296144, + "step": 57395 + }, + { + "epoch": 8.549299970211498, + "grad_norm": 64.59880065917969, + "learning_rate": 3.53707017589358e-05, + "loss": 0.1819, + "num_input_tokens_seen": 33298704, + "step": 57400 + }, + { + "epoch": 8.550044682752457, + "grad_norm": 0.8026207685470581, + "learning_rate": 3.53677450233643e-05, + "loss": 0.2438, + "num_input_tokens_seen": 33301648, + "step": 57405 + }, + { + "epoch": 8.550789395293418, + "grad_norm": 0.013360437005758286, + "learning_rate": 3.536478811264028e-05, + "loss": 0.1319, + "num_input_tokens_seen": 33305040, + "step": 57410 + }, + { + "epoch": 8.551534107834376, + "grad_norm": 0.010230199433863163, + "learning_rate": 3.5361831026813704e-05, + "loss": 0.4979, + "num_input_tokens_seen": 33308016, + "step": 57415 + }, + { + "epoch": 8.552278820375335, + "grad_norm": 0.07418221980333328, + "learning_rate": 3.535887376593453e-05, + "loss": 0.0407, + "num_input_tokens_seen": 33311088, + "step": 57420 + }, + { + "epoch": 8.553023532916294, + "grad_norm": 0.029435575008392334, + "learning_rate": 3.53559163300527e-05, + "loss": 0.0007, + "num_input_tokens_seen": 33313904, + "step": 57425 + }, + { + "epoch": 8.553768245457253, + "grad_norm": 44.261329650878906, + "learning_rate": 3.5352958719218186e-05, + "loss": 0.1575, + "num_input_tokens_seen": 33316624, + "step": 57430 + }, + { + "epoch": 8.554512957998213, + "grad_norm": 4.0135722160339355, + "learning_rate": 3.5350000933480966e-05, + "loss": 0.1767, + "num_input_tokens_seen": 33319280, + "step": 57435 + }, + { + "epoch": 8.555257670539172, + "grad_norm": 0.5692539811134338, + "learning_rate": 3.5347042972891e-05, + "loss": 0.1566, + "num_input_tokens_seen": 33322256, + "step": 57440 + }, + { + "epoch": 8.55600238308013, + "grad_norm": 0.0031357344705611467, + "learning_rate": 3.5344084837498245e-05, + "loss": 0.1769, + "num_input_tokens_seen": 33324944, + "step": 57445 + }, + { + "epoch": 8.556747095621091, + "grad_norm": 0.15307281911373138, + "learning_rate": 3.53411265273527e-05, + "loss": 0.1142, + "num_input_tokens_seen": 33327504, + "step": 57450 + }, + { + "epoch": 8.55749180816205, + "grad_norm": 0.06213743984699249, + "learning_rate": 3.5338168042504336e-05, + "loss": 0.0365, + "num_input_tokens_seen": 33330352, + "step": 57455 + }, + { + "epoch": 8.558236520703009, + "grad_norm": 0.1180574968457222, + "learning_rate": 3.533520938300313e-05, + "loss": 0.3198, + "num_input_tokens_seen": 33333200, + "step": 57460 + }, + { + "epoch": 8.558981233243967, + "grad_norm": 0.3104506731033325, + "learning_rate": 3.533225054889906e-05, + "loss": 0.1936, + "num_input_tokens_seen": 33336368, + "step": 57465 + }, + { + "epoch": 8.559725945784926, + "grad_norm": 23.652942657470703, + "learning_rate": 3.532929154024212e-05, + "loss": 0.3443, + "num_input_tokens_seen": 33339344, + "step": 57470 + }, + { + "epoch": 8.560470658325887, + "grad_norm": 2.9503049850463867, + "learning_rate": 3.5326332357082306e-05, + "loss": 0.0115, + "num_input_tokens_seen": 33342352, + "step": 57475 + }, + { + "epoch": 8.561215370866845, + "grad_norm": 24.31581687927246, + "learning_rate": 3.532337299946959e-05, + "loss": 0.211, + "num_input_tokens_seen": 33345136, + "step": 57480 + }, + { + "epoch": 8.561960083407804, + "grad_norm": 0.02472991868853569, + "learning_rate": 3.532041346745398e-05, + "loss": 0.0007, + "num_input_tokens_seen": 33348016, + "step": 57485 + }, + { + "epoch": 8.562704795948763, + "grad_norm": 0.06794535368680954, + "learning_rate": 3.5317453761085476e-05, + "loss": 0.0996, + "num_input_tokens_seen": 33351024, + "step": 57490 + }, + { + "epoch": 8.563449508489724, + "grad_norm": 1.5736713409423828, + "learning_rate": 3.531449388041408e-05, + "loss": 0.2166, + "num_input_tokens_seen": 33353776, + "step": 57495 + }, + { + "epoch": 8.564194221030682, + "grad_norm": 0.03084801323711872, + "learning_rate": 3.5311533825489795e-05, + "loss": 0.2233, + "num_input_tokens_seen": 33356592, + "step": 57500 + }, + { + "epoch": 8.564938933571641, + "grad_norm": 78.50550079345703, + "learning_rate": 3.530857359636262e-05, + "loss": 0.1024, + "num_input_tokens_seen": 33359856, + "step": 57505 + }, + { + "epoch": 8.5656836461126, + "grad_norm": 217.509765625, + "learning_rate": 3.5305613193082575e-05, + "loss": 0.3746, + "num_input_tokens_seen": 33362800, + "step": 57510 + }, + { + "epoch": 8.56642835865356, + "grad_norm": 0.00932924635708332, + "learning_rate": 3.530265261569967e-05, + "loss": 0.4663, + "num_input_tokens_seen": 33365616, + "step": 57515 + }, + { + "epoch": 8.567173071194519, + "grad_norm": 0.014579563401639462, + "learning_rate": 3.529969186426392e-05, + "loss": 0.0749, + "num_input_tokens_seen": 33368592, + "step": 57520 + }, + { + "epoch": 8.567917783735478, + "grad_norm": 0.035005833953619, + "learning_rate": 3.529673093882534e-05, + "loss": 0.0036, + "num_input_tokens_seen": 33371408, + "step": 57525 + }, + { + "epoch": 8.568662496276437, + "grad_norm": 0.02976039983332157, + "learning_rate": 3.5293769839433956e-05, + "loss": 0.2979, + "num_input_tokens_seen": 33374000, + "step": 57530 + }, + { + "epoch": 8.569407208817397, + "grad_norm": 0.07149790227413177, + "learning_rate": 3.529080856613979e-05, + "loss": 0.0702, + "num_input_tokens_seen": 33377328, + "step": 57535 + }, + { + "epoch": 8.570151921358356, + "grad_norm": 0.06355012208223343, + "learning_rate": 3.528784711899288e-05, + "loss": 0.0363, + "num_input_tokens_seen": 33380400, + "step": 57540 + }, + { + "epoch": 8.570896633899315, + "grad_norm": 0.017043787986040115, + "learning_rate": 3.5284885498043254e-05, + "loss": 0.1039, + "num_input_tokens_seen": 33383152, + "step": 57545 + }, + { + "epoch": 8.571641346440273, + "grad_norm": 0.2599053382873535, + "learning_rate": 3.528192370334094e-05, + "loss": 0.1769, + "num_input_tokens_seen": 33385968, + "step": 57550 + }, + { + "epoch": 8.572386058981234, + "grad_norm": 32.12520217895508, + "learning_rate": 3.527896173493596e-05, + "loss": 0.2176, + "num_input_tokens_seen": 33388720, + "step": 57555 + }, + { + "epoch": 8.573130771522193, + "grad_norm": 0.011294298805296421, + "learning_rate": 3.527599959287838e-05, + "loss": 0.001, + "num_input_tokens_seen": 33391632, + "step": 57560 + }, + { + "epoch": 8.573875484063151, + "grad_norm": 0.02248198911547661, + "learning_rate": 3.5273037277218224e-05, + "loss": 0.0005, + "num_input_tokens_seen": 33394384, + "step": 57565 + }, + { + "epoch": 8.57462019660411, + "grad_norm": 0.19568520784378052, + "learning_rate": 3.527007478800555e-05, + "loss": 0.0389, + "num_input_tokens_seen": 33397104, + "step": 57570 + }, + { + "epoch": 8.57536490914507, + "grad_norm": 41.781532287597656, + "learning_rate": 3.5267112125290396e-05, + "loss": 0.3645, + "num_input_tokens_seen": 33400272, + "step": 57575 + }, + { + "epoch": 8.57610962168603, + "grad_norm": 0.04005027189850807, + "learning_rate": 3.5264149289122825e-05, + "loss": 0.1261, + "num_input_tokens_seen": 33403344, + "step": 57580 + }, + { + "epoch": 8.576854334226988, + "grad_norm": 2.0081939697265625, + "learning_rate": 3.526118627955288e-05, + "loss": 0.204, + "num_input_tokens_seen": 33406160, + "step": 57585 + }, + { + "epoch": 8.577599046767947, + "grad_norm": 11.745515823364258, + "learning_rate": 3.525822309663061e-05, + "loss": 0.0957, + "num_input_tokens_seen": 33409072, + "step": 57590 + }, + { + "epoch": 8.578343759308908, + "grad_norm": 0.6001600623130798, + "learning_rate": 3.5255259740406104e-05, + "loss": 0.0103, + "num_input_tokens_seen": 33411888, + "step": 57595 + }, + { + "epoch": 8.579088471849866, + "grad_norm": 0.9026648998260498, + "learning_rate": 3.52522962109294e-05, + "loss": 0.0055, + "num_input_tokens_seen": 33414736, + "step": 57600 + }, + { + "epoch": 8.579833184390825, + "grad_norm": 0.18230490386486053, + "learning_rate": 3.5249332508250576e-05, + "loss": 0.1789, + "num_input_tokens_seen": 33417552, + "step": 57605 + }, + { + "epoch": 8.580577896931784, + "grad_norm": 0.003998921252787113, + "learning_rate": 3.524636863241969e-05, + "loss": 0.001, + "num_input_tokens_seen": 33420208, + "step": 57610 + }, + { + "epoch": 8.581322609472743, + "grad_norm": 0.10364067554473877, + "learning_rate": 3.5243404583486824e-05, + "loss": 0.0347, + "num_input_tokens_seen": 33422896, + "step": 57615 + }, + { + "epoch": 8.582067322013703, + "grad_norm": 122.76920318603516, + "learning_rate": 3.5240440361502046e-05, + "loss": 0.4336, + "num_input_tokens_seen": 33425968, + "step": 57620 + }, + { + "epoch": 8.582812034554662, + "grad_norm": 23.999879837036133, + "learning_rate": 3.523747596651544e-05, + "loss": 0.1767, + "num_input_tokens_seen": 33429040, + "step": 57625 + }, + { + "epoch": 8.58355674709562, + "grad_norm": 0.019255172461271286, + "learning_rate": 3.523451139857708e-05, + "loss": 0.0584, + "num_input_tokens_seen": 33431952, + "step": 57630 + }, + { + "epoch": 8.584301459636581, + "grad_norm": 0.0923546701669693, + "learning_rate": 3.5231546657737044e-05, + "loss": 0.0268, + "num_input_tokens_seen": 33434960, + "step": 57635 + }, + { + "epoch": 8.58504617217754, + "grad_norm": 0.25401371717453003, + "learning_rate": 3.522858174404544e-05, + "loss": 0.4032, + "num_input_tokens_seen": 33438064, + "step": 57640 + }, + { + "epoch": 8.585790884718499, + "grad_norm": 0.01227885577827692, + "learning_rate": 3.522561665755234e-05, + "loss": 0.4314, + "num_input_tokens_seen": 33440848, + "step": 57645 + }, + { + "epoch": 8.586535597259457, + "grad_norm": 0.05494220554828644, + "learning_rate": 3.5222651398307835e-05, + "loss": 0.0875, + "num_input_tokens_seen": 33443696, + "step": 57650 + }, + { + "epoch": 8.587280309800416, + "grad_norm": 0.2841570973396301, + "learning_rate": 3.5219685966362024e-05, + "loss": 0.1241, + "num_input_tokens_seen": 33446288, + "step": 57655 + }, + { + "epoch": 8.588025022341377, + "grad_norm": 1.4426368474960327, + "learning_rate": 3.521672036176501e-05, + "loss": 0.1349, + "num_input_tokens_seen": 33449072, + "step": 57660 + }, + { + "epoch": 8.588769734882336, + "grad_norm": 0.15335537493228912, + "learning_rate": 3.5213754584566886e-05, + "loss": 0.0021, + "num_input_tokens_seen": 33451760, + "step": 57665 + }, + { + "epoch": 8.589514447423294, + "grad_norm": 0.10825754702091217, + "learning_rate": 3.521078863481776e-05, + "loss": 0.1911, + "num_input_tokens_seen": 33454576, + "step": 57670 + }, + { + "epoch": 8.590259159964253, + "grad_norm": 35.8045654296875, + "learning_rate": 3.5207822512567736e-05, + "loss": 0.0699, + "num_input_tokens_seen": 33457744, + "step": 57675 + }, + { + "epoch": 8.591003872505214, + "grad_norm": 0.04118184745311737, + "learning_rate": 3.520485621786693e-05, + "loss": 0.0386, + "num_input_tokens_seen": 33460816, + "step": 57680 + }, + { + "epoch": 8.591748585046172, + "grad_norm": 0.04579984024167061, + "learning_rate": 3.5201889750765446e-05, + "loss": 0.0003, + "num_input_tokens_seen": 33463568, + "step": 57685 + }, + { + "epoch": 8.592493297587131, + "grad_norm": 42.54097366333008, + "learning_rate": 3.51989231113134e-05, + "loss": 0.1201, + "num_input_tokens_seen": 33466672, + "step": 57690 + }, + { + "epoch": 8.59323801012809, + "grad_norm": 0.23457376658916473, + "learning_rate": 3.519595629956092e-05, + "loss": 0.0004, + "num_input_tokens_seen": 33469712, + "step": 57695 + }, + { + "epoch": 8.59398272266905, + "grad_norm": 17.964174270629883, + "learning_rate": 3.519298931555812e-05, + "loss": 0.2016, + "num_input_tokens_seen": 33472464, + "step": 57700 + }, + { + "epoch": 8.59472743521001, + "grad_norm": 0.021599648520350456, + "learning_rate": 3.519002215935512e-05, + "loss": 0.0006, + "num_input_tokens_seen": 33475184, + "step": 57705 + }, + { + "epoch": 8.595472147750968, + "grad_norm": 32.24333572387695, + "learning_rate": 3.5187054831002064e-05, + "loss": 0.1164, + "num_input_tokens_seen": 33478256, + "step": 57710 + }, + { + "epoch": 8.596216860291927, + "grad_norm": 0.35517409443855286, + "learning_rate": 3.5184087330549056e-05, + "loss": 0.1201, + "num_input_tokens_seen": 33481104, + "step": 57715 + }, + { + "epoch": 8.596961572832887, + "grad_norm": 0.20670905709266663, + "learning_rate": 3.518111965804625e-05, + "loss": 0.1259, + "num_input_tokens_seen": 33483856, + "step": 57720 + }, + { + "epoch": 8.597706285373846, + "grad_norm": 0.02011966146528721, + "learning_rate": 3.517815181354378e-05, + "loss": 0.4857, + "num_input_tokens_seen": 33486672, + "step": 57725 + }, + { + "epoch": 8.598450997914805, + "grad_norm": 0.3026367127895355, + "learning_rate": 3.517518379709177e-05, + "loss": 0.1415, + "num_input_tokens_seen": 33489328, + "step": 57730 + }, + { + "epoch": 8.599195710455763, + "grad_norm": 20.793222427368164, + "learning_rate": 3.5172215608740376e-05, + "loss": 0.1016, + "num_input_tokens_seen": 33492048, + "step": 57735 + }, + { + "epoch": 8.599940422996724, + "grad_norm": 10.676593780517578, + "learning_rate": 3.516924724853974e-05, + "loss": 0.5172, + "num_input_tokens_seen": 33494640, + "step": 57740 + }, + { + "epoch": 8.600685135537683, + "grad_norm": 0.05926933512091637, + "learning_rate": 3.5166278716540016e-05, + "loss": 0.1419, + "num_input_tokens_seen": 33497296, + "step": 57745 + }, + { + "epoch": 8.601429848078642, + "grad_norm": 74.7957534790039, + "learning_rate": 3.5163310012791326e-05, + "loss": 0.3316, + "num_input_tokens_seen": 33499984, + "step": 57750 + }, + { + "epoch": 8.6021745606196, + "grad_norm": 0.028790438547730446, + "learning_rate": 3.516034113734385e-05, + "loss": 0.1729, + "num_input_tokens_seen": 33502736, + "step": 57755 + }, + { + "epoch": 8.60291927316056, + "grad_norm": 0.08171212673187256, + "learning_rate": 3.515737209024774e-05, + "loss": 0.0151, + "num_input_tokens_seen": 33505712, + "step": 57760 + }, + { + "epoch": 8.60366398570152, + "grad_norm": 69.46923828125, + "learning_rate": 3.515440287155315e-05, + "loss": 0.5473, + "num_input_tokens_seen": 33508592, + "step": 57765 + }, + { + "epoch": 8.604408698242478, + "grad_norm": 4.0572333335876465, + "learning_rate": 3.515143348131025e-05, + "loss": 0.1839, + "num_input_tokens_seen": 33511504, + "step": 57770 + }, + { + "epoch": 8.605153410783437, + "grad_norm": 0.032877977937459946, + "learning_rate": 3.514846391956919e-05, + "loss": 0.0019, + "num_input_tokens_seen": 33514288, + "step": 57775 + }, + { + "epoch": 8.605898123324398, + "grad_norm": 0.008386652916669846, + "learning_rate": 3.514549418638015e-05, + "loss": 0.0798, + "num_input_tokens_seen": 33517104, + "step": 57780 + }, + { + "epoch": 8.606642835865356, + "grad_norm": 0.03102223016321659, + "learning_rate": 3.5142524281793296e-05, + "loss": 0.1475, + "num_input_tokens_seen": 33520112, + "step": 57785 + }, + { + "epoch": 8.607387548406315, + "grad_norm": 0.05115342140197754, + "learning_rate": 3.513955420585881e-05, + "loss": 0.2717, + "num_input_tokens_seen": 33523056, + "step": 57790 + }, + { + "epoch": 8.608132260947274, + "grad_norm": 49.72142028808594, + "learning_rate": 3.513658395862685e-05, + "loss": 0.1071, + "num_input_tokens_seen": 33525904, + "step": 57795 + }, + { + "epoch": 8.608876973488233, + "grad_norm": 0.024765433743596077, + "learning_rate": 3.5133613540147605e-05, + "loss": 0.1958, + "num_input_tokens_seen": 33528688, + "step": 57800 + }, + { + "epoch": 8.609621686029193, + "grad_norm": 6.769944190979004, + "learning_rate": 3.513064295047127e-05, + "loss": 0.1188, + "num_input_tokens_seen": 33531472, + "step": 57805 + }, + { + "epoch": 8.610366398570152, + "grad_norm": 0.31750595569610596, + "learning_rate": 3.5127672189648016e-05, + "loss": 0.1331, + "num_input_tokens_seen": 33534256, + "step": 57810 + }, + { + "epoch": 8.61111111111111, + "grad_norm": 0.02350965142250061, + "learning_rate": 3.512470125772803e-05, + "loss": 0.1727, + "num_input_tokens_seen": 33536912, + "step": 57815 + }, + { + "epoch": 8.61185582365207, + "grad_norm": 19.822004318237305, + "learning_rate": 3.5121730154761507e-05, + "loss": 0.4435, + "num_input_tokens_seen": 33539920, + "step": 57820 + }, + { + "epoch": 8.61260053619303, + "grad_norm": 0.11410978436470032, + "learning_rate": 3.511875888079864e-05, + "loss": 0.239, + "num_input_tokens_seen": 33542768, + "step": 57825 + }, + { + "epoch": 8.613345248733989, + "grad_norm": 0.03448248654603958, + "learning_rate": 3.511578743588963e-05, + "loss": 0.0018, + "num_input_tokens_seen": 33545872, + "step": 57830 + }, + { + "epoch": 8.614089961274948, + "grad_norm": 0.0034366243053227663, + "learning_rate": 3.511281582008466e-05, + "loss": 0.0457, + "num_input_tokens_seen": 33548816, + "step": 57835 + }, + { + "epoch": 8.614834673815906, + "grad_norm": 0.06678255647420883, + "learning_rate": 3.5109844033433963e-05, + "loss": 0.0033, + "num_input_tokens_seen": 33551632, + "step": 57840 + }, + { + "epoch": 8.615579386356867, + "grad_norm": 0.04756155610084534, + "learning_rate": 3.5106872075987716e-05, + "loss": 0.0096, + "num_input_tokens_seen": 33554576, + "step": 57845 + }, + { + "epoch": 8.616324098897826, + "grad_norm": 0.007199425715953112, + "learning_rate": 3.5103899947796134e-05, + "loss": 0.0006, + "num_input_tokens_seen": 33557296, + "step": 57850 + }, + { + "epoch": 8.617068811438784, + "grad_norm": 0.017491746693849564, + "learning_rate": 3.510092764890944e-05, + "loss": 0.0815, + "num_input_tokens_seen": 33560272, + "step": 57855 + }, + { + "epoch": 8.617813523979743, + "grad_norm": 0.25981783866882324, + "learning_rate": 3.509795517937784e-05, + "loss": 0.0154, + "num_input_tokens_seen": 33563216, + "step": 57860 + }, + { + "epoch": 8.618558236520704, + "grad_norm": 0.0075636813417077065, + "learning_rate": 3.5094982539251545e-05, + "loss": 0.1564, + "num_input_tokens_seen": 33566032, + "step": 57865 + }, + { + "epoch": 8.619302949061662, + "grad_norm": 0.03581232950091362, + "learning_rate": 3.5092009728580784e-05, + "loss": 0.1846, + "num_input_tokens_seen": 33568976, + "step": 57870 + }, + { + "epoch": 8.620047661602621, + "grad_norm": 0.08979582786560059, + "learning_rate": 3.5089036747415775e-05, + "loss": 0.1299, + "num_input_tokens_seen": 33571696, + "step": 57875 + }, + { + "epoch": 8.62079237414358, + "grad_norm": 54.480255126953125, + "learning_rate": 3.508606359580674e-05, + "loss": 0.1653, + "num_input_tokens_seen": 33574768, + "step": 57880 + }, + { + "epoch": 8.62153708668454, + "grad_norm": 10.292516708374023, + "learning_rate": 3.508309027380392e-05, + "loss": 0.2235, + "num_input_tokens_seen": 33578096, + "step": 57885 + }, + { + "epoch": 8.6222817992255, + "grad_norm": 0.48227617144584656, + "learning_rate": 3.508011678145752e-05, + "loss": 0.0006, + "num_input_tokens_seen": 33580688, + "step": 57890 + }, + { + "epoch": 8.623026511766458, + "grad_norm": 0.21826745569705963, + "learning_rate": 3.5077143118817805e-05, + "loss": 0.1134, + "num_input_tokens_seen": 33583984, + "step": 57895 + }, + { + "epoch": 8.623771224307417, + "grad_norm": 0.053306907415390015, + "learning_rate": 3.5074169285935e-05, + "loss": 0.1866, + "num_input_tokens_seen": 33586864, + "step": 57900 + }, + { + "epoch": 8.624515936848377, + "grad_norm": 15.501823425292969, + "learning_rate": 3.5071195282859345e-05, + "loss": 0.44, + "num_input_tokens_seen": 33590032, + "step": 57905 + }, + { + "epoch": 8.625260649389336, + "grad_norm": 0.05522165074944496, + "learning_rate": 3.506822110964108e-05, + "loss": 0.0102, + "num_input_tokens_seen": 33593232, + "step": 57910 + }, + { + "epoch": 8.626005361930295, + "grad_norm": 3.011568307876587, + "learning_rate": 3.506524676633045e-05, + "loss": 0.1121, + "num_input_tokens_seen": 33595792, + "step": 57915 + }, + { + "epoch": 8.626750074471254, + "grad_norm": 33.57862854003906, + "learning_rate": 3.50622722529777e-05, + "loss": 0.1099, + "num_input_tokens_seen": 33598512, + "step": 57920 + }, + { + "epoch": 8.627494787012214, + "grad_norm": 0.06629599630832672, + "learning_rate": 3.5059297569633096e-05, + "loss": 0.0027, + "num_input_tokens_seen": 33601232, + "step": 57925 + }, + { + "epoch": 8.628239499553173, + "grad_norm": 57.328521728515625, + "learning_rate": 3.505632271634688e-05, + "loss": 0.2003, + "num_input_tokens_seen": 33604048, + "step": 57930 + }, + { + "epoch": 8.628984212094132, + "grad_norm": 1.119521141052246, + "learning_rate": 3.505334769316931e-05, + "loss": 0.1931, + "num_input_tokens_seen": 33607184, + "step": 57935 + }, + { + "epoch": 8.62972892463509, + "grad_norm": 0.0048863948322832584, + "learning_rate": 3.505037250015066e-05, + "loss": 0.1095, + "num_input_tokens_seen": 33609680, + "step": 57940 + }, + { + "epoch": 8.63047363717605, + "grad_norm": 30.93095588684082, + "learning_rate": 3.504739713734118e-05, + "loss": 0.0559, + "num_input_tokens_seen": 33612560, + "step": 57945 + }, + { + "epoch": 8.63121834971701, + "grad_norm": 0.39229726791381836, + "learning_rate": 3.504442160479112e-05, + "loss": 0.0617, + "num_input_tokens_seen": 33615728, + "step": 57950 + }, + { + "epoch": 8.631963062257968, + "grad_norm": 61.751705169677734, + "learning_rate": 3.5041445902550776e-05, + "loss": 0.1443, + "num_input_tokens_seen": 33618416, + "step": 57955 + }, + { + "epoch": 8.632707774798927, + "grad_norm": 1.1043715476989746, + "learning_rate": 3.503847003067041e-05, + "loss": 0.2659, + "num_input_tokens_seen": 33621232, + "step": 57960 + }, + { + "epoch": 8.633452487339888, + "grad_norm": 0.02239840105175972, + "learning_rate": 3.503549398920029e-05, + "loss": 0.0388, + "num_input_tokens_seen": 33624592, + "step": 57965 + }, + { + "epoch": 8.634197199880846, + "grad_norm": 0.3151904046535492, + "learning_rate": 3.503251777819071e-05, + "loss": 0.2444, + "num_input_tokens_seen": 33627696, + "step": 57970 + }, + { + "epoch": 8.634941912421805, + "grad_norm": 3.5638375282287598, + "learning_rate": 3.502954139769193e-05, + "loss": 0.0872, + "num_input_tokens_seen": 33630544, + "step": 57975 + }, + { + "epoch": 8.635686624962764, + "grad_norm": 13.909250259399414, + "learning_rate": 3.502656484775424e-05, + "loss": 0.0941, + "num_input_tokens_seen": 33633424, + "step": 57980 + }, + { + "epoch": 8.636431337503723, + "grad_norm": 83.5705337524414, + "learning_rate": 3.502358812842794e-05, + "loss": 0.1854, + "num_input_tokens_seen": 33636176, + "step": 57985 + }, + { + "epoch": 8.637176050044683, + "grad_norm": 60.02667999267578, + "learning_rate": 3.502061123976329e-05, + "loss": 0.0746, + "num_input_tokens_seen": 33639120, + "step": 57990 + }, + { + "epoch": 8.637920762585642, + "grad_norm": 0.47112801671028137, + "learning_rate": 3.50176341818106e-05, + "loss": 0.0712, + "num_input_tokens_seen": 33641872, + "step": 57995 + }, + { + "epoch": 8.6386654751266, + "grad_norm": 0.2558596432209015, + "learning_rate": 3.5014656954620174e-05, + "loss": 0.0752, + "num_input_tokens_seen": 33644816, + "step": 58000 + }, + { + "epoch": 8.63941018766756, + "grad_norm": 1.0861164331436157, + "learning_rate": 3.5011679558242286e-05, + "loss": 0.002, + "num_input_tokens_seen": 33647664, + "step": 58005 + }, + { + "epoch": 8.64015490020852, + "grad_norm": 0.1937018185853958, + "learning_rate": 3.5008701992727254e-05, + "loss": 0.0024, + "num_input_tokens_seen": 33650288, + "step": 58010 + }, + { + "epoch": 8.640899612749479, + "grad_norm": 25.89214324951172, + "learning_rate": 3.500572425812537e-05, + "loss": 0.1041, + "num_input_tokens_seen": 33653552, + "step": 58015 + }, + { + "epoch": 8.641644325290438, + "grad_norm": 0.020211966708302498, + "learning_rate": 3.500274635448694e-05, + "loss": 0.125, + "num_input_tokens_seen": 33656336, + "step": 58020 + }, + { + "epoch": 8.642389037831396, + "grad_norm": 0.01256625447422266, + "learning_rate": 3.499976828186229e-05, + "loss": 0.0012, + "num_input_tokens_seen": 33659376, + "step": 58025 + }, + { + "epoch": 8.643133750372357, + "grad_norm": 0.08746315538883209, + "learning_rate": 3.499679004030171e-05, + "loss": 0.0014, + "num_input_tokens_seen": 33662320, + "step": 58030 + }, + { + "epoch": 8.643878462913316, + "grad_norm": 115.26488494873047, + "learning_rate": 3.499381162985552e-05, + "loss": 0.166, + "num_input_tokens_seen": 33665200, + "step": 58035 + }, + { + "epoch": 8.644623175454274, + "grad_norm": 19.22502326965332, + "learning_rate": 3.499083305057405e-05, + "loss": 0.0426, + "num_input_tokens_seen": 33668272, + "step": 58040 + }, + { + "epoch": 8.645367887995233, + "grad_norm": 0.01698676496744156, + "learning_rate": 3.49878543025076e-05, + "loss": 0.0009, + "num_input_tokens_seen": 33671088, + "step": 58045 + }, + { + "epoch": 8.646112600536194, + "grad_norm": 16.963172912597656, + "learning_rate": 3.49848753857065e-05, + "loss": 0.0472, + "num_input_tokens_seen": 33673936, + "step": 58050 + }, + { + "epoch": 8.646857313077152, + "grad_norm": 0.011475466191768646, + "learning_rate": 3.4981896300221084e-05, + "loss": 0.0078, + "num_input_tokens_seen": 33676784, + "step": 58055 + }, + { + "epoch": 8.647602025618111, + "grad_norm": 59.62049102783203, + "learning_rate": 3.497891704610167e-05, + "loss": 0.2382, + "num_input_tokens_seen": 33679632, + "step": 58060 + }, + { + "epoch": 8.64834673815907, + "grad_norm": 0.0009783790446817875, + "learning_rate": 3.49759376233986e-05, + "loss": 0.0727, + "num_input_tokens_seen": 33682448, + "step": 58065 + }, + { + "epoch": 8.64909145070003, + "grad_norm": 0.00040430683293379843, + "learning_rate": 3.4972958032162204e-05, + "loss": 0.1737, + "num_input_tokens_seen": 33685328, + "step": 58070 + }, + { + "epoch": 8.64983616324099, + "grad_norm": 0.002432631328701973, + "learning_rate": 3.496997827244282e-05, + "loss": 0.1245, + "num_input_tokens_seen": 33688048, + "step": 58075 + }, + { + "epoch": 8.650580875781948, + "grad_norm": 0.012255042791366577, + "learning_rate": 3.496699834429078e-05, + "loss": 0.1043, + "num_input_tokens_seen": 33690800, + "step": 58080 + }, + { + "epoch": 8.651325588322907, + "grad_norm": 0.21761900186538696, + "learning_rate": 3.4964018247756434e-05, + "loss": 0.1474, + "num_input_tokens_seen": 33693648, + "step": 58085 + }, + { + "epoch": 8.652070300863867, + "grad_norm": 109.5237045288086, + "learning_rate": 3.4961037982890135e-05, + "loss": 1.0371, + "num_input_tokens_seen": 33696368, + "step": 58090 + }, + { + "epoch": 8.652815013404826, + "grad_norm": 0.00436414172872901, + "learning_rate": 3.495805754974221e-05, + "loss": 0.0622, + "num_input_tokens_seen": 33699056, + "step": 58095 + }, + { + "epoch": 8.653559725945785, + "grad_norm": 0.0052278907969594, + "learning_rate": 3.495507694836304e-05, + "loss": 0.0376, + "num_input_tokens_seen": 33702096, + "step": 58100 + }, + { + "epoch": 8.654304438486744, + "grad_norm": 70.08821868896484, + "learning_rate": 3.4952096178802946e-05, + "loss": 0.3961, + "num_input_tokens_seen": 33704784, + "step": 58105 + }, + { + "epoch": 8.655049151027704, + "grad_norm": 47.12893295288086, + "learning_rate": 3.4949115241112314e-05, + "loss": 0.3332, + "num_input_tokens_seen": 33707472, + "step": 58110 + }, + { + "epoch": 8.655793863568663, + "grad_norm": 18.11106300354004, + "learning_rate": 3.4946134135341486e-05, + "loss": 0.0059, + "num_input_tokens_seen": 33710320, + "step": 58115 + }, + { + "epoch": 8.656538576109622, + "grad_norm": 13.50656795501709, + "learning_rate": 3.494315286154083e-05, + "loss": 0.0149, + "num_input_tokens_seen": 33713072, + "step": 58120 + }, + { + "epoch": 8.65728328865058, + "grad_norm": 0.03658755123615265, + "learning_rate": 3.494017141976071e-05, + "loss": 0.2789, + "num_input_tokens_seen": 33716016, + "step": 58125 + }, + { + "epoch": 8.65802800119154, + "grad_norm": 3.867737054824829, + "learning_rate": 3.4937189810051494e-05, + "loss": 0.1672, + "num_input_tokens_seen": 33718832, + "step": 58130 + }, + { + "epoch": 8.6587727137325, + "grad_norm": 0.007353529799729586, + "learning_rate": 3.4934208032463565e-05, + "loss": 0.3353, + "num_input_tokens_seen": 33722096, + "step": 58135 + }, + { + "epoch": 8.659517426273458, + "grad_norm": 0.11213986575603485, + "learning_rate": 3.4931226087047285e-05, + "loss": 0.0018, + "num_input_tokens_seen": 33724816, + "step": 58140 + }, + { + "epoch": 8.660262138814417, + "grad_norm": 1.3084765672683716, + "learning_rate": 3.4928243973853044e-05, + "loss": 0.0032, + "num_input_tokens_seen": 33727888, + "step": 58145 + }, + { + "epoch": 8.661006851355378, + "grad_norm": 0.003408942138776183, + "learning_rate": 3.49252616929312e-05, + "loss": 0.157, + "num_input_tokens_seen": 33730576, + "step": 58150 + }, + { + "epoch": 8.661751563896336, + "grad_norm": 0.004435614682734013, + "learning_rate": 3.492227924433215e-05, + "loss": 0.386, + "num_input_tokens_seen": 33733200, + "step": 58155 + }, + { + "epoch": 8.662496276437295, + "grad_norm": 0.012858672067523003, + "learning_rate": 3.491929662810627e-05, + "loss": 0.0652, + "num_input_tokens_seen": 33736016, + "step": 58160 + }, + { + "epoch": 8.663240988978254, + "grad_norm": 0.055551256984472275, + "learning_rate": 3.491631384430396e-05, + "loss": 0.0745, + "num_input_tokens_seen": 33738704, + "step": 58165 + }, + { + "epoch": 8.663985701519213, + "grad_norm": 21.30733871459961, + "learning_rate": 3.4913330892975606e-05, + "loss": 0.1568, + "num_input_tokens_seen": 33741488, + "step": 58170 + }, + { + "epoch": 8.664730414060173, + "grad_norm": 15.23971939086914, + "learning_rate": 3.4910347774171606e-05, + "loss": 0.3965, + "num_input_tokens_seen": 33744272, + "step": 58175 + }, + { + "epoch": 8.665475126601132, + "grad_norm": 0.010313916951417923, + "learning_rate": 3.490736448794235e-05, + "loss": 0.2238, + "num_input_tokens_seen": 33746864, + "step": 58180 + }, + { + "epoch": 8.66621983914209, + "grad_norm": 16.335186004638672, + "learning_rate": 3.490438103433824e-05, + "loss": 0.004, + "num_input_tokens_seen": 33750000, + "step": 58185 + }, + { + "epoch": 8.66696455168305, + "grad_norm": 0.025712033733725548, + "learning_rate": 3.490139741340967e-05, + "loss": 0.0009, + "num_input_tokens_seen": 33753392, + "step": 58190 + }, + { + "epoch": 8.66770926422401, + "grad_norm": 0.15078182518482208, + "learning_rate": 3.4898413625207067e-05, + "loss": 0.0004, + "num_input_tokens_seen": 33756144, + "step": 58195 + }, + { + "epoch": 8.668453976764969, + "grad_norm": 0.08116776496171951, + "learning_rate": 3.4895429669780824e-05, + "loss": 0.2638, + "num_input_tokens_seen": 33758864, + "step": 58200 + }, + { + "epoch": 8.669198689305928, + "grad_norm": 0.31000766158103943, + "learning_rate": 3.4892445547181354e-05, + "loss": 0.001, + "num_input_tokens_seen": 33761648, + "step": 58205 + }, + { + "epoch": 8.669943401846886, + "grad_norm": 0.5073733329772949, + "learning_rate": 3.4889461257459065e-05, + "loss": 0.0364, + "num_input_tokens_seen": 33764304, + "step": 58210 + }, + { + "epoch": 8.670688114387847, + "grad_norm": 14.759803771972656, + "learning_rate": 3.488647680066438e-05, + "loss": 0.0106, + "num_input_tokens_seen": 33767280, + "step": 58215 + }, + { + "epoch": 8.671432826928806, + "grad_norm": 76.94149780273438, + "learning_rate": 3.4883492176847724e-05, + "loss": 0.1449, + "num_input_tokens_seen": 33770192, + "step": 58220 + }, + { + "epoch": 8.672177539469764, + "grad_norm": 0.016403667628765106, + "learning_rate": 3.488050738605951e-05, + "loss": 0.1236, + "num_input_tokens_seen": 33772912, + "step": 58225 + }, + { + "epoch": 8.672922252010723, + "grad_norm": 40.33180236816406, + "learning_rate": 3.4877522428350165e-05, + "loss": 0.1849, + "num_input_tokens_seen": 33775856, + "step": 58230 + }, + { + "epoch": 8.673666964551684, + "grad_norm": 0.01628287509083748, + "learning_rate": 3.487453730377011e-05, + "loss": 0.0009, + "num_input_tokens_seen": 33778928, + "step": 58235 + }, + { + "epoch": 8.674411677092642, + "grad_norm": 12.485048294067383, + "learning_rate": 3.4871552012369793e-05, + "loss": 0.0024, + "num_input_tokens_seen": 33781680, + "step": 58240 + }, + { + "epoch": 8.675156389633601, + "grad_norm": 0.014812168665230274, + "learning_rate": 3.486856655419964e-05, + "loss": 0.0422, + "num_input_tokens_seen": 33784528, + "step": 58245 + }, + { + "epoch": 8.67590110217456, + "grad_norm": 0.011829402297735214, + "learning_rate": 3.4865580929310074e-05, + "loss": 0.0003, + "num_input_tokens_seen": 33787408, + "step": 58250 + }, + { + "epoch": 8.67664581471552, + "grad_norm": 0.1940479725599289, + "learning_rate": 3.486259513775155e-05, + "loss": 0.179, + "num_input_tokens_seen": 33790352, + "step": 58255 + }, + { + "epoch": 8.67739052725648, + "grad_norm": 0.19124248623847961, + "learning_rate": 3.485960917957451e-05, + "loss": 0.0213, + "num_input_tokens_seen": 33793168, + "step": 58260 + }, + { + "epoch": 8.678135239797438, + "grad_norm": 72.21410369873047, + "learning_rate": 3.4856623054829395e-05, + "loss": 0.0356, + "num_input_tokens_seen": 33795952, + "step": 58265 + }, + { + "epoch": 8.678879952338397, + "grad_norm": 0.012679223902523518, + "learning_rate": 3.4853636763566646e-05, + "loss": 0.4086, + "num_input_tokens_seen": 33798704, + "step": 58270 + }, + { + "epoch": 8.679624664879357, + "grad_norm": 33.373046875, + "learning_rate": 3.485065030583672e-05, + "loss": 0.1009, + "num_input_tokens_seen": 33801616, + "step": 58275 + }, + { + "epoch": 8.680369377420316, + "grad_norm": 0.03552264720201492, + "learning_rate": 3.484766368169007e-05, + "loss": 0.003, + "num_input_tokens_seen": 33804752, + "step": 58280 + }, + { + "epoch": 8.681114089961275, + "grad_norm": 0.024880046024918556, + "learning_rate": 3.484467689117715e-05, + "loss": 0.0002, + "num_input_tokens_seen": 33807856, + "step": 58285 + }, + { + "epoch": 8.681858802502234, + "grad_norm": 0.007851139642298222, + "learning_rate": 3.4841689934348416e-05, + "loss": 0.1224, + "num_input_tokens_seen": 33810480, + "step": 58290 + }, + { + "epoch": 8.682603515043194, + "grad_norm": 16.132230758666992, + "learning_rate": 3.483870281125433e-05, + "loss": 0.2218, + "num_input_tokens_seen": 33813360, + "step": 58295 + }, + { + "epoch": 8.683348227584153, + "grad_norm": 0.004890208598226309, + "learning_rate": 3.483571552194537e-05, + "loss": 0.1446, + "num_input_tokens_seen": 33816304, + "step": 58300 + }, + { + "epoch": 8.684092940125112, + "grad_norm": 0.038722485303878784, + "learning_rate": 3.4832728066471994e-05, + "loss": 0.229, + "num_input_tokens_seen": 33819312, + "step": 58305 + }, + { + "epoch": 8.68483765266607, + "grad_norm": 21.23599624633789, + "learning_rate": 3.482974044488466e-05, + "loss": 0.1464, + "num_input_tokens_seen": 33822352, + "step": 58310 + }, + { + "epoch": 8.68558236520703, + "grad_norm": 0.01539632212370634, + "learning_rate": 3.4826752657233855e-05, + "loss": 0.0003, + "num_input_tokens_seen": 33825104, + "step": 58315 + }, + { + "epoch": 8.68632707774799, + "grad_norm": 77.27063751220703, + "learning_rate": 3.4823764703570054e-05, + "loss": 0.2765, + "num_input_tokens_seen": 33827856, + "step": 58320 + }, + { + "epoch": 8.687071790288948, + "grad_norm": 0.08410388231277466, + "learning_rate": 3.482077658394373e-05, + "loss": 0.0009, + "num_input_tokens_seen": 33830640, + "step": 58325 + }, + { + "epoch": 8.687816502829907, + "grad_norm": 0.188078835606575, + "learning_rate": 3.481778829840537e-05, + "loss": 0.2706, + "num_input_tokens_seen": 33833520, + "step": 58330 + }, + { + "epoch": 8.688561215370868, + "grad_norm": 0.0060294317081570625, + "learning_rate": 3.481479984700546e-05, + "loss": 0.0004, + "num_input_tokens_seen": 33836496, + "step": 58335 + }, + { + "epoch": 8.689305927911827, + "grad_norm": 0.6411148309707642, + "learning_rate": 3.481181122979447e-05, + "loss": 0.2492, + "num_input_tokens_seen": 33839504, + "step": 58340 + }, + { + "epoch": 8.690050640452785, + "grad_norm": 0.023183438926935196, + "learning_rate": 3.480882244682291e-05, + "loss": 0.129, + "num_input_tokens_seen": 33842384, + "step": 58345 + }, + { + "epoch": 8.690795352993744, + "grad_norm": 0.059129927307367325, + "learning_rate": 3.480583349814126e-05, + "loss": 0.0314, + "num_input_tokens_seen": 33845168, + "step": 58350 + }, + { + "epoch": 8.691540065534703, + "grad_norm": 0.05940757319331169, + "learning_rate": 3.480284438380002e-05, + "loss": 0.1441, + "num_input_tokens_seen": 33848304, + "step": 58355 + }, + { + "epoch": 8.692284778075663, + "grad_norm": 0.02146690897643566, + "learning_rate": 3.479985510384969e-05, + "loss": 0.0019, + "num_input_tokens_seen": 33851088, + "step": 58360 + }, + { + "epoch": 8.693029490616622, + "grad_norm": 0.41053149104118347, + "learning_rate": 3.479686565834077e-05, + "loss": 0.3995, + "num_input_tokens_seen": 33854192, + "step": 58365 + }, + { + "epoch": 8.69377420315758, + "grad_norm": 22.558040618896484, + "learning_rate": 3.479387604732376e-05, + "loss": 0.0803, + "num_input_tokens_seen": 33857392, + "step": 58370 + }, + { + "epoch": 8.69451891569854, + "grad_norm": 117.31671905517578, + "learning_rate": 3.479088627084916e-05, + "loss": 0.4185, + "num_input_tokens_seen": 33859984, + "step": 58375 + }, + { + "epoch": 8.6952636282395, + "grad_norm": 5.869563579559326, + "learning_rate": 3.4787896328967493e-05, + "loss": 0.0579, + "num_input_tokens_seen": 33863056, + "step": 58380 + }, + { + "epoch": 8.696008340780459, + "grad_norm": 0.04596462845802307, + "learning_rate": 3.478490622172926e-05, + "loss": 0.1916, + "num_input_tokens_seen": 33865872, + "step": 58385 + }, + { + "epoch": 8.696753053321418, + "grad_norm": 0.10576048493385315, + "learning_rate": 3.478191594918499e-05, + "loss": 0.0927, + "num_input_tokens_seen": 33868592, + "step": 58390 + }, + { + "epoch": 8.697497765862376, + "grad_norm": 0.018434759229421616, + "learning_rate": 3.477892551138519e-05, + "loss": 0.2668, + "num_input_tokens_seen": 33871696, + "step": 58395 + }, + { + "epoch": 8.698242478403337, + "grad_norm": 31.38631820678711, + "learning_rate": 3.4775934908380386e-05, + "loss": 0.0762, + "num_input_tokens_seen": 33874672, + "step": 58400 + }, + { + "epoch": 8.698987190944296, + "grad_norm": 2.229893922805786, + "learning_rate": 3.4772944140221094e-05, + "loss": 0.121, + "num_input_tokens_seen": 33877552, + "step": 58405 + }, + { + "epoch": 8.699731903485254, + "grad_norm": 11.781135559082031, + "learning_rate": 3.476995320695784e-05, + "loss": 0.0271, + "num_input_tokens_seen": 33880528, + "step": 58410 + }, + { + "epoch": 8.700476616026213, + "grad_norm": 26.87347412109375, + "learning_rate": 3.476696210864116e-05, + "loss": 0.1944, + "num_input_tokens_seen": 33883600, + "step": 58415 + }, + { + "epoch": 8.701221328567174, + "grad_norm": 0.01585618406534195, + "learning_rate": 3.476397084532158e-05, + "loss": 0.2298, + "num_input_tokens_seen": 33886512, + "step": 58420 + }, + { + "epoch": 8.701966041108133, + "grad_norm": 0.06359857320785522, + "learning_rate": 3.476097941704964e-05, + "loss": 0.1201, + "num_input_tokens_seen": 33889456, + "step": 58425 + }, + { + "epoch": 8.702710753649091, + "grad_norm": 0.006507745943963528, + "learning_rate": 3.475798782387587e-05, + "loss": 0.1987, + "num_input_tokens_seen": 33892304, + "step": 58430 + }, + { + "epoch": 8.70345546619005, + "grad_norm": 20.505516052246094, + "learning_rate": 3.475499606585081e-05, + "loss": 0.1221, + "num_input_tokens_seen": 33894960, + "step": 58435 + }, + { + "epoch": 8.70420017873101, + "grad_norm": 0.04330943524837494, + "learning_rate": 3.4752004143025016e-05, + "loss": 0.2567, + "num_input_tokens_seen": 33897808, + "step": 58440 + }, + { + "epoch": 8.70494489127197, + "grad_norm": 0.02185940369963646, + "learning_rate": 3.4749012055449015e-05, + "loss": 0.0108, + "num_input_tokens_seen": 33900624, + "step": 58445 + }, + { + "epoch": 8.705689603812928, + "grad_norm": 0.13831448554992676, + "learning_rate": 3.4746019803173365e-05, + "loss": 0.0333, + "num_input_tokens_seen": 33903472, + "step": 58450 + }, + { + "epoch": 8.706434316353887, + "grad_norm": 0.05423567071557045, + "learning_rate": 3.474302738624862e-05, + "loss": 0.3499, + "num_input_tokens_seen": 33906192, + "step": 58455 + }, + { + "epoch": 8.707179028894847, + "grad_norm": 0.04642316326498985, + "learning_rate": 3.474003480472532e-05, + "loss": 0.0012, + "num_input_tokens_seen": 33908880, + "step": 58460 + }, + { + "epoch": 8.707923741435806, + "grad_norm": 0.41793012619018555, + "learning_rate": 3.473704205865405e-05, + "loss": 0.1102, + "num_input_tokens_seen": 33912432, + "step": 58465 + }, + { + "epoch": 8.708668453976765, + "grad_norm": 0.027299825102090836, + "learning_rate": 3.473404914808534e-05, + "loss": 0.2032, + "num_input_tokens_seen": 33915440, + "step": 58470 + }, + { + "epoch": 8.709413166517724, + "grad_norm": 0.12561985850334167, + "learning_rate": 3.4731056073069754e-05, + "loss": 0.0007, + "num_input_tokens_seen": 33918704, + "step": 58475 + }, + { + "epoch": 8.710157879058684, + "grad_norm": 23.59846305847168, + "learning_rate": 3.472806283365788e-05, + "loss": 0.2237, + "num_input_tokens_seen": 33921584, + "step": 58480 + }, + { + "epoch": 8.710902591599643, + "grad_norm": 0.03350356966257095, + "learning_rate": 3.472506942990026e-05, + "loss": 0.3695, + "num_input_tokens_seen": 33924880, + "step": 58485 + }, + { + "epoch": 8.711647304140602, + "grad_norm": 0.294243186712265, + "learning_rate": 3.472207586184748e-05, + "loss": 0.0361, + "num_input_tokens_seen": 33927760, + "step": 58490 + }, + { + "epoch": 8.71239201668156, + "grad_norm": 19.943052291870117, + "learning_rate": 3.4719082129550106e-05, + "loss": 0.0164, + "num_input_tokens_seen": 33930736, + "step": 58495 + }, + { + "epoch": 8.71313672922252, + "grad_norm": 0.6254858374595642, + "learning_rate": 3.471608823305873e-05, + "loss": 0.001, + "num_input_tokens_seen": 33933680, + "step": 58500 + }, + { + "epoch": 8.71388144176348, + "grad_norm": 0.23537810146808624, + "learning_rate": 3.471309417242391e-05, + "loss": 0.0717, + "num_input_tokens_seen": 33936688, + "step": 58505 + }, + { + "epoch": 8.714626154304439, + "grad_norm": 0.005728176329284906, + "learning_rate": 3.471009994769624e-05, + "loss": 0.2464, + "num_input_tokens_seen": 33939696, + "step": 58510 + }, + { + "epoch": 8.715370866845397, + "grad_norm": 60.498287200927734, + "learning_rate": 3.470710555892629e-05, + "loss": 0.0342, + "num_input_tokens_seen": 33942896, + "step": 58515 + }, + { + "epoch": 8.716115579386356, + "grad_norm": 76.14079284667969, + "learning_rate": 3.470411100616466e-05, + "loss": 0.2148, + "num_input_tokens_seen": 33945936, + "step": 58520 + }, + { + "epoch": 8.716860291927317, + "grad_norm": 0.14650407433509827, + "learning_rate": 3.4701116289461945e-05, + "loss": 0.2107, + "num_input_tokens_seen": 33948912, + "step": 58525 + }, + { + "epoch": 8.717605004468275, + "grad_norm": 0.052925318479537964, + "learning_rate": 3.469812140886872e-05, + "loss": 0.0018, + "num_input_tokens_seen": 33951792, + "step": 58530 + }, + { + "epoch": 8.718349717009234, + "grad_norm": 35.429893493652344, + "learning_rate": 3.4695126364435604e-05, + "loss": 0.3384, + "num_input_tokens_seen": 33955088, + "step": 58535 + }, + { + "epoch": 8.719094429550193, + "grad_norm": 0.0973566398024559, + "learning_rate": 3.4692131156213175e-05, + "loss": 0.0004, + "num_input_tokens_seen": 33958160, + "step": 58540 + }, + { + "epoch": 8.719839142091153, + "grad_norm": 40.90204620361328, + "learning_rate": 3.468913578425203e-05, + "loss": 0.0115, + "num_input_tokens_seen": 33961104, + "step": 58545 + }, + { + "epoch": 8.720583854632112, + "grad_norm": 0.06015336886048317, + "learning_rate": 3.4686140248602804e-05, + "loss": 0.0028, + "num_input_tokens_seen": 33963888, + "step": 58550 + }, + { + "epoch": 8.721328567173071, + "grad_norm": 0.0047347149811685085, + "learning_rate": 3.468314454931607e-05, + "loss": 0.1736, + "num_input_tokens_seen": 33966544, + "step": 58555 + }, + { + "epoch": 8.72207327971403, + "grad_norm": 0.023308057337999344, + "learning_rate": 3.468014868644245e-05, + "loss": 0.0004, + "num_input_tokens_seen": 33969392, + "step": 58560 + }, + { + "epoch": 8.72281799225499, + "grad_norm": 215.2278289794922, + "learning_rate": 3.4677152660032565e-05, + "loss": 0.2073, + "num_input_tokens_seen": 33972112, + "step": 58565 + }, + { + "epoch": 8.723562704795949, + "grad_norm": 0.030448343604803085, + "learning_rate": 3.467415647013702e-05, + "loss": 0.0004, + "num_input_tokens_seen": 33974896, + "step": 58570 + }, + { + "epoch": 8.724307417336908, + "grad_norm": 27.29683494567871, + "learning_rate": 3.467116011680643e-05, + "loss": 0.191, + "num_input_tokens_seen": 33977872, + "step": 58575 + }, + { + "epoch": 8.725052129877866, + "grad_norm": 11.243863105773926, + "learning_rate": 3.4668163600091415e-05, + "loss": 0.0058, + "num_input_tokens_seen": 33980624, + "step": 58580 + }, + { + "epoch": 8.725796842418827, + "grad_norm": 99.50847625732422, + "learning_rate": 3.46651669200426e-05, + "loss": 0.2541, + "num_input_tokens_seen": 33983344, + "step": 58585 + }, + { + "epoch": 8.726541554959786, + "grad_norm": 0.02332666702568531, + "learning_rate": 3.4662170076710624e-05, + "loss": 0.5823, + "num_input_tokens_seen": 33986096, + "step": 58590 + }, + { + "epoch": 8.727286267500745, + "grad_norm": 15.621935844421387, + "learning_rate": 3.46591730701461e-05, + "loss": 0.1852, + "num_input_tokens_seen": 33989200, + "step": 58595 + }, + { + "epoch": 8.728030980041703, + "grad_norm": 14.955465316772461, + "learning_rate": 3.465617590039967e-05, + "loss": 0.2589, + "num_input_tokens_seen": 33992240, + "step": 58600 + }, + { + "epoch": 8.728775692582664, + "grad_norm": 22.756210327148438, + "learning_rate": 3.4653178567521956e-05, + "loss": 0.0757, + "num_input_tokens_seen": 33994896, + "step": 58605 + }, + { + "epoch": 8.729520405123623, + "grad_norm": 0.03345182165503502, + "learning_rate": 3.4650181071563595e-05, + "loss": 0.0689, + "num_input_tokens_seen": 33998000, + "step": 58610 + }, + { + "epoch": 8.730265117664581, + "grad_norm": 0.053819168359041214, + "learning_rate": 3.4647183412575243e-05, + "loss": 0.2256, + "num_input_tokens_seen": 34000944, + "step": 58615 + }, + { + "epoch": 8.73100983020554, + "grad_norm": 20.214975357055664, + "learning_rate": 3.464418559060753e-05, + "loss": 0.1026, + "num_input_tokens_seen": 34003696, + "step": 58620 + }, + { + "epoch": 8.7317545427465, + "grad_norm": 0.020199039950966835, + "learning_rate": 3.464118760571109e-05, + "loss": 0.083, + "num_input_tokens_seen": 34006576, + "step": 58625 + }, + { + "epoch": 8.73249925528746, + "grad_norm": 0.3101099133491516, + "learning_rate": 3.463818945793661e-05, + "loss": 0.0075, + "num_input_tokens_seen": 34009328, + "step": 58630 + }, + { + "epoch": 8.733243967828418, + "grad_norm": 0.009500031359493732, + "learning_rate": 3.46351911473347e-05, + "loss": 0.0004, + "num_input_tokens_seen": 34012304, + "step": 58635 + }, + { + "epoch": 8.733988680369377, + "grad_norm": 116.87918090820312, + "learning_rate": 3.463219267395603e-05, + "loss": 0.1719, + "num_input_tokens_seen": 34015024, + "step": 58640 + }, + { + "epoch": 8.734733392910336, + "grad_norm": 0.016003688797354698, + "learning_rate": 3.4629194037851254e-05, + "loss": 0.0996, + "num_input_tokens_seen": 34017904, + "step": 58645 + }, + { + "epoch": 8.735478105451296, + "grad_norm": 0.010822822339832783, + "learning_rate": 3.462619523907103e-05, + "loss": 0.1948, + "num_input_tokens_seen": 34021072, + "step": 58650 + }, + { + "epoch": 8.736222817992255, + "grad_norm": 16.522369384765625, + "learning_rate": 3.462319627766602e-05, + "loss": 0.1285, + "num_input_tokens_seen": 34023888, + "step": 58655 + }, + { + "epoch": 8.736967530533214, + "grad_norm": 0.005843320395797491, + "learning_rate": 3.462019715368689e-05, + "loss": 0.2586, + "num_input_tokens_seen": 34026928, + "step": 58660 + }, + { + "epoch": 8.737712243074174, + "grad_norm": 28.742198944091797, + "learning_rate": 3.461719786718431e-05, + "loss": 0.296, + "num_input_tokens_seen": 34029744, + "step": 58665 + }, + { + "epoch": 8.738456955615133, + "grad_norm": 0.0036454135552048683, + "learning_rate": 3.461419841820895e-05, + "loss": 0.0004, + "num_input_tokens_seen": 34032656, + "step": 58670 + }, + { + "epoch": 8.739201668156092, + "grad_norm": 109.90374755859375, + "learning_rate": 3.461119880681147e-05, + "loss": 0.1813, + "num_input_tokens_seen": 34035600, + "step": 58675 + }, + { + "epoch": 8.73994638069705, + "grad_norm": 4.008707046508789, + "learning_rate": 3.460819903304256e-05, + "loss": 0.138, + "num_input_tokens_seen": 34038288, + "step": 58680 + }, + { + "epoch": 8.74069109323801, + "grad_norm": 0.05419270694255829, + "learning_rate": 3.460519909695289e-05, + "loss": 0.1829, + "num_input_tokens_seen": 34040880, + "step": 58685 + }, + { + "epoch": 8.74143580577897, + "grad_norm": 13.98358154296875, + "learning_rate": 3.460219899859314e-05, + "loss": 0.4313, + "num_input_tokens_seen": 34044080, + "step": 58690 + }, + { + "epoch": 8.742180518319929, + "grad_norm": 0.023914910852909088, + "learning_rate": 3.459919873801401e-05, + "loss": 0.0446, + "num_input_tokens_seen": 34047024, + "step": 58695 + }, + { + "epoch": 8.742925230860887, + "grad_norm": 0.3127402067184448, + "learning_rate": 3.4596198315266165e-05, + "loss": 0.1764, + "num_input_tokens_seen": 34049968, + "step": 58700 + }, + { + "epoch": 8.743669943401846, + "grad_norm": 0.027480095624923706, + "learning_rate": 3.45931977304003e-05, + "loss": 0.2576, + "num_input_tokens_seen": 34053104, + "step": 58705 + }, + { + "epoch": 8.744414655942807, + "grad_norm": 0.4127323031425476, + "learning_rate": 3.4590196983467114e-05, + "loss": 0.1582, + "num_input_tokens_seen": 34056112, + "step": 58710 + }, + { + "epoch": 8.745159368483765, + "grad_norm": 0.010594488121569157, + "learning_rate": 3.45871960745173e-05, + "loss": 0.1288, + "num_input_tokens_seen": 34058960, + "step": 58715 + }, + { + "epoch": 8.745904081024724, + "grad_norm": 24.157691955566406, + "learning_rate": 3.458419500360154e-05, + "loss": 0.081, + "num_input_tokens_seen": 34061872, + "step": 58720 + }, + { + "epoch": 8.746648793565683, + "grad_norm": 0.24923498928546906, + "learning_rate": 3.458119377077056e-05, + "loss": 0.0017, + "num_input_tokens_seen": 34064720, + "step": 58725 + }, + { + "epoch": 8.747393506106643, + "grad_norm": 34.25231170654297, + "learning_rate": 3.4578192376075044e-05, + "loss": 0.2865, + "num_input_tokens_seen": 34067408, + "step": 58730 + }, + { + "epoch": 8.748138218647602, + "grad_norm": 0.08410941064357758, + "learning_rate": 3.45751908195657e-05, + "loss": 0.0464, + "num_input_tokens_seen": 34070544, + "step": 58735 + }, + { + "epoch": 8.748882931188561, + "grad_norm": 82.72888946533203, + "learning_rate": 3.457218910129324e-05, + "loss": 0.2348, + "num_input_tokens_seen": 34073424, + "step": 58740 + }, + { + "epoch": 8.74962764372952, + "grad_norm": 0.3111756443977356, + "learning_rate": 3.4569187221308376e-05, + "loss": 0.0017, + "num_input_tokens_seen": 34076080, + "step": 58745 + }, + { + "epoch": 8.75037235627048, + "grad_norm": 0.04379109665751457, + "learning_rate": 3.456618517966183e-05, + "loss": 0.0218, + "num_input_tokens_seen": 34078864, + "step": 58750 + }, + { + "epoch": 8.751117068811439, + "grad_norm": 0.022073054686188698, + "learning_rate": 3.4563182976404286e-05, + "loss": 0.2705, + "num_input_tokens_seen": 34082000, + "step": 58755 + }, + { + "epoch": 8.751861781352398, + "grad_norm": 0.007181359454989433, + "learning_rate": 3.456018061158649e-05, + "loss": 0.0118, + "num_input_tokens_seen": 34085200, + "step": 58760 + }, + { + "epoch": 8.752606493893357, + "grad_norm": 0.09178128838539124, + "learning_rate": 3.455717808525917e-05, + "loss": 0.1809, + "num_input_tokens_seen": 34088016, + "step": 58765 + }, + { + "epoch": 8.753351206434317, + "grad_norm": 8.0665922164917, + "learning_rate": 3.4554175397473036e-05, + "loss": 0.5811, + "num_input_tokens_seen": 34090864, + "step": 58770 + }, + { + "epoch": 8.754095918975276, + "grad_norm": 0.01751665212213993, + "learning_rate": 3.455117254827882e-05, + "loss": 0.1966, + "num_input_tokens_seen": 34094128, + "step": 58775 + }, + { + "epoch": 8.754840631516235, + "grad_norm": 0.055268775671720505, + "learning_rate": 3.454816953772724e-05, + "loss": 0.001, + "num_input_tokens_seen": 34097232, + "step": 58780 + }, + { + "epoch": 8.755585344057193, + "grad_norm": 0.09886667132377625, + "learning_rate": 3.4545166365869054e-05, + "loss": 0.2451, + "num_input_tokens_seen": 34100016, + "step": 58785 + }, + { + "epoch": 8.756330056598154, + "grad_norm": 0.14192073047161102, + "learning_rate": 3.454216303275498e-05, + "loss": 0.1022, + "num_input_tokens_seen": 34102864, + "step": 58790 + }, + { + "epoch": 8.757074769139113, + "grad_norm": 0.24091164767742157, + "learning_rate": 3.4539159538435755e-05, + "loss": 0.1109, + "num_input_tokens_seen": 34105744, + "step": 58795 + }, + { + "epoch": 8.757819481680071, + "grad_norm": 0.7346237897872925, + "learning_rate": 3.453615588296213e-05, + "loss": 0.0919, + "num_input_tokens_seen": 34108368, + "step": 58800 + }, + { + "epoch": 8.75856419422103, + "grad_norm": 34.05007553100586, + "learning_rate": 3.4533152066384844e-05, + "loss": 0.2708, + "num_input_tokens_seen": 34111280, + "step": 58805 + }, + { + "epoch": 8.75930890676199, + "grad_norm": 0.010787052102386951, + "learning_rate": 3.453014808875464e-05, + "loss": 0.03, + "num_input_tokens_seen": 34114096, + "step": 58810 + }, + { + "epoch": 8.76005361930295, + "grad_norm": 13.073047637939453, + "learning_rate": 3.4527143950122266e-05, + "loss": 0.1271, + "num_input_tokens_seen": 34116976, + "step": 58815 + }, + { + "epoch": 8.760798331843908, + "grad_norm": 1.8598378896713257, + "learning_rate": 3.4524139650538485e-05, + "loss": 0.0017, + "num_input_tokens_seen": 34120016, + "step": 58820 + }, + { + "epoch": 8.761543044384867, + "grad_norm": 29.050222396850586, + "learning_rate": 3.452113519005404e-05, + "loss": 0.1114, + "num_input_tokens_seen": 34122832, + "step": 58825 + }, + { + "epoch": 8.762287756925826, + "grad_norm": 4.7288007736206055, + "learning_rate": 3.45181305687197e-05, + "loss": 0.0113, + "num_input_tokens_seen": 34125712, + "step": 58830 + }, + { + "epoch": 8.763032469466786, + "grad_norm": 55.438846588134766, + "learning_rate": 3.451512578658621e-05, + "loss": 0.2275, + "num_input_tokens_seen": 34128656, + "step": 58835 + }, + { + "epoch": 8.763777182007745, + "grad_norm": 0.13567541539669037, + "learning_rate": 3.4512120843704344e-05, + "loss": 0.28, + "num_input_tokens_seen": 34131632, + "step": 58840 + }, + { + "epoch": 8.764521894548704, + "grad_norm": 0.522576093673706, + "learning_rate": 3.4509115740124866e-05, + "loss": 0.0613, + "num_input_tokens_seen": 34134704, + "step": 58845 + }, + { + "epoch": 8.765266607089664, + "grad_norm": 0.035492442548274994, + "learning_rate": 3.4506110475898535e-05, + "loss": 0.1397, + "num_input_tokens_seen": 34137808, + "step": 58850 + }, + { + "epoch": 8.766011319630623, + "grad_norm": 73.3956298828125, + "learning_rate": 3.4503105051076126e-05, + "loss": 0.0888, + "num_input_tokens_seen": 34140560, + "step": 58855 + }, + { + "epoch": 8.766756032171582, + "grad_norm": 0.1499793380498886, + "learning_rate": 3.450009946570843e-05, + "loss": 0.1101, + "num_input_tokens_seen": 34143376, + "step": 58860 + }, + { + "epoch": 8.76750074471254, + "grad_norm": 0.003837483236566186, + "learning_rate": 3.44970937198462e-05, + "loss": 0.1427, + "num_input_tokens_seen": 34145936, + "step": 58865 + }, + { + "epoch": 8.7682454572535, + "grad_norm": 64.8023681640625, + "learning_rate": 3.449408781354023e-05, + "loss": 0.2257, + "num_input_tokens_seen": 34149232, + "step": 58870 + }, + { + "epoch": 8.76899016979446, + "grad_norm": 0.20052708685398102, + "learning_rate": 3.449108174684129e-05, + "loss": 0.0008, + "num_input_tokens_seen": 34152176, + "step": 58875 + }, + { + "epoch": 8.769734882335419, + "grad_norm": 0.01746215671300888, + "learning_rate": 3.448807551980017e-05, + "loss": 0.151, + "num_input_tokens_seen": 34155120, + "step": 58880 + }, + { + "epoch": 8.770479594876377, + "grad_norm": 27.54904556274414, + "learning_rate": 3.448506913246766e-05, + "loss": 0.0922, + "num_input_tokens_seen": 34158128, + "step": 58885 + }, + { + "epoch": 8.771224307417336, + "grad_norm": 0.021850500255823135, + "learning_rate": 3.448206258489455e-05, + "loss": 0.1101, + "num_input_tokens_seen": 34161168, + "step": 58890 + }, + { + "epoch": 8.771969019958297, + "grad_norm": 0.013119803741574287, + "learning_rate": 3.4479055877131616e-05, + "loss": 0.4504, + "num_input_tokens_seen": 34164592, + "step": 58895 + }, + { + "epoch": 8.772713732499255, + "grad_norm": 0.00984260905534029, + "learning_rate": 3.4476049009229685e-05, + "loss": 0.2106, + "num_input_tokens_seen": 34167504, + "step": 58900 + }, + { + "epoch": 8.773458445040214, + "grad_norm": 13.543330192565918, + "learning_rate": 3.447304198123953e-05, + "loss": 0.3446, + "num_input_tokens_seen": 34170320, + "step": 58905 + }, + { + "epoch": 8.774203157581173, + "grad_norm": 0.0768343061208725, + "learning_rate": 3.447003479321196e-05, + "loss": 0.1293, + "num_input_tokens_seen": 34173488, + "step": 58910 + }, + { + "epoch": 8.774947870122134, + "grad_norm": 0.06155060604214668, + "learning_rate": 3.4467027445197774e-05, + "loss": 0.5144, + "num_input_tokens_seen": 34176304, + "step": 58915 + }, + { + "epoch": 8.775692582663092, + "grad_norm": 4.602540016174316, + "learning_rate": 3.446401993724778e-05, + "loss": 0.0534, + "num_input_tokens_seen": 34178992, + "step": 58920 + }, + { + "epoch": 8.776437295204051, + "grad_norm": 48.650997161865234, + "learning_rate": 3.446101226941279e-05, + "loss": 0.0172, + "num_input_tokens_seen": 34181776, + "step": 58925 + }, + { + "epoch": 8.77718200774501, + "grad_norm": 53.255245208740234, + "learning_rate": 3.4458004441743605e-05, + "loss": 0.1142, + "num_input_tokens_seen": 34184624, + "step": 58930 + }, + { + "epoch": 8.77792672028597, + "grad_norm": 0.03431737795472145, + "learning_rate": 3.445499645429107e-05, + "loss": 0.0275, + "num_input_tokens_seen": 34187472, + "step": 58935 + }, + { + "epoch": 8.778671432826929, + "grad_norm": 36.95559310913086, + "learning_rate": 3.445198830710596e-05, + "loss": 0.1307, + "num_input_tokens_seen": 34190160, + "step": 58940 + }, + { + "epoch": 8.779416145367888, + "grad_norm": 6.599283218383789, + "learning_rate": 3.4448980000239114e-05, + "loss": 0.0551, + "num_input_tokens_seen": 34192944, + "step": 58945 + }, + { + "epoch": 8.780160857908847, + "grad_norm": 0.16673167049884796, + "learning_rate": 3.444597153374136e-05, + "loss": 0.0294, + "num_input_tokens_seen": 34195952, + "step": 58950 + }, + { + "epoch": 8.780905570449807, + "grad_norm": 0.0432441420853138, + "learning_rate": 3.444296290766352e-05, + "loss": 0.2496, + "num_input_tokens_seen": 34198896, + "step": 58955 + }, + { + "epoch": 8.781650282990766, + "grad_norm": 0.10340632498264313, + "learning_rate": 3.443995412205642e-05, + "loss": 0.0474, + "num_input_tokens_seen": 34201744, + "step": 58960 + }, + { + "epoch": 8.782394995531725, + "grad_norm": 0.0032935950439423323, + "learning_rate": 3.443694517697089e-05, + "loss": 0.2421, + "num_input_tokens_seen": 34204720, + "step": 58965 + }, + { + "epoch": 8.783139708072683, + "grad_norm": 0.825555145740509, + "learning_rate": 3.4433936072457754e-05, + "loss": 0.1371, + "num_input_tokens_seen": 34207760, + "step": 58970 + }, + { + "epoch": 8.783884420613644, + "grad_norm": 0.026462722569704056, + "learning_rate": 3.443092680856787e-05, + "loss": 0.02, + "num_input_tokens_seen": 34210800, + "step": 58975 + }, + { + "epoch": 8.784629133154603, + "grad_norm": 0.012568891048431396, + "learning_rate": 3.442791738535205e-05, + "loss": 0.3408, + "num_input_tokens_seen": 34213712, + "step": 58980 + }, + { + "epoch": 8.785373845695561, + "grad_norm": 0.083432137966156, + "learning_rate": 3.4424907802861143e-05, + "loss": 0.1212, + "num_input_tokens_seen": 34216464, + "step": 58985 + }, + { + "epoch": 8.78611855823652, + "grad_norm": 0.04417197033762932, + "learning_rate": 3.4421898061146005e-05, + "loss": 0.1231, + "num_input_tokens_seen": 34219536, + "step": 58990 + }, + { + "epoch": 8.78686327077748, + "grad_norm": 40.87661361694336, + "learning_rate": 3.4418888160257486e-05, + "loss": 0.2976, + "num_input_tokens_seen": 34222416, + "step": 58995 + }, + { + "epoch": 8.78760798331844, + "grad_norm": 26.20378875732422, + "learning_rate": 3.441587810024642e-05, + "loss": 0.109, + "num_input_tokens_seen": 34225296, + "step": 59000 + }, + { + "epoch": 8.788352695859398, + "grad_norm": 0.5214197635650635, + "learning_rate": 3.441286788116365e-05, + "loss": 0.2238, + "num_input_tokens_seen": 34228176, + "step": 59005 + }, + { + "epoch": 8.789097408400357, + "grad_norm": 0.7698008418083191, + "learning_rate": 3.440985750306006e-05, + "loss": 0.001, + "num_input_tokens_seen": 34231056, + "step": 59010 + }, + { + "epoch": 8.789842120941316, + "grad_norm": 0.16878663003444672, + "learning_rate": 3.4406846965986476e-05, + "loss": 0.3422, + "num_input_tokens_seen": 34234064, + "step": 59015 + }, + { + "epoch": 8.790586833482276, + "grad_norm": 10.117059707641602, + "learning_rate": 3.440383626999378e-05, + "loss": 0.1502, + "num_input_tokens_seen": 34236752, + "step": 59020 + }, + { + "epoch": 8.791331546023235, + "grad_norm": 23.68743896484375, + "learning_rate": 3.440082541513283e-05, + "loss": 0.1885, + "num_input_tokens_seen": 34239504, + "step": 59025 + }, + { + "epoch": 8.792076258564194, + "grad_norm": 58.10013198852539, + "learning_rate": 3.439781440145449e-05, + "loss": 0.3515, + "num_input_tokens_seen": 34242480, + "step": 59030 + }, + { + "epoch": 8.792820971105153, + "grad_norm": 0.07096157222986221, + "learning_rate": 3.4394803229009634e-05, + "loss": 0.114, + "num_input_tokens_seen": 34245712, + "step": 59035 + }, + { + "epoch": 8.793565683646113, + "grad_norm": 5.275511741638184, + "learning_rate": 3.439179189784911e-05, + "loss": 0.2451, + "num_input_tokens_seen": 34248912, + "step": 59040 + }, + { + "epoch": 8.794310396187072, + "grad_norm": 30.278621673583984, + "learning_rate": 3.438878040802381e-05, + "loss": 0.0848, + "num_input_tokens_seen": 34251760, + "step": 59045 + }, + { + "epoch": 8.79505510872803, + "grad_norm": 0.06181745231151581, + "learning_rate": 3.438576875958461e-05, + "loss": 0.0248, + "num_input_tokens_seen": 34254544, + "step": 59050 + }, + { + "epoch": 8.79579982126899, + "grad_norm": 218.71945190429688, + "learning_rate": 3.438275695258239e-05, + "loss": 0.2882, + "num_input_tokens_seen": 34257456, + "step": 59055 + }, + { + "epoch": 8.79654453380995, + "grad_norm": 69.296875, + "learning_rate": 3.4379744987068025e-05, + "loss": 0.2929, + "num_input_tokens_seen": 34260304, + "step": 59060 + }, + { + "epoch": 8.797289246350909, + "grad_norm": 0.02501535415649414, + "learning_rate": 3.43767328630924e-05, + "loss": 0.1399, + "num_input_tokens_seen": 34262992, + "step": 59065 + }, + { + "epoch": 8.798033958891867, + "grad_norm": 0.05696319416165352, + "learning_rate": 3.437372058070641e-05, + "loss": 0.3354, + "num_input_tokens_seen": 34265776, + "step": 59070 + }, + { + "epoch": 8.798778671432826, + "grad_norm": 0.023921608924865723, + "learning_rate": 3.4370708139960934e-05, + "loss": 0.062, + "num_input_tokens_seen": 34268784, + "step": 59075 + }, + { + "epoch": 8.799523383973787, + "grad_norm": 28.62864112854004, + "learning_rate": 3.4367695540906864e-05, + "loss": 0.1248, + "num_input_tokens_seen": 34271888, + "step": 59080 + }, + { + "epoch": 8.800268096514746, + "grad_norm": 33.3172492980957, + "learning_rate": 3.43646827835951e-05, + "loss": 0.0395, + "num_input_tokens_seen": 34275120, + "step": 59085 + }, + { + "epoch": 8.801012809055704, + "grad_norm": 33.278099060058594, + "learning_rate": 3.436166986807654e-05, + "loss": 0.2999, + "num_input_tokens_seen": 34278000, + "step": 59090 + }, + { + "epoch": 8.801757521596663, + "grad_norm": 0.42286816239356995, + "learning_rate": 3.435865679440208e-05, + "loss": 0.1177, + "num_input_tokens_seen": 34280624, + "step": 59095 + }, + { + "epoch": 8.802502234137624, + "grad_norm": 13.314081192016602, + "learning_rate": 3.435564356262263e-05, + "loss": 0.2069, + "num_input_tokens_seen": 34283696, + "step": 59100 + }, + { + "epoch": 8.803246946678582, + "grad_norm": 50.835330963134766, + "learning_rate": 3.435263017278909e-05, + "loss": 0.1445, + "num_input_tokens_seen": 34286832, + "step": 59105 + }, + { + "epoch": 8.803991659219541, + "grad_norm": 0.20144522190093994, + "learning_rate": 3.4349616624952365e-05, + "loss": 0.1612, + "num_input_tokens_seen": 34289776, + "step": 59110 + }, + { + "epoch": 8.8047363717605, + "grad_norm": 0.7080044746398926, + "learning_rate": 3.434660291916337e-05, + "loss": 0.1967, + "num_input_tokens_seen": 34292432, + "step": 59115 + }, + { + "epoch": 8.80548108430146, + "grad_norm": 3.0339386463165283, + "learning_rate": 3.4343589055473025e-05, + "loss": 0.0108, + "num_input_tokens_seen": 34295632, + "step": 59120 + }, + { + "epoch": 8.80622579684242, + "grad_norm": 0.027939409017562866, + "learning_rate": 3.4340575033932234e-05, + "loss": 0.0608, + "num_input_tokens_seen": 34298512, + "step": 59125 + }, + { + "epoch": 8.806970509383378, + "grad_norm": 0.017125237733125687, + "learning_rate": 3.433756085459192e-05, + "loss": 0.0905, + "num_input_tokens_seen": 34301360, + "step": 59130 + }, + { + "epoch": 8.807715221924337, + "grad_norm": 0.008531865663826466, + "learning_rate": 3.4334546517503006e-05, + "loss": 0.1423, + "num_input_tokens_seen": 34303984, + "step": 59135 + }, + { + "epoch": 8.808459934465297, + "grad_norm": 0.46953096985816956, + "learning_rate": 3.4331532022716416e-05, + "loss": 0.13, + "num_input_tokens_seen": 34306992, + "step": 59140 + }, + { + "epoch": 8.809204647006256, + "grad_norm": 43.07401657104492, + "learning_rate": 3.432851737028308e-05, + "loss": 0.3711, + "num_input_tokens_seen": 34309712, + "step": 59145 + }, + { + "epoch": 8.809949359547215, + "grad_norm": 0.6279772520065308, + "learning_rate": 3.432550256025391e-05, + "loss": 0.0346, + "num_input_tokens_seen": 34312528, + "step": 59150 + }, + { + "epoch": 8.810694072088173, + "grad_norm": 3.8508975505828857, + "learning_rate": 3.4322487592679876e-05, + "loss": 0.2598, + "num_input_tokens_seen": 34315056, + "step": 59155 + }, + { + "epoch": 8.811438784629132, + "grad_norm": 0.7083685398101807, + "learning_rate": 3.4319472467611876e-05, + "loss": 0.4249, + "num_input_tokens_seen": 34317968, + "step": 59160 + }, + { + "epoch": 8.812183497170093, + "grad_norm": 0.008674832992255688, + "learning_rate": 3.431645718510086e-05, + "loss": 0.05, + "num_input_tokens_seen": 34320944, + "step": 59165 + }, + { + "epoch": 8.812928209711052, + "grad_norm": 0.08803122490644455, + "learning_rate": 3.431344174519777e-05, + "loss": 0.2281, + "num_input_tokens_seen": 34323824, + "step": 59170 + }, + { + "epoch": 8.81367292225201, + "grad_norm": 23.76774787902832, + "learning_rate": 3.431042614795354e-05, + "loss": 0.1673, + "num_input_tokens_seen": 34326768, + "step": 59175 + }, + { + "epoch": 8.81441763479297, + "grad_norm": 0.2612098753452301, + "learning_rate": 3.430741039341914e-05, + "loss": 0.0669, + "num_input_tokens_seen": 34329360, + "step": 59180 + }, + { + "epoch": 8.81516234733393, + "grad_norm": 5.907101631164551, + "learning_rate": 3.4304394481645485e-05, + "loss": 0.4255, + "num_input_tokens_seen": 34332592, + "step": 59185 + }, + { + "epoch": 8.815907059874888, + "grad_norm": 0.06346282362937927, + "learning_rate": 3.430137841268355e-05, + "loss": 0.0545, + "num_input_tokens_seen": 34335408, + "step": 59190 + }, + { + "epoch": 8.816651772415847, + "grad_norm": 0.015224038623273373, + "learning_rate": 3.4298362186584275e-05, + "loss": 0.2573, + "num_input_tokens_seen": 34338320, + "step": 59195 + }, + { + "epoch": 8.817396484956806, + "grad_norm": 22.860034942626953, + "learning_rate": 3.4295345803398634e-05, + "loss": 0.2662, + "num_input_tokens_seen": 34341392, + "step": 59200 + }, + { + "epoch": 8.818141197497766, + "grad_norm": 9.424515724182129, + "learning_rate": 3.429232926317756e-05, + "loss": 0.3563, + "num_input_tokens_seen": 34344080, + "step": 59205 + }, + { + "epoch": 8.818885910038725, + "grad_norm": 68.20187377929688, + "learning_rate": 3.428931256597203e-05, + "loss": 0.1918, + "num_input_tokens_seen": 34346864, + "step": 59210 + }, + { + "epoch": 8.819630622579684, + "grad_norm": 69.9737548828125, + "learning_rate": 3.428629571183301e-05, + "loss": 0.057, + "num_input_tokens_seen": 34349712, + "step": 59215 + }, + { + "epoch": 8.820375335120643, + "grad_norm": 8.845540046691895, + "learning_rate": 3.428327870081145e-05, + "loss": 0.1012, + "num_input_tokens_seen": 34352784, + "step": 59220 + }, + { + "epoch": 8.821120047661603, + "grad_norm": 0.018472885712981224, + "learning_rate": 3.428026153295834e-05, + "loss": 0.0896, + "num_input_tokens_seen": 34355600, + "step": 59225 + }, + { + "epoch": 8.821864760202562, + "grad_norm": 0.16120322048664093, + "learning_rate": 3.427724420832464e-05, + "loss": 0.0028, + "num_input_tokens_seen": 34358288, + "step": 59230 + }, + { + "epoch": 8.82260947274352, + "grad_norm": 0.044584762305021286, + "learning_rate": 3.427422672696135e-05, + "loss": 0.2927, + "num_input_tokens_seen": 34361360, + "step": 59235 + }, + { + "epoch": 8.82335418528448, + "grad_norm": 74.9128646850586, + "learning_rate": 3.42712090889194e-05, + "loss": 0.1798, + "num_input_tokens_seen": 34364144, + "step": 59240 + }, + { + "epoch": 8.82409889782544, + "grad_norm": 0.04301559552550316, + "learning_rate": 3.426819129424979e-05, + "loss": 0.0038, + "num_input_tokens_seen": 34366864, + "step": 59245 + }, + { + "epoch": 8.824843610366399, + "grad_norm": 1.3716074228286743, + "learning_rate": 3.426517334300352e-05, + "loss": 0.001, + "num_input_tokens_seen": 34369808, + "step": 59250 + }, + { + "epoch": 8.825588322907358, + "grad_norm": 0.7752940058708191, + "learning_rate": 3.426215523523157e-05, + "loss": 0.1356, + "num_input_tokens_seen": 34372912, + "step": 59255 + }, + { + "epoch": 8.826333035448316, + "grad_norm": 97.36577606201172, + "learning_rate": 3.425913697098491e-05, + "loss": 0.5471, + "num_input_tokens_seen": 34376016, + "step": 59260 + }, + { + "epoch": 8.827077747989277, + "grad_norm": 0.022447599098086357, + "learning_rate": 3.4256118550314556e-05, + "loss": 0.0201, + "num_input_tokens_seen": 34379024, + "step": 59265 + }, + { + "epoch": 8.827822460530236, + "grad_norm": 0.1387144923210144, + "learning_rate": 3.425309997327147e-05, + "loss": 0.0248, + "num_input_tokens_seen": 34381744, + "step": 59270 + }, + { + "epoch": 8.828567173071194, + "grad_norm": 0.03585664555430412, + "learning_rate": 3.4250081239906674e-05, + "loss": 0.0456, + "num_input_tokens_seen": 34384496, + "step": 59275 + }, + { + "epoch": 8.829311885612153, + "grad_norm": 0.01649121567606926, + "learning_rate": 3.424706235027115e-05, + "loss": 0.0904, + "num_input_tokens_seen": 34387344, + "step": 59280 + }, + { + "epoch": 8.830056598153114, + "grad_norm": 0.49296098947525024, + "learning_rate": 3.4244043304415907e-05, + "loss": 0.0039, + "num_input_tokens_seen": 34390160, + "step": 59285 + }, + { + "epoch": 8.830801310694072, + "grad_norm": 0.025551743805408478, + "learning_rate": 3.424102410239195e-05, + "loss": 0.353, + "num_input_tokens_seen": 34393040, + "step": 59290 + }, + { + "epoch": 8.831546023235031, + "grad_norm": 0.009577128104865551, + "learning_rate": 3.423800474425029e-05, + "loss": 0.2468, + "num_input_tokens_seen": 34396016, + "step": 59295 + }, + { + "epoch": 8.83229073577599, + "grad_norm": 0.014345169998705387, + "learning_rate": 3.4234985230041916e-05, + "loss": 0.0075, + "num_input_tokens_seen": 34398608, + "step": 59300 + }, + { + "epoch": 8.83303544831695, + "grad_norm": 5.739204406738281, + "learning_rate": 3.4231965559817856e-05, + "loss": 0.0383, + "num_input_tokens_seen": 34401520, + "step": 59305 + }, + { + "epoch": 8.83378016085791, + "grad_norm": 30.72942352294922, + "learning_rate": 3.4228945733629124e-05, + "loss": 0.2018, + "num_input_tokens_seen": 34404432, + "step": 59310 + }, + { + "epoch": 8.834524873398868, + "grad_norm": 0.053217485547065735, + "learning_rate": 3.422592575152673e-05, + "loss": 0.0019, + "num_input_tokens_seen": 34407312, + "step": 59315 + }, + { + "epoch": 8.835269585939827, + "grad_norm": 0.03870954364538193, + "learning_rate": 3.4222905613561706e-05, + "loss": 0.001, + "num_input_tokens_seen": 34410256, + "step": 59320 + }, + { + "epoch": 8.836014298480787, + "grad_norm": 0.01447589136660099, + "learning_rate": 3.421988531978506e-05, + "loss": 0.1784, + "num_input_tokens_seen": 34412944, + "step": 59325 + }, + { + "epoch": 8.836759011021746, + "grad_norm": 0.020398898050189018, + "learning_rate": 3.421686487024782e-05, + "loss": 0.2284, + "num_input_tokens_seen": 34415568, + "step": 59330 + }, + { + "epoch": 8.837503723562705, + "grad_norm": 0.011727681383490562, + "learning_rate": 3.4213844265001015e-05, + "loss": 0.6271, + "num_input_tokens_seen": 34418320, + "step": 59335 + }, + { + "epoch": 8.838248436103664, + "grad_norm": 0.014833427034318447, + "learning_rate": 3.421082350409568e-05, + "loss": 0.0044, + "num_input_tokens_seen": 34421200, + "step": 59340 + }, + { + "epoch": 8.838993148644622, + "grad_norm": 0.03594449162483215, + "learning_rate": 3.420780258758284e-05, + "loss": 0.0002, + "num_input_tokens_seen": 34424240, + "step": 59345 + }, + { + "epoch": 8.839737861185583, + "grad_norm": 0.06582437455654144, + "learning_rate": 3.420478151551353e-05, + "loss": 0.1706, + "num_input_tokens_seen": 34427376, + "step": 59350 + }, + { + "epoch": 8.840482573726542, + "grad_norm": 0.016067935153841972, + "learning_rate": 3.42017602879388e-05, + "loss": 0.001, + "num_input_tokens_seen": 34430384, + "step": 59355 + }, + { + "epoch": 8.8412272862675, + "grad_norm": 17.888004302978516, + "learning_rate": 3.419873890490968e-05, + "loss": 0.0284, + "num_input_tokens_seen": 34433200, + "step": 59360 + }, + { + "epoch": 8.84197199880846, + "grad_norm": 0.05872466415166855, + "learning_rate": 3.4195717366477216e-05, + "loss": 0.0935, + "num_input_tokens_seen": 34435760, + "step": 59365 + }, + { + "epoch": 8.84271671134942, + "grad_norm": 0.15415847301483154, + "learning_rate": 3.419269567269245e-05, + "loss": 0.087, + "num_input_tokens_seen": 34438480, + "step": 59370 + }, + { + "epoch": 8.843461423890378, + "grad_norm": 0.08773830533027649, + "learning_rate": 3.418967382360643e-05, + "loss": 0.0005, + "num_input_tokens_seen": 34441520, + "step": 59375 + }, + { + "epoch": 8.844206136431337, + "grad_norm": 0.10994653403759003, + "learning_rate": 3.4186651819270224e-05, + "loss": 0.2691, + "num_input_tokens_seen": 34444432, + "step": 59380 + }, + { + "epoch": 8.844950848972296, + "grad_norm": 58.218231201171875, + "learning_rate": 3.4183629659734855e-05, + "loss": 0.3387, + "num_input_tokens_seen": 34447376, + "step": 59385 + }, + { + "epoch": 8.845695561513256, + "grad_norm": 56.969058990478516, + "learning_rate": 3.418060734505141e-05, + "loss": 0.2845, + "num_input_tokens_seen": 34450160, + "step": 59390 + }, + { + "epoch": 8.846440274054215, + "grad_norm": 0.047456495463848114, + "learning_rate": 3.417758487527093e-05, + "loss": 0.24, + "num_input_tokens_seen": 34453168, + "step": 59395 + }, + { + "epoch": 8.847184986595174, + "grad_norm": 28.823213577270508, + "learning_rate": 3.417456225044449e-05, + "loss": 0.2303, + "num_input_tokens_seen": 34456272, + "step": 59400 + }, + { + "epoch": 8.847929699136133, + "grad_norm": 5.890193939208984, + "learning_rate": 3.417153947062313e-05, + "loss": 0.1226, + "num_input_tokens_seen": 34459184, + "step": 59405 + }, + { + "epoch": 8.848674411677093, + "grad_norm": 0.2066379338502884, + "learning_rate": 3.416851653585794e-05, + "loss": 0.301, + "num_input_tokens_seen": 34462000, + "step": 59410 + }, + { + "epoch": 8.849419124218052, + "grad_norm": 10.556022644042969, + "learning_rate": 3.416549344619998e-05, + "loss": 0.1467, + "num_input_tokens_seen": 34465040, + "step": 59415 + }, + { + "epoch": 8.85016383675901, + "grad_norm": 0.08027180284261703, + "learning_rate": 3.416247020170032e-05, + "loss": 0.0005, + "num_input_tokens_seen": 34467984, + "step": 59420 + }, + { + "epoch": 8.85090854929997, + "grad_norm": 0.05494158715009689, + "learning_rate": 3.415944680241004e-05, + "loss": 0.0029, + "num_input_tokens_seen": 34470576, + "step": 59425 + }, + { + "epoch": 8.85165326184093, + "grad_norm": 97.2529525756836, + "learning_rate": 3.415642324838023e-05, + "loss": 0.0172, + "num_input_tokens_seen": 34473392, + "step": 59430 + }, + { + "epoch": 8.852397974381889, + "grad_norm": 0.94580078125, + "learning_rate": 3.415339953966194e-05, + "loss": 0.0017, + "num_input_tokens_seen": 34476336, + "step": 59435 + }, + { + "epoch": 8.853142686922848, + "grad_norm": 84.14909362792969, + "learning_rate": 3.4150375676306276e-05, + "loss": 0.1461, + "num_input_tokens_seen": 34479280, + "step": 59440 + }, + { + "epoch": 8.853887399463806, + "grad_norm": 0.02899971976876259, + "learning_rate": 3.4147351658364304e-05, + "loss": 0.0201, + "num_input_tokens_seen": 34482096, + "step": 59445 + }, + { + "epoch": 8.854632112004767, + "grad_norm": 0.08411090075969696, + "learning_rate": 3.4144327485887126e-05, + "loss": 0.0002, + "num_input_tokens_seen": 34484784, + "step": 59450 + }, + { + "epoch": 8.855376824545726, + "grad_norm": 0.003514103125780821, + "learning_rate": 3.414130315892583e-05, + "loss": 0.1138, + "num_input_tokens_seen": 34487888, + "step": 59455 + }, + { + "epoch": 8.856121537086684, + "grad_norm": 7.757139682769775, + "learning_rate": 3.4138278677531515e-05, + "loss": 0.0025, + "num_input_tokens_seen": 34490800, + "step": 59460 + }, + { + "epoch": 8.856866249627643, + "grad_norm": 0.060807690024375916, + "learning_rate": 3.413525404175527e-05, + "loss": 0.0012, + "num_input_tokens_seen": 34493680, + "step": 59465 + }, + { + "epoch": 8.857610962168604, + "grad_norm": 0.014030683785676956, + "learning_rate": 3.413222925164818e-05, + "loss": 0.0115, + "num_input_tokens_seen": 34496720, + "step": 59470 + }, + { + "epoch": 8.858355674709562, + "grad_norm": 33.09156799316406, + "learning_rate": 3.412920430726137e-05, + "loss": 0.2827, + "num_input_tokens_seen": 34499664, + "step": 59475 + }, + { + "epoch": 8.859100387250521, + "grad_norm": 43.37638473510742, + "learning_rate": 3.412617920864593e-05, + "loss": 0.1987, + "num_input_tokens_seen": 34502416, + "step": 59480 + }, + { + "epoch": 8.85984509979148, + "grad_norm": 0.0018125836504623294, + "learning_rate": 3.412315395585296e-05, + "loss": 0.0004, + "num_input_tokens_seen": 34505328, + "step": 59485 + }, + { + "epoch": 8.86058981233244, + "grad_norm": 14.414834976196289, + "learning_rate": 3.4120128548933575e-05, + "loss": 0.5304, + "num_input_tokens_seen": 34508176, + "step": 59490 + }, + { + "epoch": 8.8613345248734, + "grad_norm": 0.017246989533305168, + "learning_rate": 3.4117102987938895e-05, + "loss": 0.0005, + "num_input_tokens_seen": 34511152, + "step": 59495 + }, + { + "epoch": 8.862079237414358, + "grad_norm": 0.66968834400177, + "learning_rate": 3.411407727292003e-05, + "loss": 0.0133, + "num_input_tokens_seen": 34513840, + "step": 59500 + }, + { + "epoch": 8.862823949955317, + "grad_norm": 0.0007698560366407037, + "learning_rate": 3.411105140392808e-05, + "loss": 0.1338, + "num_input_tokens_seen": 34516848, + "step": 59505 + }, + { + "epoch": 8.863568662496277, + "grad_norm": 0.024755043908953667, + "learning_rate": 3.4108025381014184e-05, + "loss": 0.0171, + "num_input_tokens_seen": 34519696, + "step": 59510 + }, + { + "epoch": 8.864313375037236, + "grad_norm": 0.04255327582359314, + "learning_rate": 3.4104999204229466e-05, + "loss": 0.0179, + "num_input_tokens_seen": 34522800, + "step": 59515 + }, + { + "epoch": 8.865058087578195, + "grad_norm": 0.13709156215190887, + "learning_rate": 3.410197287362503e-05, + "loss": 0.0006, + "num_input_tokens_seen": 34525680, + "step": 59520 + }, + { + "epoch": 8.865802800119154, + "grad_norm": 0.34478873014450073, + "learning_rate": 3.409894638925201e-05, + "loss": 0.0088, + "num_input_tokens_seen": 34528592, + "step": 59525 + }, + { + "epoch": 8.866547512660112, + "grad_norm": 0.04193273186683655, + "learning_rate": 3.409591975116155e-05, + "loss": 0.0103, + "num_input_tokens_seen": 34531216, + "step": 59530 + }, + { + "epoch": 8.867292225201073, + "grad_norm": 0.008083541877567768, + "learning_rate": 3.409289295940476e-05, + "loss": 0.03, + "num_input_tokens_seen": 34534064, + "step": 59535 + }, + { + "epoch": 8.868036937742032, + "grad_norm": 0.010075614787638187, + "learning_rate": 3.408986601403278e-05, + "loss": 0.0941, + "num_input_tokens_seen": 34537040, + "step": 59540 + }, + { + "epoch": 8.86878165028299, + "grad_norm": 214.61265563964844, + "learning_rate": 3.4086838915096765e-05, + "loss": 0.3546, + "num_input_tokens_seen": 34539728, + "step": 59545 + }, + { + "epoch": 8.86952636282395, + "grad_norm": 0.016525963321328163, + "learning_rate": 3.408381166264784e-05, + "loss": 0.0005, + "num_input_tokens_seen": 34542384, + "step": 59550 + }, + { + "epoch": 8.87027107536491, + "grad_norm": 13.486468315124512, + "learning_rate": 3.408078425673714e-05, + "loss": 0.481, + "num_input_tokens_seen": 34545296, + "step": 59555 + }, + { + "epoch": 8.871015787905868, + "grad_norm": 0.023961810395121574, + "learning_rate": 3.407775669741583e-05, + "loss": 0.002, + "num_input_tokens_seen": 34548208, + "step": 59560 + }, + { + "epoch": 8.871760500446827, + "grad_norm": 79.95726776123047, + "learning_rate": 3.4074728984735043e-05, + "loss": 0.1355, + "num_input_tokens_seen": 34551120, + "step": 59565 + }, + { + "epoch": 8.872505212987786, + "grad_norm": 0.41364818811416626, + "learning_rate": 3.407170111874593e-05, + "loss": 0.2377, + "num_input_tokens_seen": 34553936, + "step": 59570 + }, + { + "epoch": 8.873249925528746, + "grad_norm": 31.772397994995117, + "learning_rate": 3.4068673099499646e-05, + "loss": 0.2579, + "num_input_tokens_seen": 34556592, + "step": 59575 + }, + { + "epoch": 8.873994638069705, + "grad_norm": 0.18057747185230255, + "learning_rate": 3.4065644927047354e-05, + "loss": 0.2103, + "num_input_tokens_seen": 34559440, + "step": 59580 + }, + { + "epoch": 8.874739350610664, + "grad_norm": 1.140176773071289, + "learning_rate": 3.40626166014402e-05, + "loss": 0.0018, + "num_input_tokens_seen": 34562032, + "step": 59585 + }, + { + "epoch": 8.875484063151623, + "grad_norm": 125.86876678466797, + "learning_rate": 3.4059588122729344e-05, + "loss": 0.3189, + "num_input_tokens_seen": 34564848, + "step": 59590 + }, + { + "epoch": 8.876228775692583, + "grad_norm": 21.707761764526367, + "learning_rate": 3.405655949096597e-05, + "loss": 0.173, + "num_input_tokens_seen": 34567696, + "step": 59595 + }, + { + "epoch": 8.876973488233542, + "grad_norm": 0.04265119135379791, + "learning_rate": 3.405353070620122e-05, + "loss": 0.0603, + "num_input_tokens_seen": 34570832, + "step": 59600 + }, + { + "epoch": 8.8777182007745, + "grad_norm": 0.05021039769053459, + "learning_rate": 3.4050501768486266e-05, + "loss": 0.0006, + "num_input_tokens_seen": 34573744, + "step": 59605 + }, + { + "epoch": 8.87846291331546, + "grad_norm": 0.008300574496388435, + "learning_rate": 3.404747267787228e-05, + "loss": 0.0005, + "num_input_tokens_seen": 34576464, + "step": 59610 + }, + { + "epoch": 8.87920762585642, + "grad_norm": 21.86861801147461, + "learning_rate": 3.404444343441045e-05, + "loss": 0.0565, + "num_input_tokens_seen": 34579408, + "step": 59615 + }, + { + "epoch": 8.879952338397379, + "grad_norm": 0.0014022768009454012, + "learning_rate": 3.404141403815193e-05, + "loss": 0.3232, + "num_input_tokens_seen": 34582416, + "step": 59620 + }, + { + "epoch": 8.880697050938338, + "grad_norm": 0.010787341743707657, + "learning_rate": 3.4038384489147926e-05, + "loss": 0.01, + "num_input_tokens_seen": 34585200, + "step": 59625 + }, + { + "epoch": 8.881441763479296, + "grad_norm": 24.226205825805664, + "learning_rate": 3.4035354787449584e-05, + "loss": 0.1965, + "num_input_tokens_seen": 34587920, + "step": 59630 + }, + { + "epoch": 8.882186476020257, + "grad_norm": 0.011952620930969715, + "learning_rate": 3.403232493310811e-05, + "loss": 0.0271, + "num_input_tokens_seen": 34590736, + "step": 59635 + }, + { + "epoch": 8.882931188561216, + "grad_norm": 0.014578219503164291, + "learning_rate": 3.402929492617469e-05, + "loss": 0.1406, + "num_input_tokens_seen": 34594096, + "step": 59640 + }, + { + "epoch": 8.883675901102174, + "grad_norm": 33.0220832824707, + "learning_rate": 3.402626476670051e-05, + "loss": 0.2162, + "num_input_tokens_seen": 34596944, + "step": 59645 + }, + { + "epoch": 8.884420613643133, + "grad_norm": 0.012883302755653858, + "learning_rate": 3.4023234454736756e-05, + "loss": 0.0369, + "num_input_tokens_seen": 34600368, + "step": 59650 + }, + { + "epoch": 8.885165326184094, + "grad_norm": 2.280089855194092, + "learning_rate": 3.402020399033463e-05, + "loss": 0.0559, + "num_input_tokens_seen": 34603248, + "step": 59655 + }, + { + "epoch": 8.885910038725052, + "grad_norm": 0.015484843403100967, + "learning_rate": 3.401717337354533e-05, + "loss": 0.5249, + "num_input_tokens_seen": 34606000, + "step": 59660 + }, + { + "epoch": 8.886654751266011, + "grad_norm": 0.049081750214099884, + "learning_rate": 3.401414260442004e-05, + "loss": 0.1887, + "num_input_tokens_seen": 34608976, + "step": 59665 + }, + { + "epoch": 8.88739946380697, + "grad_norm": 0.013395285233855247, + "learning_rate": 3.401111168300998e-05, + "loss": 0.2479, + "num_input_tokens_seen": 34611824, + "step": 59670 + }, + { + "epoch": 8.88814417634793, + "grad_norm": 0.05728926509618759, + "learning_rate": 3.400808060936635e-05, + "loss": 0.2201, + "num_input_tokens_seen": 34614704, + "step": 59675 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.015859035775065422, + "learning_rate": 3.4005049383540345e-05, + "loss": 0.3089, + "num_input_tokens_seen": 34617520, + "step": 59680 + }, + { + "epoch": 8.889633601429848, + "grad_norm": 6.910933494567871, + "learning_rate": 3.400201800558318e-05, + "loss": 0.0032, + "num_input_tokens_seen": 34620336, + "step": 59685 + }, + { + "epoch": 8.890378313970807, + "grad_norm": 0.017915930598974228, + "learning_rate": 3.399898647554608e-05, + "loss": 0.003, + "num_input_tokens_seen": 34623408, + "step": 59690 + }, + { + "epoch": 8.891123026511767, + "grad_norm": 0.02275726944208145, + "learning_rate": 3.399595479348024e-05, + "loss": 0.0074, + "num_input_tokens_seen": 34626288, + "step": 59695 + }, + { + "epoch": 8.891867739052726, + "grad_norm": 0.048208095133304596, + "learning_rate": 3.3992922959436894e-05, + "loss": 0.0091, + "num_input_tokens_seen": 34629296, + "step": 59700 + }, + { + "epoch": 8.892612451593685, + "grad_norm": 23.133970260620117, + "learning_rate": 3.3989890973467255e-05, + "loss": 0.3909, + "num_input_tokens_seen": 34632272, + "step": 59705 + }, + { + "epoch": 8.893357164134644, + "grad_norm": 0.0033717527985572815, + "learning_rate": 3.3986858835622536e-05, + "loss": 0.0277, + "num_input_tokens_seen": 34634800, + "step": 59710 + }, + { + "epoch": 8.894101876675602, + "grad_norm": 32.2020378112793, + "learning_rate": 3.398382654595398e-05, + "loss": 0.098, + "num_input_tokens_seen": 34637680, + "step": 59715 + }, + { + "epoch": 8.894846589216563, + "grad_norm": 192.67958068847656, + "learning_rate": 3.39807941045128e-05, + "loss": 0.088, + "num_input_tokens_seen": 34640752, + "step": 59720 + }, + { + "epoch": 8.895591301757522, + "grad_norm": 4.015116214752197, + "learning_rate": 3.397776151135024e-05, + "loss": 0.1869, + "num_input_tokens_seen": 34643472, + "step": 59725 + }, + { + "epoch": 8.89633601429848, + "grad_norm": 0.0677848756313324, + "learning_rate": 3.397472876651752e-05, + "loss": 0.0013, + "num_input_tokens_seen": 34646544, + "step": 59730 + }, + { + "epoch": 8.89708072683944, + "grad_norm": 0.010460014455020428, + "learning_rate": 3.397169587006588e-05, + "loss": 0.1082, + "num_input_tokens_seen": 34649648, + "step": 59735 + }, + { + "epoch": 8.8978254393804, + "grad_norm": 0.7042471170425415, + "learning_rate": 3.396866282204655e-05, + "loss": 0.2072, + "num_input_tokens_seen": 34652496, + "step": 59740 + }, + { + "epoch": 8.898570151921358, + "grad_norm": 0.16471922397613525, + "learning_rate": 3.3965629622510776e-05, + "loss": 0.1985, + "num_input_tokens_seen": 34655632, + "step": 59745 + }, + { + "epoch": 8.899314864462317, + "grad_norm": 0.1594524383544922, + "learning_rate": 3.3962596271509806e-05, + "loss": 0.207, + "num_input_tokens_seen": 34658512, + "step": 59750 + }, + { + "epoch": 8.900059577003276, + "grad_norm": 0.05206022784113884, + "learning_rate": 3.395956276909488e-05, + "loss": 0.1121, + "num_input_tokens_seen": 34661616, + "step": 59755 + }, + { + "epoch": 8.900804289544237, + "grad_norm": 18.184341430664062, + "learning_rate": 3.395652911531725e-05, + "loss": 0.1571, + "num_input_tokens_seen": 34664272, + "step": 59760 + }, + { + "epoch": 8.901549002085195, + "grad_norm": 0.03172154352068901, + "learning_rate": 3.395349531022817e-05, + "loss": 0.0308, + "num_input_tokens_seen": 34667120, + "step": 59765 + }, + { + "epoch": 8.902293714626154, + "grad_norm": 0.06139421835541725, + "learning_rate": 3.395046135387888e-05, + "loss": 0.2381, + "num_input_tokens_seen": 34669840, + "step": 59770 + }, + { + "epoch": 8.903038427167113, + "grad_norm": 0.0038436399772763252, + "learning_rate": 3.394742724632064e-05, + "loss": 0.1212, + "num_input_tokens_seen": 34672528, + "step": 59775 + }, + { + "epoch": 8.903783139708073, + "grad_norm": 19.908815383911133, + "learning_rate": 3.3944392987604703e-05, + "loss": 0.1566, + "num_input_tokens_seen": 34675312, + "step": 59780 + }, + { + "epoch": 8.904527852249032, + "grad_norm": 71.73047637939453, + "learning_rate": 3.394135857778235e-05, + "loss": 0.1339, + "num_input_tokens_seen": 34678288, + "step": 59785 + }, + { + "epoch": 8.90527256478999, + "grad_norm": 3.633782148361206, + "learning_rate": 3.3938324016904825e-05, + "loss": 0.1397, + "num_input_tokens_seen": 34681456, + "step": 59790 + }, + { + "epoch": 8.90601727733095, + "grad_norm": 2.0349128246307373, + "learning_rate": 3.3935289305023405e-05, + "loss": 0.3221, + "num_input_tokens_seen": 34684304, + "step": 59795 + }, + { + "epoch": 8.90676198987191, + "grad_norm": 0.3944162130355835, + "learning_rate": 3.393225444218936e-05, + "loss": 0.1191, + "num_input_tokens_seen": 34687056, + "step": 59800 + }, + { + "epoch": 8.907506702412869, + "grad_norm": 9.04712200164795, + "learning_rate": 3.392921942845394e-05, + "loss": 0.1842, + "num_input_tokens_seen": 34689936, + "step": 59805 + }, + { + "epoch": 8.908251414953828, + "grad_norm": 0.110429547727108, + "learning_rate": 3.392618426386843e-05, + "loss": 0.0027, + "num_input_tokens_seen": 34692752, + "step": 59810 + }, + { + "epoch": 8.908996127494786, + "grad_norm": 1.4323996305465698, + "learning_rate": 3.3923148948484115e-05, + "loss": 0.1767, + "num_input_tokens_seen": 34695568, + "step": 59815 + }, + { + "epoch": 8.909740840035747, + "grad_norm": 0.10474056750535965, + "learning_rate": 3.392011348235228e-05, + "loss": 0.1377, + "num_input_tokens_seen": 34698960, + "step": 59820 + }, + { + "epoch": 8.910485552576706, + "grad_norm": 19.115114212036133, + "learning_rate": 3.391707786552418e-05, + "loss": 0.182, + "num_input_tokens_seen": 34701808, + "step": 59825 + }, + { + "epoch": 8.911230265117664, + "grad_norm": 0.21808281540870667, + "learning_rate": 3.391404209805112e-05, + "loss": 0.1358, + "num_input_tokens_seen": 34704752, + "step": 59830 + }, + { + "epoch": 8.911974977658623, + "grad_norm": 82.21502685546875, + "learning_rate": 3.3911006179984375e-05, + "loss": 0.3587, + "num_input_tokens_seen": 34707664, + "step": 59835 + }, + { + "epoch": 8.912719690199584, + "grad_norm": 0.09699084609746933, + "learning_rate": 3.3907970111375237e-05, + "loss": 0.12, + "num_input_tokens_seen": 34710704, + "step": 59840 + }, + { + "epoch": 8.913464402740543, + "grad_norm": 24.752010345458984, + "learning_rate": 3.3904933892275e-05, + "loss": 0.2159, + "num_input_tokens_seen": 34713616, + "step": 59845 + }, + { + "epoch": 8.914209115281501, + "grad_norm": 0.10691813379526138, + "learning_rate": 3.390189752273495e-05, + "loss": 0.0255, + "num_input_tokens_seen": 34716336, + "step": 59850 + }, + { + "epoch": 8.91495382782246, + "grad_norm": 17.09926986694336, + "learning_rate": 3.389886100280639e-05, + "loss": 0.0791, + "num_input_tokens_seen": 34719344, + "step": 59855 + }, + { + "epoch": 8.915698540363419, + "grad_norm": 0.10641808062791824, + "learning_rate": 3.389582433254062e-05, + "loss": 0.04, + "num_input_tokens_seen": 34722288, + "step": 59860 + }, + { + "epoch": 8.91644325290438, + "grad_norm": 34.75326156616211, + "learning_rate": 3.3892787511988936e-05, + "loss": 0.3489, + "num_input_tokens_seen": 34725392, + "step": 59865 + }, + { + "epoch": 8.917187965445338, + "grad_norm": 0.0016091043362393975, + "learning_rate": 3.3889750541202654e-05, + "loss": 0.0004, + "num_input_tokens_seen": 34728112, + "step": 59870 + }, + { + "epoch": 8.917932677986297, + "grad_norm": 5.9693827629089355, + "learning_rate": 3.388671342023306e-05, + "loss": 0.1274, + "num_input_tokens_seen": 34731344, + "step": 59875 + }, + { + "epoch": 8.918677390527257, + "grad_norm": 0.060223665088415146, + "learning_rate": 3.388367614913149e-05, + "loss": 0.0017, + "num_input_tokens_seen": 34734320, + "step": 59880 + }, + { + "epoch": 8.919422103068216, + "grad_norm": 0.02299591340124607, + "learning_rate": 3.388063872794923e-05, + "loss": 0.0601, + "num_input_tokens_seen": 34737168, + "step": 59885 + }, + { + "epoch": 8.920166815609175, + "grad_norm": 11.603372573852539, + "learning_rate": 3.3877601156737604e-05, + "loss": 0.3016, + "num_input_tokens_seen": 34740208, + "step": 59890 + }, + { + "epoch": 8.920911528150134, + "grad_norm": 48.40475845336914, + "learning_rate": 3.3874563435547934e-05, + "loss": 0.1885, + "num_input_tokens_seen": 34743152, + "step": 59895 + }, + { + "epoch": 8.921656240691092, + "grad_norm": 0.02658959850668907, + "learning_rate": 3.387152556443153e-05, + "loss": 0.0676, + "num_input_tokens_seen": 34746128, + "step": 59900 + }, + { + "epoch": 8.922400953232053, + "grad_norm": 0.007959118112921715, + "learning_rate": 3.386848754343972e-05, + "loss": 0.0016, + "num_input_tokens_seen": 34749296, + "step": 59905 + }, + { + "epoch": 8.923145665773012, + "grad_norm": 0.015796920284628868, + "learning_rate": 3.386544937262382e-05, + "loss": 0.1699, + "num_input_tokens_seen": 34752176, + "step": 59910 + }, + { + "epoch": 8.92389037831397, + "grad_norm": 43.44316101074219, + "learning_rate": 3.386241105203517e-05, + "loss": 0.4365, + "num_input_tokens_seen": 34755280, + "step": 59915 + }, + { + "epoch": 8.92463509085493, + "grad_norm": 0.1784876137971878, + "learning_rate": 3.38593725817251e-05, + "loss": 0.2119, + "num_input_tokens_seen": 34758256, + "step": 59920 + }, + { + "epoch": 8.92537980339589, + "grad_norm": 0.0035157129168510437, + "learning_rate": 3.385633396174492e-05, + "loss": 0.2738, + "num_input_tokens_seen": 34761104, + "step": 59925 + }, + { + "epoch": 8.926124515936849, + "grad_norm": 1.014460802078247, + "learning_rate": 3.385329519214599e-05, + "loss": 0.4126, + "num_input_tokens_seen": 34764048, + "step": 59930 + }, + { + "epoch": 8.926869228477807, + "grad_norm": 39.92100143432617, + "learning_rate": 3.385025627297963e-05, + "loss": 0.0675, + "num_input_tokens_seen": 34766896, + "step": 59935 + }, + { + "epoch": 8.927613941018766, + "grad_norm": 0.073447585105896, + "learning_rate": 3.384721720429718e-05, + "loss": 0.1365, + "num_input_tokens_seen": 34769616, + "step": 59940 + }, + { + "epoch": 8.928358653559727, + "grad_norm": 0.08934015780687332, + "learning_rate": 3.384417798614999e-05, + "loss": 0.0082, + "num_input_tokens_seen": 34772848, + "step": 59945 + }, + { + "epoch": 8.929103366100685, + "grad_norm": 0.052696406841278076, + "learning_rate": 3.3841138618589416e-05, + "loss": 0.3128, + "num_input_tokens_seen": 34775760, + "step": 59950 + }, + { + "epoch": 8.929848078641644, + "grad_norm": 0.8258777856826782, + "learning_rate": 3.383809910166678e-05, + "loss": 0.2135, + "num_input_tokens_seen": 34778768, + "step": 59955 + }, + { + "epoch": 8.930592791182603, + "grad_norm": 0.03392631188035011, + "learning_rate": 3.383505943543344e-05, + "loss": 0.004, + "num_input_tokens_seen": 34781872, + "step": 59960 + }, + { + "epoch": 8.931337503723563, + "grad_norm": 33.165382385253906, + "learning_rate": 3.383201961994076e-05, + "loss": 0.2652, + "num_input_tokens_seen": 34784624, + "step": 59965 + }, + { + "epoch": 8.932082216264522, + "grad_norm": 0.07838007062673569, + "learning_rate": 3.382897965524007e-05, + "loss": 0.0017, + "num_input_tokens_seen": 34787760, + "step": 59970 + }, + { + "epoch": 8.932826928805481, + "grad_norm": 0.6407788991928101, + "learning_rate": 3.382593954138276e-05, + "loss": 0.1018, + "num_input_tokens_seen": 34790512, + "step": 59975 + }, + { + "epoch": 8.93357164134644, + "grad_norm": 0.07491957396268845, + "learning_rate": 3.382289927842015e-05, + "loss": 0.0147, + "num_input_tokens_seen": 34793232, + "step": 59980 + }, + { + "epoch": 8.9343163538874, + "grad_norm": 0.03933074325323105, + "learning_rate": 3.381985886640364e-05, + "loss": 0.0114, + "num_input_tokens_seen": 34796336, + "step": 59985 + }, + { + "epoch": 8.935061066428359, + "grad_norm": 0.010716366581618786, + "learning_rate": 3.381681830538458e-05, + "loss": 0.0018, + "num_input_tokens_seen": 34799184, + "step": 59990 + }, + { + "epoch": 8.935805778969318, + "grad_norm": 7.649359226226807, + "learning_rate": 3.3813777595414324e-05, + "loss": 0.2742, + "num_input_tokens_seen": 34802384, + "step": 59995 + }, + { + "epoch": 8.936550491510276, + "grad_norm": 0.024224458262324333, + "learning_rate": 3.3810736736544265e-05, + "loss": 0.0118, + "num_input_tokens_seen": 34805296, + "step": 60000 + }, + { + "epoch": 8.937295204051237, + "grad_norm": 3.441347360610962, + "learning_rate": 3.380769572882576e-05, + "loss": 0.201, + "num_input_tokens_seen": 34808048, + "step": 60005 + }, + { + "epoch": 8.938039916592196, + "grad_norm": 0.06846056878566742, + "learning_rate": 3.380465457231018e-05, + "loss": 0.1919, + "num_input_tokens_seen": 34810864, + "step": 60010 + }, + { + "epoch": 8.938784629133155, + "grad_norm": 0.903172492980957, + "learning_rate": 3.3801613267048916e-05, + "loss": 0.1789, + "num_input_tokens_seen": 34813712, + "step": 60015 + }, + { + "epoch": 8.939529341674113, + "grad_norm": 0.0017343328800052404, + "learning_rate": 3.379857181309334e-05, + "loss": 0.2687, + "num_input_tokens_seen": 34816592, + "step": 60020 + }, + { + "epoch": 8.940274054215074, + "grad_norm": 7.613529682159424, + "learning_rate": 3.379553021049484e-05, + "loss": 0.0023, + "num_input_tokens_seen": 34819344, + "step": 60025 + }, + { + "epoch": 8.941018766756033, + "grad_norm": 0.033117104321718216, + "learning_rate": 3.379248845930479e-05, + "loss": 0.3934, + "num_input_tokens_seen": 34822256, + "step": 60030 + }, + { + "epoch": 8.941763479296991, + "grad_norm": 43.5401611328125, + "learning_rate": 3.378944655957458e-05, + "loss": 0.1989, + "num_input_tokens_seen": 34825040, + "step": 60035 + }, + { + "epoch": 8.94250819183795, + "grad_norm": 152.00584411621094, + "learning_rate": 3.3786404511355616e-05, + "loss": 0.3358, + "num_input_tokens_seen": 34828304, + "step": 60040 + }, + { + "epoch": 8.943252904378909, + "grad_norm": 0.016636308282613754, + "learning_rate": 3.378336231469927e-05, + "loss": 0.0071, + "num_input_tokens_seen": 34831184, + "step": 60045 + }, + { + "epoch": 8.94399761691987, + "grad_norm": 0.08333270996809006, + "learning_rate": 3.378031996965695e-05, + "loss": 0.0007, + "num_input_tokens_seen": 34833840, + "step": 60050 + }, + { + "epoch": 8.944742329460828, + "grad_norm": 0.13794733583927155, + "learning_rate": 3.3777277476280036e-05, + "loss": 0.1253, + "num_input_tokens_seen": 34836592, + "step": 60055 + }, + { + "epoch": 8.945487042001787, + "grad_norm": 0.037100598216056824, + "learning_rate": 3.3774234834619956e-05, + "loss": 0.2326, + "num_input_tokens_seen": 34839440, + "step": 60060 + }, + { + "epoch": 8.946231754542747, + "grad_norm": 16.161781311035156, + "learning_rate": 3.377119204472809e-05, + "loss": 0.4328, + "num_input_tokens_seen": 34842064, + "step": 60065 + }, + { + "epoch": 8.946976467083706, + "grad_norm": 0.03135456144809723, + "learning_rate": 3.376814910665584e-05, + "loss": 0.0866, + "num_input_tokens_seen": 34844688, + "step": 60070 + }, + { + "epoch": 8.947721179624665, + "grad_norm": 0.005196639336645603, + "learning_rate": 3.3765106020454636e-05, + "loss": 0.0925, + "num_input_tokens_seen": 34847696, + "step": 60075 + }, + { + "epoch": 8.948465892165624, + "grad_norm": 0.02566615119576454, + "learning_rate": 3.376206278617587e-05, + "loss": 0.0104, + "num_input_tokens_seen": 34851600, + "step": 60080 + }, + { + "epoch": 8.949210604706582, + "grad_norm": 0.02942618541419506, + "learning_rate": 3.375901940387096e-05, + "loss": 0.1794, + "num_input_tokens_seen": 34854800, + "step": 60085 + }, + { + "epoch": 8.949955317247543, + "grad_norm": 0.03874823823571205, + "learning_rate": 3.375597587359131e-05, + "loss": 0.0021, + "num_input_tokens_seen": 34857680, + "step": 60090 + }, + { + "epoch": 8.950700029788502, + "grad_norm": 0.005318894516676664, + "learning_rate": 3.375293219538836e-05, + "loss": 0.2789, + "num_input_tokens_seen": 34860624, + "step": 60095 + }, + { + "epoch": 8.95144474232946, + "grad_norm": 51.779422760009766, + "learning_rate": 3.374988836931351e-05, + "loss": 0.0069, + "num_input_tokens_seen": 34863408, + "step": 60100 + }, + { + "epoch": 8.95218945487042, + "grad_norm": 0.21740250289440155, + "learning_rate": 3.374684439541819e-05, + "loss": 0.088, + "num_input_tokens_seen": 34866320, + "step": 60105 + }, + { + "epoch": 8.95293416741138, + "grad_norm": 0.021107999607920647, + "learning_rate": 3.374380027375382e-05, + "loss": 0.0106, + "num_input_tokens_seen": 34869072, + "step": 60110 + }, + { + "epoch": 8.953678879952339, + "grad_norm": 8.0030517578125, + "learning_rate": 3.374075600437183e-05, + "loss": 0.3057, + "num_input_tokens_seen": 34872080, + "step": 60115 + }, + { + "epoch": 8.954423592493297, + "grad_norm": 107.75801086425781, + "learning_rate": 3.373771158732366e-05, + "loss": 0.0905, + "num_input_tokens_seen": 34875120, + "step": 60120 + }, + { + "epoch": 8.955168305034256, + "grad_norm": 0.029897356405854225, + "learning_rate": 3.3734667022660724e-05, + "loss": 0.0004, + "num_input_tokens_seen": 34878128, + "step": 60125 + }, + { + "epoch": 8.955913017575217, + "grad_norm": 73.18272399902344, + "learning_rate": 3.373162231043447e-05, + "loss": 0.1169, + "num_input_tokens_seen": 34880752, + "step": 60130 + }, + { + "epoch": 8.956657730116175, + "grad_norm": 6.02034330368042, + "learning_rate": 3.3728577450696336e-05, + "loss": 0.0054, + "num_input_tokens_seen": 34883856, + "step": 60135 + }, + { + "epoch": 8.957402442657134, + "grad_norm": 0.0036385669372975826, + "learning_rate": 3.372553244349775e-05, + "loss": 0.0008, + "num_input_tokens_seen": 34887024, + "step": 60140 + }, + { + "epoch": 8.958147155198093, + "grad_norm": 58.50241470336914, + "learning_rate": 3.372248728889017e-05, + "loss": 0.2366, + "num_input_tokens_seen": 34890032, + "step": 60145 + }, + { + "epoch": 8.958891867739053, + "grad_norm": 0.09048902243375778, + "learning_rate": 3.371944198692502e-05, + "loss": 0.0606, + "num_input_tokens_seen": 34893040, + "step": 60150 + }, + { + "epoch": 8.959636580280012, + "grad_norm": 0.007176087703555822, + "learning_rate": 3.3716396537653774e-05, + "loss": 0.1987, + "num_input_tokens_seen": 34895856, + "step": 60155 + }, + { + "epoch": 8.960381292820971, + "grad_norm": 0.13632076978683472, + "learning_rate": 3.371335094112786e-05, + "loss": 0.0005, + "num_input_tokens_seen": 34898928, + "step": 60160 + }, + { + "epoch": 8.96112600536193, + "grad_norm": 123.16046142578125, + "learning_rate": 3.371030519739874e-05, + "loss": 0.3583, + "num_input_tokens_seen": 34901808, + "step": 60165 + }, + { + "epoch": 8.96187071790289, + "grad_norm": 0.042870067059993744, + "learning_rate": 3.370725930651786e-05, + "loss": 0.0029, + "num_input_tokens_seen": 34904592, + "step": 60170 + }, + { + "epoch": 8.962615430443849, + "grad_norm": 0.020030247047543526, + "learning_rate": 3.370421326853669e-05, + "loss": 0.2428, + "num_input_tokens_seen": 34907536, + "step": 60175 + }, + { + "epoch": 8.963360142984808, + "grad_norm": 0.0025954432785511017, + "learning_rate": 3.370116708350668e-05, + "loss": 0.1893, + "num_input_tokens_seen": 34910640, + "step": 60180 + }, + { + "epoch": 8.964104855525767, + "grad_norm": 41.423465728759766, + "learning_rate": 3.369812075147929e-05, + "loss": 0.0613, + "num_input_tokens_seen": 34913552, + "step": 60185 + }, + { + "epoch": 8.964849568066727, + "grad_norm": 0.05061667039990425, + "learning_rate": 3.369507427250601e-05, + "loss": 0.1624, + "num_input_tokens_seen": 34916464, + "step": 60190 + }, + { + "epoch": 8.965594280607686, + "grad_norm": 122.8521957397461, + "learning_rate": 3.369202764663827e-05, + "loss": 0.5028, + "num_input_tokens_seen": 34919504, + "step": 60195 + }, + { + "epoch": 8.966338993148645, + "grad_norm": 49.067420959472656, + "learning_rate": 3.368898087392756e-05, + "loss": 0.2061, + "num_input_tokens_seen": 34922512, + "step": 60200 + }, + { + "epoch": 8.967083705689603, + "grad_norm": 27.019622802734375, + "learning_rate": 3.3685933954425353e-05, + "loss": 0.3438, + "num_input_tokens_seen": 34925296, + "step": 60205 + }, + { + "epoch": 8.967828418230564, + "grad_norm": 0.008045128546655178, + "learning_rate": 3.368288688818312e-05, + "loss": 0.2601, + "num_input_tokens_seen": 34927952, + "step": 60210 + }, + { + "epoch": 8.968573130771523, + "grad_norm": 0.054601360112428665, + "learning_rate": 3.367983967525234e-05, + "loss": 0.1653, + "num_input_tokens_seen": 34930704, + "step": 60215 + }, + { + "epoch": 8.969317843312481, + "grad_norm": 0.06141229718923569, + "learning_rate": 3.367679231568448e-05, + "loss": 0.0124, + "num_input_tokens_seen": 34933648, + "step": 60220 + }, + { + "epoch": 8.97006255585344, + "grad_norm": 11.98270034790039, + "learning_rate": 3.367374480953104e-05, + "loss": 0.1037, + "num_input_tokens_seen": 34936944, + "step": 60225 + }, + { + "epoch": 8.970807268394399, + "grad_norm": 117.71585845947266, + "learning_rate": 3.367069715684349e-05, + "loss": 0.1942, + "num_input_tokens_seen": 34940016, + "step": 60230 + }, + { + "epoch": 8.97155198093536, + "grad_norm": 0.2640589475631714, + "learning_rate": 3.366764935767333e-05, + "loss": 0.0719, + "num_input_tokens_seen": 34942992, + "step": 60235 + }, + { + "epoch": 8.972296693476318, + "grad_norm": 0.007646047044545412, + "learning_rate": 3.366460141207205e-05, + "loss": 0.0756, + "num_input_tokens_seen": 34946032, + "step": 60240 + }, + { + "epoch": 8.973041406017277, + "grad_norm": 20.27012825012207, + "learning_rate": 3.366155332009113e-05, + "loss": 0.29, + "num_input_tokens_seen": 34949008, + "step": 60245 + }, + { + "epoch": 8.973786118558236, + "grad_norm": 6.203778266906738, + "learning_rate": 3.3658505081782064e-05, + "loss": 0.0565, + "num_input_tokens_seen": 34951696, + "step": 60250 + }, + { + "epoch": 8.974530831099196, + "grad_norm": 0.011805564165115356, + "learning_rate": 3.3655456697196366e-05, + "loss": 0.0268, + "num_input_tokens_seen": 34954672, + "step": 60255 + }, + { + "epoch": 8.975275543640155, + "grad_norm": 0.6985951662063599, + "learning_rate": 3.365240816638552e-05, + "loss": 0.3512, + "num_input_tokens_seen": 34957520, + "step": 60260 + }, + { + "epoch": 8.976020256181114, + "grad_norm": 0.2567092180252075, + "learning_rate": 3.3649359489401025e-05, + "loss": 0.0498, + "num_input_tokens_seen": 34960496, + "step": 60265 + }, + { + "epoch": 8.976764968722073, + "grad_norm": 0.0069394600577652454, + "learning_rate": 3.3646310666294396e-05, + "loss": 0.2092, + "num_input_tokens_seen": 34963120, + "step": 60270 + }, + { + "epoch": 8.977509681263033, + "grad_norm": 0.35937538743019104, + "learning_rate": 3.364326169711713e-05, + "loss": 0.1593, + "num_input_tokens_seen": 34966128, + "step": 60275 + }, + { + "epoch": 8.978254393803992, + "grad_norm": 0.031733911484479904, + "learning_rate": 3.364021258192075e-05, + "loss": 0.2289, + "num_input_tokens_seen": 34969008, + "step": 60280 + }, + { + "epoch": 8.97899910634495, + "grad_norm": 0.0351828895509243, + "learning_rate": 3.363716332075676e-05, + "loss": 0.2424, + "num_input_tokens_seen": 34972304, + "step": 60285 + }, + { + "epoch": 8.97974381888591, + "grad_norm": 3.613781452178955, + "learning_rate": 3.363411391367668e-05, + "loss": 0.024, + "num_input_tokens_seen": 34975536, + "step": 60290 + }, + { + "epoch": 8.98048853142687, + "grad_norm": 0.02874722331762314, + "learning_rate": 3.363106436073202e-05, + "loss": 0.0514, + "num_input_tokens_seen": 34978416, + "step": 60295 + }, + { + "epoch": 8.981233243967829, + "grad_norm": 0.10472816228866577, + "learning_rate": 3.362801466197429e-05, + "loss": 0.0875, + "num_input_tokens_seen": 34981424, + "step": 60300 + }, + { + "epoch": 8.981977956508787, + "grad_norm": 0.005947847384959459, + "learning_rate": 3.362496481745502e-05, + "loss": 0.0038, + "num_input_tokens_seen": 34984272, + "step": 60305 + }, + { + "epoch": 8.982722669049746, + "grad_norm": 31.449337005615234, + "learning_rate": 3.362191482722574e-05, + "loss": 0.1374, + "num_input_tokens_seen": 34987056, + "step": 60310 + }, + { + "epoch": 8.983467381590707, + "grad_norm": 17.74801254272461, + "learning_rate": 3.361886469133798e-05, + "loss": 0.2451, + "num_input_tokens_seen": 34989904, + "step": 60315 + }, + { + "epoch": 8.984212094131665, + "grad_norm": 0.29549792408943176, + "learning_rate": 3.361581440984325e-05, + "loss": 0.1985, + "num_input_tokens_seen": 34992720, + "step": 60320 + }, + { + "epoch": 8.984956806672624, + "grad_norm": 0.09893517941236496, + "learning_rate": 3.3612763982793094e-05, + "loss": 0.164, + "num_input_tokens_seen": 34995600, + "step": 60325 + }, + { + "epoch": 8.985701519213583, + "grad_norm": 0.0034942454658448696, + "learning_rate": 3.360971341023905e-05, + "loss": 0.0017, + "num_input_tokens_seen": 34998672, + "step": 60330 + }, + { + "epoch": 8.986446231754543, + "grad_norm": 5.4595723152160645, + "learning_rate": 3.360666269223264e-05, + "loss": 0.0332, + "num_input_tokens_seen": 35001776, + "step": 60335 + }, + { + "epoch": 8.987190944295502, + "grad_norm": 1.2461169958114624, + "learning_rate": 3.360361182882542e-05, + "loss": 0.4456, + "num_input_tokens_seen": 35004720, + "step": 60340 + }, + { + "epoch": 8.987935656836461, + "grad_norm": 14.11937141418457, + "learning_rate": 3.3600560820068916e-05, + "loss": 0.1198, + "num_input_tokens_seen": 35007760, + "step": 60345 + }, + { + "epoch": 8.98868036937742, + "grad_norm": 0.04311896488070488, + "learning_rate": 3.3597509666014684e-05, + "loss": 0.13, + "num_input_tokens_seen": 35010352, + "step": 60350 + }, + { + "epoch": 8.98942508191838, + "grad_norm": 23.23336410522461, + "learning_rate": 3.359445836671426e-05, + "loss": 0.5949, + "num_input_tokens_seen": 35012912, + "step": 60355 + }, + { + "epoch": 8.990169794459339, + "grad_norm": 0.0314047746360302, + "learning_rate": 3.359140692221919e-05, + "loss": 0.3579, + "num_input_tokens_seen": 35015600, + "step": 60360 + }, + { + "epoch": 8.990914507000298, + "grad_norm": 7.61025857925415, + "learning_rate": 3.3588355332581045e-05, + "loss": 0.0227, + "num_input_tokens_seen": 35018768, + "step": 60365 + }, + { + "epoch": 8.991659219541257, + "grad_norm": 55.9481086730957, + "learning_rate": 3.358530359785136e-05, + "loss": 0.2538, + "num_input_tokens_seen": 35021936, + "step": 60370 + }, + { + "epoch": 8.992403932082215, + "grad_norm": 0.011889154091477394, + "learning_rate": 3.35822517180817e-05, + "loss": 0.1346, + "num_input_tokens_seen": 35024816, + "step": 60375 + }, + { + "epoch": 8.993148644623176, + "grad_norm": 9.406024932861328, + "learning_rate": 3.357919969332361e-05, + "loss": 0.2061, + "num_input_tokens_seen": 35027728, + "step": 60380 + }, + { + "epoch": 8.993893357164135, + "grad_norm": 0.09008385986089706, + "learning_rate": 3.357614752362867e-05, + "loss": 0.0006, + "num_input_tokens_seen": 35030448, + "step": 60385 + }, + { + "epoch": 8.994638069705093, + "grad_norm": 1.292377233505249, + "learning_rate": 3.3573095209048435e-05, + "loss": 0.0962, + "num_input_tokens_seen": 35033552, + "step": 60390 + }, + { + "epoch": 8.995382782246054, + "grad_norm": 0.06309560686349869, + "learning_rate": 3.357004274963446e-05, + "loss": 0.1051, + "num_input_tokens_seen": 35036240, + "step": 60395 + }, + { + "epoch": 8.996127494787013, + "grad_norm": 0.03182396665215492, + "learning_rate": 3.356699014543833e-05, + "loss": 0.1823, + "num_input_tokens_seen": 35039504, + "step": 60400 + }, + { + "epoch": 8.996872207327971, + "grad_norm": 0.10224290937185287, + "learning_rate": 3.3563937396511607e-05, + "loss": 0.0012, + "num_input_tokens_seen": 35042288, + "step": 60405 + }, + { + "epoch": 8.99761691986893, + "grad_norm": 0.012005234137177467, + "learning_rate": 3.3560884502905865e-05, + "loss": 0.1817, + "num_input_tokens_seen": 35044944, + "step": 60410 + }, + { + "epoch": 8.998361632409889, + "grad_norm": 4.470086097717285, + "learning_rate": 3.355783146467268e-05, + "loss": 0.1658, + "num_input_tokens_seen": 35047824, + "step": 60415 + }, + { + "epoch": 8.99910634495085, + "grad_norm": 30.038366317749023, + "learning_rate": 3.355477828186363e-05, + "loss": 0.4777, + "num_input_tokens_seen": 35050704, + "step": 60420 + }, + { + "epoch": 8.999851057491808, + "grad_norm": 0.056563522666692734, + "learning_rate": 3.3551724954530303e-05, + "loss": 0.0754, + "num_input_tokens_seen": 35053648, + "step": 60425 + }, + { + "epoch": 9.0, + "eval_loss": 1.4896568059921265, + "eval_runtime": 51.2152, + "eval_samples_per_second": 58.264, + "eval_steps_per_second": 14.566, + "num_input_tokens_seen": 35053760, + "step": 60426 + }, + { + "epoch": 9.000595770032767, + "grad_norm": 1.3057371377944946, + "learning_rate": 3.3548671482724267e-05, + "loss": 0.002, + "num_input_tokens_seen": 35056000, + "step": 60430 + }, + { + "epoch": 9.001340482573726, + "grad_norm": 0.0588049553334713, + "learning_rate": 3.354561786649711e-05, + "loss": 0.3108, + "num_input_tokens_seen": 35058880, + "step": 60435 + }, + { + "epoch": 9.002085195114686, + "grad_norm": 0.05821533501148224, + "learning_rate": 3.354256410590043e-05, + "loss": 0.067, + "num_input_tokens_seen": 35062048, + "step": 60440 + }, + { + "epoch": 9.002829907655645, + "grad_norm": 0.06585041433572769, + "learning_rate": 3.353951020098582e-05, + "loss": 0.0755, + "num_input_tokens_seen": 35065120, + "step": 60445 + }, + { + "epoch": 9.003574620196604, + "grad_norm": 0.006235796958208084, + "learning_rate": 3.353645615180485e-05, + "loss": 0.0012, + "num_input_tokens_seen": 35067840, + "step": 60450 + }, + { + "epoch": 9.004319332737563, + "grad_norm": 0.026192769408226013, + "learning_rate": 3.3533401958409136e-05, + "loss": 0.1307, + "num_input_tokens_seen": 35070752, + "step": 60455 + }, + { + "epoch": 9.005064045278523, + "grad_norm": 0.13195201754570007, + "learning_rate": 3.3530347620850276e-05, + "loss": 0.0205, + "num_input_tokens_seen": 35073728, + "step": 60460 + }, + { + "epoch": 9.005808757819482, + "grad_norm": 0.08108798414468765, + "learning_rate": 3.3527293139179854e-05, + "loss": 0.0012, + "num_input_tokens_seen": 35076800, + "step": 60465 + }, + { + "epoch": 9.00655347036044, + "grad_norm": 83.28528594970703, + "learning_rate": 3.352423851344948e-05, + "loss": 0.1422, + "num_input_tokens_seen": 35079712, + "step": 60470 + }, + { + "epoch": 9.0072981829014, + "grad_norm": 3.550662040710449, + "learning_rate": 3.352118374371076e-05, + "loss": 0.0328, + "num_input_tokens_seen": 35082528, + "step": 60475 + }, + { + "epoch": 9.00804289544236, + "grad_norm": 0.014908909797668457, + "learning_rate": 3.351812883001531e-05, + "loss": 0.004, + "num_input_tokens_seen": 35085376, + "step": 60480 + }, + { + "epoch": 9.008787607983319, + "grad_norm": 0.48651644587516785, + "learning_rate": 3.3515073772414725e-05, + "loss": 0.0006, + "num_input_tokens_seen": 35088064, + "step": 60485 + }, + { + "epoch": 9.009532320524277, + "grad_norm": 0.11103012412786484, + "learning_rate": 3.351201857096062e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35090688, + "step": 60490 + }, + { + "epoch": 9.010277033065236, + "grad_norm": 1.441458821296692, + "learning_rate": 3.350896322570462e-05, + "loss": 0.0655, + "num_input_tokens_seen": 35093760, + "step": 60495 + }, + { + "epoch": 9.011021745606197, + "grad_norm": 0.09882368892431259, + "learning_rate": 3.350590773669833e-05, + "loss": 0.033, + "num_input_tokens_seen": 35096352, + "step": 60500 + }, + { + "epoch": 9.011766458147155, + "grad_norm": 0.001514960778877139, + "learning_rate": 3.350285210399337e-05, + "loss": 0.1753, + "num_input_tokens_seen": 35099392, + "step": 60505 + }, + { + "epoch": 9.012511170688114, + "grad_norm": 0.017120681703090668, + "learning_rate": 3.3499796327641366e-05, + "loss": 0.0002, + "num_input_tokens_seen": 35102144, + "step": 60510 + }, + { + "epoch": 9.013255883229073, + "grad_norm": 0.010189399123191833, + "learning_rate": 3.349674040769394e-05, + "loss": 0.113, + "num_input_tokens_seen": 35104992, + "step": 60515 + }, + { + "epoch": 9.014000595770034, + "grad_norm": 0.003762404201552272, + "learning_rate": 3.349368434420274e-05, + "loss": 0.0018, + "num_input_tokens_seen": 35108064, + "step": 60520 + }, + { + "epoch": 9.014745308310992, + "grad_norm": 78.91748046875, + "learning_rate": 3.349062813721936e-05, + "loss": 0.0562, + "num_input_tokens_seen": 35110944, + "step": 60525 + }, + { + "epoch": 9.015490020851951, + "grad_norm": 0.06438206136226654, + "learning_rate": 3.348757178679545e-05, + "loss": 0.1058, + "num_input_tokens_seen": 35113824, + "step": 60530 + }, + { + "epoch": 9.01623473339291, + "grad_norm": 0.028811411932110786, + "learning_rate": 3.3484515292982634e-05, + "loss": 0.0004, + "num_input_tokens_seen": 35116640, + "step": 60535 + }, + { + "epoch": 9.01697944593387, + "grad_norm": 0.5800718665122986, + "learning_rate": 3.348145865583256e-05, + "loss": 0.1541, + "num_input_tokens_seen": 35119616, + "step": 60540 + }, + { + "epoch": 9.017724158474829, + "grad_norm": 4.706333637237549, + "learning_rate": 3.347840187539686e-05, + "loss": 0.2477, + "num_input_tokens_seen": 35122272, + "step": 60545 + }, + { + "epoch": 9.018468871015788, + "grad_norm": 0.09630478918552399, + "learning_rate": 3.347534495172718e-05, + "loss": 0.1633, + "num_input_tokens_seen": 35125056, + "step": 60550 + }, + { + "epoch": 9.019213583556747, + "grad_norm": 0.02377817966043949, + "learning_rate": 3.3472287884875167e-05, + "loss": 0.0004, + "num_input_tokens_seen": 35128064, + "step": 60555 + }, + { + "epoch": 9.019958296097707, + "grad_norm": 0.002041410654783249, + "learning_rate": 3.346923067489245e-05, + "loss": 0.0042, + "num_input_tokens_seen": 35131136, + "step": 60560 + }, + { + "epoch": 9.020703008638666, + "grad_norm": 0.2620948553085327, + "learning_rate": 3.3466173321830705e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35133952, + "step": 60565 + }, + { + "epoch": 9.021447721179625, + "grad_norm": 0.026894042268395424, + "learning_rate": 3.346311582574155e-05, + "loss": 0.4052, + "num_input_tokens_seen": 35136992, + "step": 60570 + }, + { + "epoch": 9.022192433720583, + "grad_norm": 0.04356953874230385, + "learning_rate": 3.3460058186676656e-05, + "loss": 0.1505, + "num_input_tokens_seen": 35139712, + "step": 60575 + }, + { + "epoch": 9.022937146261542, + "grad_norm": 115.14259338378906, + "learning_rate": 3.345700040468768e-05, + "loss": 0.1588, + "num_input_tokens_seen": 35142656, + "step": 60580 + }, + { + "epoch": 9.023681858802503, + "grad_norm": 0.05437279865145683, + "learning_rate": 3.345394247982628e-05, + "loss": 0.107, + "num_input_tokens_seen": 35145536, + "step": 60585 + }, + { + "epoch": 9.024426571343461, + "grad_norm": 0.24749653041362762, + "learning_rate": 3.345088441214411e-05, + "loss": 0.0624, + "num_input_tokens_seen": 35148416, + "step": 60590 + }, + { + "epoch": 9.02517128388442, + "grad_norm": 0.13753363490104675, + "learning_rate": 3.344782620169284e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35151168, + "step": 60595 + }, + { + "epoch": 9.025915996425379, + "grad_norm": 0.01573813706636429, + "learning_rate": 3.344476784852413e-05, + "loss": 0.1503, + "num_input_tokens_seen": 35154016, + "step": 60600 + }, + { + "epoch": 9.02666070896634, + "grad_norm": 0.011530070565640926, + "learning_rate": 3.344170935268966e-05, + "loss": 0.0009, + "num_input_tokens_seen": 35156992, + "step": 60605 + }, + { + "epoch": 9.027405421507298, + "grad_norm": 0.004786888137459755, + "learning_rate": 3.3438650714241084e-05, + "loss": 0.1504, + "num_input_tokens_seen": 35160160, + "step": 60610 + }, + { + "epoch": 9.028150134048257, + "grad_norm": 0.06914481520652771, + "learning_rate": 3.3435591933230074e-05, + "loss": 0.0198, + "num_input_tokens_seen": 35162720, + "step": 60615 + }, + { + "epoch": 9.028894846589216, + "grad_norm": 0.006420169956982136, + "learning_rate": 3.343253300970832e-05, + "loss": 0.0455, + "num_input_tokens_seen": 35165568, + "step": 60620 + }, + { + "epoch": 9.029639559130176, + "grad_norm": 13.053417205810547, + "learning_rate": 3.3429473943727486e-05, + "loss": 0.2099, + "num_input_tokens_seen": 35168320, + "step": 60625 + }, + { + "epoch": 9.030384271671135, + "grad_norm": 0.013998482376337051, + "learning_rate": 3.342641473533926e-05, + "loss": 0.0009, + "num_input_tokens_seen": 35171200, + "step": 60630 + }, + { + "epoch": 9.031128984212094, + "grad_norm": 0.025731125846505165, + "learning_rate": 3.3423355384595316e-05, + "loss": 0.2947, + "num_input_tokens_seen": 35174048, + "step": 60635 + }, + { + "epoch": 9.031873696753053, + "grad_norm": 0.0614352785050869, + "learning_rate": 3.342029589154735e-05, + "loss": 0.1415, + "num_input_tokens_seen": 35176960, + "step": 60640 + }, + { + "epoch": 9.032618409294013, + "grad_norm": 40.56733322143555, + "learning_rate": 3.3417236256247044e-05, + "loss": 0.3281, + "num_input_tokens_seen": 35179648, + "step": 60645 + }, + { + "epoch": 9.033363121834972, + "grad_norm": 0.020847788080573082, + "learning_rate": 3.341417647874608e-05, + "loss": 0.256, + "num_input_tokens_seen": 35182784, + "step": 60650 + }, + { + "epoch": 9.03410783437593, + "grad_norm": 0.0020232696551829576, + "learning_rate": 3.341111655909616e-05, + "loss": 0.0013, + "num_input_tokens_seen": 35185856, + "step": 60655 + }, + { + "epoch": 9.03485254691689, + "grad_norm": 0.03927648067474365, + "learning_rate": 3.340805649734898e-05, + "loss": 0.1404, + "num_input_tokens_seen": 35188896, + "step": 60660 + }, + { + "epoch": 9.03559725945785, + "grad_norm": 0.012214094400405884, + "learning_rate": 3.340499629355622e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35191648, + "step": 60665 + }, + { + "epoch": 9.036341971998809, + "grad_norm": 0.021254295483231544, + "learning_rate": 3.34019359477696e-05, + "loss": 0.0557, + "num_input_tokens_seen": 35194816, + "step": 60670 + }, + { + "epoch": 9.037086684539767, + "grad_norm": 0.008323868736624718, + "learning_rate": 3.33988754600408e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35197568, + "step": 60675 + }, + { + "epoch": 9.037831397080726, + "grad_norm": 0.08149518817663193, + "learning_rate": 3.339581483042155e-05, + "loss": 0.453, + "num_input_tokens_seen": 35200480, + "step": 60680 + }, + { + "epoch": 9.038576109621687, + "grad_norm": 0.008399400860071182, + "learning_rate": 3.339275405896353e-05, + "loss": 0.0029, + "num_input_tokens_seen": 35203552, + "step": 60685 + }, + { + "epoch": 9.039320822162646, + "grad_norm": 0.01029303390532732, + "learning_rate": 3.338969314571847e-05, + "loss": 0.0448, + "num_input_tokens_seen": 35206336, + "step": 60690 + }, + { + "epoch": 9.040065534703604, + "grad_norm": 0.01880253665149212, + "learning_rate": 3.338663209073806e-05, + "loss": 0.0907, + "num_input_tokens_seen": 35209280, + "step": 60695 + }, + { + "epoch": 9.040810247244563, + "grad_norm": 0.002182022901251912, + "learning_rate": 3.338357089407403e-05, + "loss": 0.0004, + "num_input_tokens_seen": 35212320, + "step": 60700 + }, + { + "epoch": 9.041554959785524, + "grad_norm": 10.561300277709961, + "learning_rate": 3.338050955577809e-05, + "loss": 0.005, + "num_input_tokens_seen": 35215424, + "step": 60705 + }, + { + "epoch": 9.042299672326482, + "grad_norm": 0.009845450520515442, + "learning_rate": 3.337744807590196e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35218336, + "step": 60710 + }, + { + "epoch": 9.043044384867441, + "grad_norm": 14.546971321105957, + "learning_rate": 3.337438645449735e-05, + "loss": 0.0971, + "num_input_tokens_seen": 35221472, + "step": 60715 + }, + { + "epoch": 9.0437890974084, + "grad_norm": 0.043410561978816986, + "learning_rate": 3.3371324691616004e-05, + "loss": 0.2331, + "num_input_tokens_seen": 35224320, + "step": 60720 + }, + { + "epoch": 9.04453380994936, + "grad_norm": 71.93350219726562, + "learning_rate": 3.3368262787309636e-05, + "loss": 0.1723, + "num_input_tokens_seen": 35227168, + "step": 60725 + }, + { + "epoch": 9.04527852249032, + "grad_norm": 0.0072713918052613735, + "learning_rate": 3.3365200741629973e-05, + "loss": 0.0008, + "num_input_tokens_seen": 35229856, + "step": 60730 + }, + { + "epoch": 9.046023235031278, + "grad_norm": 0.011095628142356873, + "learning_rate": 3.336213855462874e-05, + "loss": 0.1258, + "num_input_tokens_seen": 35232896, + "step": 60735 + }, + { + "epoch": 9.046767947572237, + "grad_norm": 0.015059252269566059, + "learning_rate": 3.3359076226357675e-05, + "loss": 0.2155, + "num_input_tokens_seen": 35235616, + "step": 60740 + }, + { + "epoch": 9.047512660113195, + "grad_norm": 0.06776992976665497, + "learning_rate": 3.335601375686851e-05, + "loss": 0.0006, + "num_input_tokens_seen": 35238240, + "step": 60745 + }, + { + "epoch": 9.048257372654156, + "grad_norm": 44.798946380615234, + "learning_rate": 3.335295114621299e-05, + "loss": 0.1554, + "num_input_tokens_seen": 35241056, + "step": 60750 + }, + { + "epoch": 9.049002085195115, + "grad_norm": 0.043769728392362595, + "learning_rate": 3.334988839444285e-05, + "loss": 0.0471, + "num_input_tokens_seen": 35243808, + "step": 60755 + }, + { + "epoch": 9.049746797736073, + "grad_norm": 0.0049782791174948215, + "learning_rate": 3.3346825501609834e-05, + "loss": 0.0348, + "num_input_tokens_seen": 35246944, + "step": 60760 + }, + { + "epoch": 9.050491510277032, + "grad_norm": 0.02518167719244957, + "learning_rate": 3.3343762467765685e-05, + "loss": 0.0034, + "num_input_tokens_seen": 35250048, + "step": 60765 + }, + { + "epoch": 9.051236222817993, + "grad_norm": 0.1420789659023285, + "learning_rate": 3.334069929296215e-05, + "loss": 0.0006, + "num_input_tokens_seen": 35253376, + "step": 60770 + }, + { + "epoch": 9.051980935358952, + "grad_norm": 45.031551361083984, + "learning_rate": 3.333763597725097e-05, + "loss": 0.0352, + "num_input_tokens_seen": 35256256, + "step": 60775 + }, + { + "epoch": 9.05272564789991, + "grad_norm": 0.008949932642281055, + "learning_rate": 3.333457252068391e-05, + "loss": 0.0002, + "num_input_tokens_seen": 35259264, + "step": 60780 + }, + { + "epoch": 9.053470360440869, + "grad_norm": 0.016124477609992027, + "learning_rate": 3.333150892331271e-05, + "loss": 0.1879, + "num_input_tokens_seen": 35262144, + "step": 60785 + }, + { + "epoch": 9.05421507298183, + "grad_norm": 0.3415646255016327, + "learning_rate": 3.3328445185189145e-05, + "loss": 0.0012, + "num_input_tokens_seen": 35265280, + "step": 60790 + }, + { + "epoch": 9.054959785522788, + "grad_norm": 0.025098562240600586, + "learning_rate": 3.332538130636496e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35268000, + "step": 60795 + }, + { + "epoch": 9.055704498063747, + "grad_norm": 0.0830632671713829, + "learning_rate": 3.3322317286891913e-05, + "loss": 0.0324, + "num_input_tokens_seen": 35270944, + "step": 60800 + }, + { + "epoch": 9.056449210604706, + "grad_norm": 0.01390018779784441, + "learning_rate": 3.331925312682178e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35273760, + "step": 60805 + }, + { + "epoch": 9.057193923145666, + "grad_norm": 34.655731201171875, + "learning_rate": 3.331618882620632e-05, + "loss": 0.0366, + "num_input_tokens_seen": 35276640, + "step": 60810 + }, + { + "epoch": 9.057938635686625, + "grad_norm": 0.018933989107608795, + "learning_rate": 3.3313124385097306e-05, + "loss": 0.001, + "num_input_tokens_seen": 35279488, + "step": 60815 + }, + { + "epoch": 9.058683348227584, + "grad_norm": 0.034298744052648544, + "learning_rate": 3.33100598035465e-05, + "loss": 0.229, + "num_input_tokens_seen": 35282016, + "step": 60820 + }, + { + "epoch": 9.059428060768543, + "grad_norm": 0.0029669145587831736, + "learning_rate": 3.3306995081605686e-05, + "loss": 0.079, + "num_input_tokens_seen": 35285088, + "step": 60825 + }, + { + "epoch": 9.060172773309503, + "grad_norm": 0.08182889223098755, + "learning_rate": 3.3303930219326625e-05, + "loss": 0.011, + "num_input_tokens_seen": 35288128, + "step": 60830 + }, + { + "epoch": 9.060917485850462, + "grad_norm": 0.011065836995840073, + "learning_rate": 3.33008652167611e-05, + "loss": 0.2126, + "num_input_tokens_seen": 35290976, + "step": 60835 + }, + { + "epoch": 9.06166219839142, + "grad_norm": 0.0060925860889256, + "learning_rate": 3.32978000739609e-05, + "loss": 0.0837, + "num_input_tokens_seen": 35293728, + "step": 60840 + }, + { + "epoch": 9.06240691093238, + "grad_norm": 9.10488224029541, + "learning_rate": 3.32947347909778e-05, + "loss": 0.0457, + "num_input_tokens_seen": 35296608, + "step": 60845 + }, + { + "epoch": 9.06315162347334, + "grad_norm": 35.5156135559082, + "learning_rate": 3.329166936786359e-05, + "loss": 0.2292, + "num_input_tokens_seen": 35299488, + "step": 60850 + }, + { + "epoch": 9.063896336014299, + "grad_norm": 0.02698454074561596, + "learning_rate": 3.328860380467005e-05, + "loss": 0.0813, + "num_input_tokens_seen": 35302624, + "step": 60855 + }, + { + "epoch": 9.064641048555258, + "grad_norm": 0.007419394329190254, + "learning_rate": 3.328553810144897e-05, + "loss": 0.1621, + "num_input_tokens_seen": 35305536, + "step": 60860 + }, + { + "epoch": 9.065385761096216, + "grad_norm": 12.646089553833008, + "learning_rate": 3.328247225825215e-05, + "loss": 0.0123, + "num_input_tokens_seen": 35308224, + "step": 60865 + }, + { + "epoch": 9.066130473637177, + "grad_norm": 0.10908599197864532, + "learning_rate": 3.327940627513137e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35310976, + "step": 60870 + }, + { + "epoch": 9.066875186178136, + "grad_norm": 0.003450715448707342, + "learning_rate": 3.327634015213844e-05, + "loss": 0.1841, + "num_input_tokens_seen": 35314016, + "step": 60875 + }, + { + "epoch": 9.067619898719094, + "grad_norm": 37.27259826660156, + "learning_rate": 3.327327388932516e-05, + "loss": 0.1295, + "num_input_tokens_seen": 35317184, + "step": 60880 + }, + { + "epoch": 9.068364611260053, + "grad_norm": 0.040841180831193924, + "learning_rate": 3.327020748674333e-05, + "loss": 0.033, + "num_input_tokens_seen": 35319744, + "step": 60885 + }, + { + "epoch": 9.069109323801014, + "grad_norm": 31.64320945739746, + "learning_rate": 3.326714094444474e-05, + "loss": 0.3756, + "num_input_tokens_seen": 35322400, + "step": 60890 + }, + { + "epoch": 9.069854036341972, + "grad_norm": 0.004025039728730917, + "learning_rate": 3.326407426248121e-05, + "loss": 0.2394, + "num_input_tokens_seen": 35325472, + "step": 60895 + }, + { + "epoch": 9.070598748882931, + "grad_norm": 0.011970501393079758, + "learning_rate": 3.326100744090455e-05, + "loss": 0.0014, + "num_input_tokens_seen": 35328352, + "step": 60900 + }, + { + "epoch": 9.07134346142389, + "grad_norm": 0.04974907636642456, + "learning_rate": 3.3257940479766544e-05, + "loss": 0.0007, + "num_input_tokens_seen": 35331360, + "step": 60905 + }, + { + "epoch": 9.07208817396485, + "grad_norm": 0.07968681305646896, + "learning_rate": 3.3254873379119044e-05, + "loss": 0.0285, + "num_input_tokens_seen": 35334240, + "step": 60910 + }, + { + "epoch": 9.07283288650581, + "grad_norm": 5.110403537750244, + "learning_rate": 3.325180613901385e-05, + "loss": 0.1057, + "num_input_tokens_seen": 35337216, + "step": 60915 + }, + { + "epoch": 9.073577599046768, + "grad_norm": 0.002441137097775936, + "learning_rate": 3.3248738759502775e-05, + "loss": 0.0014, + "num_input_tokens_seen": 35340000, + "step": 60920 + }, + { + "epoch": 9.074322311587727, + "grad_norm": 0.09547221660614014, + "learning_rate": 3.3245671240637635e-05, + "loss": 0.048, + "num_input_tokens_seen": 35343040, + "step": 60925 + }, + { + "epoch": 9.075067024128685, + "grad_norm": 141.2460174560547, + "learning_rate": 3.324260358247028e-05, + "loss": 0.1373, + "num_input_tokens_seen": 35345824, + "step": 60930 + }, + { + "epoch": 9.075811736669646, + "grad_norm": 0.06782472878694534, + "learning_rate": 3.323953578505249e-05, + "loss": 0.0018, + "num_input_tokens_seen": 35348416, + "step": 60935 + }, + { + "epoch": 9.076556449210605, + "grad_norm": 0.11554582417011261, + "learning_rate": 3.323646784843613e-05, + "loss": 0.0829, + "num_input_tokens_seen": 35351488, + "step": 60940 + }, + { + "epoch": 9.077301161751564, + "grad_norm": 0.2397802472114563, + "learning_rate": 3.323339977267301e-05, + "loss": 0.0007, + "num_input_tokens_seen": 35354112, + "step": 60945 + }, + { + "epoch": 9.078045874292522, + "grad_norm": 0.11059494316577911, + "learning_rate": 3.3230331557814975e-05, + "loss": 0.0011, + "num_input_tokens_seen": 35357120, + "step": 60950 + }, + { + "epoch": 9.078790586833483, + "grad_norm": 0.0678129717707634, + "learning_rate": 3.322726320391386e-05, + "loss": 0.0007, + "num_input_tokens_seen": 35360000, + "step": 60955 + }, + { + "epoch": 9.079535299374442, + "grad_norm": 0.01061168871819973, + "learning_rate": 3.322419471102148e-05, + "loss": 0.0002, + "num_input_tokens_seen": 35363008, + "step": 60960 + }, + { + "epoch": 9.0802800119154, + "grad_norm": 0.009476852603256702, + "learning_rate": 3.3221126079189704e-05, + "loss": 0.0336, + "num_input_tokens_seen": 35365984, + "step": 60965 + }, + { + "epoch": 9.081024724456359, + "grad_norm": 40.8291130065918, + "learning_rate": 3.321805730847035e-05, + "loss": 0.2579, + "num_input_tokens_seen": 35369056, + "step": 60970 + }, + { + "epoch": 9.08176943699732, + "grad_norm": 30.932573318481445, + "learning_rate": 3.321498839891527e-05, + "loss": 0.1846, + "num_input_tokens_seen": 35371968, + "step": 60975 + }, + { + "epoch": 9.082514149538278, + "grad_norm": 1.077776551246643, + "learning_rate": 3.321191935057631e-05, + "loss": 0.3228, + "num_input_tokens_seen": 35374816, + "step": 60980 + }, + { + "epoch": 9.083258862079237, + "grad_norm": 0.02333880215883255, + "learning_rate": 3.3208850163505314e-05, + "loss": 0.0006, + "num_input_tokens_seen": 35378048, + "step": 60985 + }, + { + "epoch": 9.084003574620196, + "grad_norm": 0.013092984445393085, + "learning_rate": 3.3205780837754154e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35380576, + "step": 60990 + }, + { + "epoch": 9.084748287161156, + "grad_norm": 40.06123733520508, + "learning_rate": 3.3202711373374654e-05, + "loss": 0.011, + "num_input_tokens_seen": 35383488, + "step": 60995 + }, + { + "epoch": 9.085492999702115, + "grad_norm": 0.002351297764107585, + "learning_rate": 3.319964177041868e-05, + "loss": 0.1978, + "num_input_tokens_seen": 35386528, + "step": 61000 + }, + { + "epoch": 9.086237712243074, + "grad_norm": 0.007149823009967804, + "learning_rate": 3.31965720289381e-05, + "loss": 0.1289, + "num_input_tokens_seen": 35389216, + "step": 61005 + }, + { + "epoch": 9.086982424784033, + "grad_norm": 0.0304911769926548, + "learning_rate": 3.319350214898476e-05, + "loss": 0.0985, + "num_input_tokens_seen": 35391808, + "step": 61010 + }, + { + "epoch": 9.087727137324993, + "grad_norm": 0.3398071825504303, + "learning_rate": 3.319043213061053e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35394720, + "step": 61015 + }, + { + "epoch": 9.088471849865952, + "grad_norm": 0.15087372064590454, + "learning_rate": 3.318736197386728e-05, + "loss": 0.0249, + "num_input_tokens_seen": 35397664, + "step": 61020 + }, + { + "epoch": 9.08921656240691, + "grad_norm": 52.046600341796875, + "learning_rate": 3.3184291678806866e-05, + "loss": 0.1358, + "num_input_tokens_seen": 35400448, + "step": 61025 + }, + { + "epoch": 9.08996127494787, + "grad_norm": 0.1102445051074028, + "learning_rate": 3.3181221245481164e-05, + "loss": 0.0006, + "num_input_tokens_seen": 35403232, + "step": 61030 + }, + { + "epoch": 9.09070598748883, + "grad_norm": 0.23306934535503387, + "learning_rate": 3.317815067394204e-05, + "loss": 0.0712, + "num_input_tokens_seen": 35406400, + "step": 61035 + }, + { + "epoch": 9.091450700029789, + "grad_norm": 0.011967552825808525, + "learning_rate": 3.317507996424137e-05, + "loss": 0.0241, + "num_input_tokens_seen": 35409504, + "step": 61040 + }, + { + "epoch": 9.092195412570748, + "grad_norm": 0.023039227351546288, + "learning_rate": 3.317200911643103e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35412608, + "step": 61045 + }, + { + "epoch": 9.092940125111706, + "grad_norm": 8.45799446105957, + "learning_rate": 3.316893813056292e-05, + "loss": 0.1659, + "num_input_tokens_seen": 35415776, + "step": 61050 + }, + { + "epoch": 9.093684837652667, + "grad_norm": 14.311676025390625, + "learning_rate": 3.3165867006688894e-05, + "loss": 0.1534, + "num_input_tokens_seen": 35418368, + "step": 61055 + }, + { + "epoch": 9.094429550193626, + "grad_norm": 0.004167626146227121, + "learning_rate": 3.3162795744860845e-05, + "loss": 0.5247, + "num_input_tokens_seen": 35421152, + "step": 61060 + }, + { + "epoch": 9.095174262734584, + "grad_norm": 0.0049347844906151295, + "learning_rate": 3.315972434513065e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35424224, + "step": 61065 + }, + { + "epoch": 9.095918975275543, + "grad_norm": 0.1436084657907486, + "learning_rate": 3.315665280755021e-05, + "loss": 0.0004, + "num_input_tokens_seen": 35427040, + "step": 61070 + }, + { + "epoch": 9.096663687816504, + "grad_norm": 0.775601327419281, + "learning_rate": 3.315358113217141e-05, + "loss": 0.1601, + "num_input_tokens_seen": 35429856, + "step": 61075 + }, + { + "epoch": 9.097408400357462, + "grad_norm": 42.18739318847656, + "learning_rate": 3.315050931904614e-05, + "loss": 0.0272, + "num_input_tokens_seen": 35432512, + "step": 61080 + }, + { + "epoch": 9.098153112898421, + "grad_norm": 0.6038394570350647, + "learning_rate": 3.314743736822631e-05, + "loss": 0.0841, + "num_input_tokens_seen": 35435552, + "step": 61085 + }, + { + "epoch": 9.09889782543938, + "grad_norm": 0.044763658195734024, + "learning_rate": 3.314436527976381e-05, + "loss": 0.0036, + "num_input_tokens_seen": 35438496, + "step": 61090 + }, + { + "epoch": 9.099642537980339, + "grad_norm": 0.015478221699595451, + "learning_rate": 3.314129305371052e-05, + "loss": 0.0025, + "num_input_tokens_seen": 35441312, + "step": 61095 + }, + { + "epoch": 9.1003872505213, + "grad_norm": 0.3008750379085541, + "learning_rate": 3.313822069011837e-05, + "loss": 0.2579, + "num_input_tokens_seen": 35444160, + "step": 61100 + }, + { + "epoch": 9.101131963062258, + "grad_norm": 0.005412362515926361, + "learning_rate": 3.313514818903924e-05, + "loss": 0.0037, + "num_input_tokens_seen": 35446976, + "step": 61105 + }, + { + "epoch": 9.101876675603217, + "grad_norm": 0.007085916120558977, + "learning_rate": 3.313207555052505e-05, + "loss": 0.1721, + "num_input_tokens_seen": 35449952, + "step": 61110 + }, + { + "epoch": 9.102621388144176, + "grad_norm": 0.014982448890805244, + "learning_rate": 3.3129002774627723e-05, + "loss": 0.1298, + "num_input_tokens_seen": 35452576, + "step": 61115 + }, + { + "epoch": 9.103366100685136, + "grad_norm": 0.0367426872253418, + "learning_rate": 3.3125929861399155e-05, + "loss": 0.0014, + "num_input_tokens_seen": 35455584, + "step": 61120 + }, + { + "epoch": 9.104110813226095, + "grad_norm": 14.83867359161377, + "learning_rate": 3.3122856810891245e-05, + "loss": 0.2523, + "num_input_tokens_seen": 35458464, + "step": 61125 + }, + { + "epoch": 9.104855525767054, + "grad_norm": 0.009357217699289322, + "learning_rate": 3.311978362315594e-05, + "loss": 0.0035, + "num_input_tokens_seen": 35461152, + "step": 61130 + }, + { + "epoch": 9.105600238308012, + "grad_norm": 0.02229861542582512, + "learning_rate": 3.3116710298245134e-05, + "loss": 0.0917, + "num_input_tokens_seen": 35464288, + "step": 61135 + }, + { + "epoch": 9.106344950848973, + "grad_norm": 0.09301180392503738, + "learning_rate": 3.311363683621076e-05, + "loss": 0.0013, + "num_input_tokens_seen": 35467040, + "step": 61140 + }, + { + "epoch": 9.107089663389932, + "grad_norm": 5.358892917633057, + "learning_rate": 3.311056323710474e-05, + "loss": 0.0028, + "num_input_tokens_seen": 35469792, + "step": 61145 + }, + { + "epoch": 9.10783437593089, + "grad_norm": 0.5223279595375061, + "learning_rate": 3.3107489500978996e-05, + "loss": 0.1295, + "num_input_tokens_seen": 35472768, + "step": 61150 + }, + { + "epoch": 9.10857908847185, + "grad_norm": 0.007159221451729536, + "learning_rate": 3.310441562788546e-05, + "loss": 0.0681, + "num_input_tokens_seen": 35475296, + "step": 61155 + }, + { + "epoch": 9.10932380101281, + "grad_norm": 0.029083628207445145, + "learning_rate": 3.310134161787605e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35478048, + "step": 61160 + }, + { + "epoch": 9.110068513553768, + "grad_norm": 0.0015116699505597353, + "learning_rate": 3.309826747100272e-05, + "loss": 0.1722, + "num_input_tokens_seen": 35481184, + "step": 61165 + }, + { + "epoch": 9.110813226094727, + "grad_norm": 72.03714752197266, + "learning_rate": 3.309519318731739e-05, + "loss": 0.2328, + "num_input_tokens_seen": 35484320, + "step": 61170 + }, + { + "epoch": 9.111557938635686, + "grad_norm": 0.04968002438545227, + "learning_rate": 3.309211876687199e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35487136, + "step": 61175 + }, + { + "epoch": 9.112302651176647, + "grad_norm": 2.364762306213379, + "learning_rate": 3.308904420971847e-05, + "loss": 0.001, + "num_input_tokens_seen": 35489664, + "step": 61180 + }, + { + "epoch": 9.113047363717605, + "grad_norm": 35.212345123291016, + "learning_rate": 3.308596951590877e-05, + "loss": 0.206, + "num_input_tokens_seen": 35492672, + "step": 61185 + }, + { + "epoch": 9.113792076258564, + "grad_norm": 15.542600631713867, + "learning_rate": 3.308289468549484e-05, + "loss": 0.0626, + "num_input_tokens_seen": 35495488, + "step": 61190 + }, + { + "epoch": 9.114536788799523, + "grad_norm": 50.17867660522461, + "learning_rate": 3.30798197185286e-05, + "loss": 0.1582, + "num_input_tokens_seen": 35498336, + "step": 61195 + }, + { + "epoch": 9.115281501340483, + "grad_norm": 17.338623046875, + "learning_rate": 3.307674461506204e-05, + "loss": 0.0026, + "num_input_tokens_seen": 35501088, + "step": 61200 + }, + { + "epoch": 9.116026213881442, + "grad_norm": 33.17319869995117, + "learning_rate": 3.3073669375147074e-05, + "loss": 0.2903, + "num_input_tokens_seen": 35504096, + "step": 61205 + }, + { + "epoch": 9.1167709264224, + "grad_norm": 0.26731568574905396, + "learning_rate": 3.307059399883568e-05, + "loss": 0.0038, + "num_input_tokens_seen": 35506848, + "step": 61210 + }, + { + "epoch": 9.11751563896336, + "grad_norm": 0.00546587398275733, + "learning_rate": 3.30675184861798e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35509696, + "step": 61215 + }, + { + "epoch": 9.11826035150432, + "grad_norm": 0.003576801624149084, + "learning_rate": 3.30644428372314e-05, + "loss": 0.2896, + "num_input_tokens_seen": 35512704, + "step": 61220 + }, + { + "epoch": 9.119005064045279, + "grad_norm": 0.008743157610297203, + "learning_rate": 3.306136705204242e-05, + "loss": 0.1108, + "num_input_tokens_seen": 35515488, + "step": 61225 + }, + { + "epoch": 9.119749776586238, + "grad_norm": 7.047196388244629, + "learning_rate": 3.3058291130664844e-05, + "loss": 0.0147, + "num_input_tokens_seen": 35518240, + "step": 61230 + }, + { + "epoch": 9.120494489127196, + "grad_norm": 25.465044021606445, + "learning_rate": 3.305521507315063e-05, + "loss": 0.2502, + "num_input_tokens_seen": 35521568, + "step": 61235 + }, + { + "epoch": 9.121239201668157, + "grad_norm": 15.141324043273926, + "learning_rate": 3.305213887955174e-05, + "loss": 0.1377, + "num_input_tokens_seen": 35524416, + "step": 61240 + }, + { + "epoch": 9.121983914209116, + "grad_norm": 0.18293823301792145, + "learning_rate": 3.3049062549920154e-05, + "loss": 0.0508, + "num_input_tokens_seen": 35527168, + "step": 61245 + }, + { + "epoch": 9.122728626750074, + "grad_norm": 0.0033298712223768234, + "learning_rate": 3.3045986084307835e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35530496, + "step": 61250 + }, + { + "epoch": 9.123473339291033, + "grad_norm": 0.007599640637636185, + "learning_rate": 3.304290948276677e-05, + "loss": 0.0001, + "num_input_tokens_seen": 35533280, + "step": 61255 + }, + { + "epoch": 9.124218051831992, + "grad_norm": 0.00491346837952733, + "learning_rate": 3.30398327453489e-05, + "loss": 0.0103, + "num_input_tokens_seen": 35536416, + "step": 61260 + }, + { + "epoch": 9.124962764372953, + "grad_norm": 144.74322509765625, + "learning_rate": 3.303675587210624e-05, + "loss": 0.2425, + "num_input_tokens_seen": 35539552, + "step": 61265 + }, + { + "epoch": 9.125707476913911, + "grad_norm": 0.021762002259492874, + "learning_rate": 3.3033678863090756e-05, + "loss": 0.0002, + "num_input_tokens_seen": 35542432, + "step": 61270 + }, + { + "epoch": 9.12645218945487, + "grad_norm": 1.0238621234893799, + "learning_rate": 3.303060171835444e-05, + "loss": 0.1569, + "num_input_tokens_seen": 35545280, + "step": 61275 + }, + { + "epoch": 9.127196901995829, + "grad_norm": 17.51356315612793, + "learning_rate": 3.302752443794925e-05, + "loss": 0.0628, + "num_input_tokens_seen": 35548000, + "step": 61280 + }, + { + "epoch": 9.12794161453679, + "grad_norm": 28.517749786376953, + "learning_rate": 3.302444702192722e-05, + "loss": 0.2376, + "num_input_tokens_seen": 35550976, + "step": 61285 + }, + { + "epoch": 9.128686327077748, + "grad_norm": 0.0033739181235432625, + "learning_rate": 3.30213694703403e-05, + "loss": 0.0377, + "num_input_tokens_seen": 35553632, + "step": 61290 + }, + { + "epoch": 9.129431039618707, + "grad_norm": 0.037504661828279495, + "learning_rate": 3.3018291783240495e-05, + "loss": 0.0061, + "num_input_tokens_seen": 35556608, + "step": 61295 + }, + { + "epoch": 9.130175752159666, + "grad_norm": 0.004969241563230753, + "learning_rate": 3.3015213960679796e-05, + "loss": 0.163, + "num_input_tokens_seen": 35559392, + "step": 61300 + }, + { + "epoch": 9.130920464700626, + "grad_norm": 73.15498352050781, + "learning_rate": 3.301213600271021e-05, + "loss": 0.146, + "num_input_tokens_seen": 35562432, + "step": 61305 + }, + { + "epoch": 9.131665177241585, + "grad_norm": 0.006561741698533297, + "learning_rate": 3.3009057909383725e-05, + "loss": 0.0002, + "num_input_tokens_seen": 35564960, + "step": 61310 + }, + { + "epoch": 9.132409889782544, + "grad_norm": 9.381169319152832, + "learning_rate": 3.300597968075235e-05, + "loss": 0.278, + "num_input_tokens_seen": 35568512, + "step": 61315 + }, + { + "epoch": 9.133154602323502, + "grad_norm": 0.07026798278093338, + "learning_rate": 3.3002901316868085e-05, + "loss": 0.0004, + "num_input_tokens_seen": 35571296, + "step": 61320 + }, + { + "epoch": 9.133899314864463, + "grad_norm": 0.01684996485710144, + "learning_rate": 3.299982281778293e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35574208, + "step": 61325 + }, + { + "epoch": 9.134644027405422, + "grad_norm": 0.0034906184300780296, + "learning_rate": 3.2996744183548905e-05, + "loss": 0.2858, + "num_input_tokens_seen": 35577184, + "step": 61330 + }, + { + "epoch": 9.13538873994638, + "grad_norm": 0.05093887820839882, + "learning_rate": 3.2993665414218024e-05, + "loss": 0.1399, + "num_input_tokens_seen": 35580096, + "step": 61335 + }, + { + "epoch": 9.13613345248734, + "grad_norm": 1.0931668281555176, + "learning_rate": 3.2990586509842274e-05, + "loss": 0.0131, + "num_input_tokens_seen": 35583168, + "step": 61340 + }, + { + "epoch": 9.1368781650283, + "grad_norm": 0.014895061030983925, + "learning_rate": 3.298750747047369e-05, + "loss": 0.0034, + "num_input_tokens_seen": 35586112, + "step": 61345 + }, + { + "epoch": 9.137622877569259, + "grad_norm": 0.003917212598025799, + "learning_rate": 3.2984428296164296e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35589056, + "step": 61350 + }, + { + "epoch": 9.138367590110217, + "grad_norm": 0.0393076129257679, + "learning_rate": 3.298134898696609e-05, + "loss": 0.2004, + "num_input_tokens_seen": 35591904, + "step": 61355 + }, + { + "epoch": 9.139112302651176, + "grad_norm": 0.3399752080440521, + "learning_rate": 3.297826954293111e-05, + "loss": 0.0023, + "num_input_tokens_seen": 35594912, + "step": 61360 + }, + { + "epoch": 9.139857015192137, + "grad_norm": 91.95658874511719, + "learning_rate": 3.2975189964111365e-05, + "loss": 0.1173, + "num_input_tokens_seen": 35597696, + "step": 61365 + }, + { + "epoch": 9.140601727733095, + "grad_norm": 0.019076967611908913, + "learning_rate": 3.2972110250558895e-05, + "loss": 0.0011, + "num_input_tokens_seen": 35600480, + "step": 61370 + }, + { + "epoch": 9.141346440274054, + "grad_norm": 0.021414147689938545, + "learning_rate": 3.296903040232573e-05, + "loss": 0.0058, + "num_input_tokens_seen": 35603360, + "step": 61375 + }, + { + "epoch": 9.142091152815013, + "grad_norm": 0.6702988743782043, + "learning_rate": 3.29659504194639e-05, + "loss": 0.1291, + "num_input_tokens_seen": 35606432, + "step": 61380 + }, + { + "epoch": 9.142835865355973, + "grad_norm": 0.03399784490466118, + "learning_rate": 3.296287030202543e-05, + "loss": 0.1, + "num_input_tokens_seen": 35609664, + "step": 61385 + }, + { + "epoch": 9.143580577896932, + "grad_norm": 0.022526293992996216, + "learning_rate": 3.295979005006235e-05, + "loss": 0.0027, + "num_input_tokens_seen": 35612256, + "step": 61390 + }, + { + "epoch": 9.14432529043789, + "grad_norm": 0.01749569922685623, + "learning_rate": 3.295670966362672e-05, + "loss": 0.2586, + "num_input_tokens_seen": 35614912, + "step": 61395 + }, + { + "epoch": 9.14507000297885, + "grad_norm": 0.004714867565780878, + "learning_rate": 3.2953629142770556e-05, + "loss": 0.2752, + "num_input_tokens_seen": 35617728, + "step": 61400 + }, + { + "epoch": 9.14581471551981, + "grad_norm": 1.2560441493988037, + "learning_rate": 3.295054848754591e-05, + "loss": 0.057, + "num_input_tokens_seen": 35620480, + "step": 61405 + }, + { + "epoch": 9.146559428060769, + "grad_norm": 0.013194468803703785, + "learning_rate": 3.294746769800484e-05, + "loss": 0.159, + "num_input_tokens_seen": 35623296, + "step": 61410 + }, + { + "epoch": 9.147304140601728, + "grad_norm": 0.11200600117444992, + "learning_rate": 3.2944386774199373e-05, + "loss": 0.1184, + "num_input_tokens_seen": 35626432, + "step": 61415 + }, + { + "epoch": 9.148048853142686, + "grad_norm": 0.026784980669617653, + "learning_rate": 3.294130571618157e-05, + "loss": 0.1706, + "num_input_tokens_seen": 35629120, + "step": 61420 + }, + { + "epoch": 9.148793565683647, + "grad_norm": 1.381993055343628, + "learning_rate": 3.2938224524003483e-05, + "loss": 0.2658, + "num_input_tokens_seen": 35632032, + "step": 61425 + }, + { + "epoch": 9.149538278224606, + "grad_norm": 0.012610144913196564, + "learning_rate": 3.293514319771715e-05, + "loss": 0.005, + "num_input_tokens_seen": 35634976, + "step": 61430 + }, + { + "epoch": 9.150282990765565, + "grad_norm": 132.1767578125, + "learning_rate": 3.2932061737374635e-05, + "loss": 0.0292, + "num_input_tokens_seen": 35637632, + "step": 61435 + }, + { + "epoch": 9.151027703306523, + "grad_norm": 0.016523055732250214, + "learning_rate": 3.292898014302801e-05, + "loss": 0.098, + "num_input_tokens_seen": 35640480, + "step": 61440 + }, + { + "epoch": 9.151772415847482, + "grad_norm": 0.011980559676885605, + "learning_rate": 3.292589841472932e-05, + "loss": 0.0023, + "num_input_tokens_seen": 35643392, + "step": 61445 + }, + { + "epoch": 9.152517128388443, + "grad_norm": 0.020833982154726982, + "learning_rate": 3.292281655253063e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35646304, + "step": 61450 + }, + { + "epoch": 9.153261840929401, + "grad_norm": 0.005474559031426907, + "learning_rate": 3.291973455648401e-05, + "loss": 0.0017, + "num_input_tokens_seen": 35649440, + "step": 61455 + }, + { + "epoch": 9.15400655347036, + "grad_norm": 0.023980122059583664, + "learning_rate": 3.291665242664152e-05, + "loss": 0.174, + "num_input_tokens_seen": 35652064, + "step": 61460 + }, + { + "epoch": 9.154751266011319, + "grad_norm": 0.35729753971099854, + "learning_rate": 3.291357016305523e-05, + "loss": 0.0019, + "num_input_tokens_seen": 35654848, + "step": 61465 + }, + { + "epoch": 9.15549597855228, + "grad_norm": 26.357494354248047, + "learning_rate": 3.291048776577722e-05, + "loss": 0.2473, + "num_input_tokens_seen": 35657664, + "step": 61470 + }, + { + "epoch": 9.156240691093238, + "grad_norm": 0.03926990553736687, + "learning_rate": 3.290740523485956e-05, + "loss": 0.0009, + "num_input_tokens_seen": 35660864, + "step": 61475 + }, + { + "epoch": 9.156985403634197, + "grad_norm": 0.02469916269183159, + "learning_rate": 3.290432257035432e-05, + "loss": 0.1915, + "num_input_tokens_seen": 35663552, + "step": 61480 + }, + { + "epoch": 9.157730116175156, + "grad_norm": 0.02257157675921917, + "learning_rate": 3.29012397723136e-05, + "loss": 0.0186, + "num_input_tokens_seen": 35666688, + "step": 61485 + }, + { + "epoch": 9.158474828716116, + "grad_norm": 0.06573929637670517, + "learning_rate": 3.289815684078944e-05, + "loss": 0.127, + "num_input_tokens_seen": 35669568, + "step": 61490 + }, + { + "epoch": 9.159219541257075, + "grad_norm": 0.032686807215213776, + "learning_rate": 3.2895073775833976e-05, + "loss": 0.0284, + "num_input_tokens_seen": 35672768, + "step": 61495 + }, + { + "epoch": 9.159964253798034, + "grad_norm": 0.006819278467446566, + "learning_rate": 3.2891990577499246e-05, + "loss": 0.0006, + "num_input_tokens_seen": 35675968, + "step": 61500 + }, + { + "epoch": 9.160708966338992, + "grad_norm": 0.216340571641922, + "learning_rate": 3.2888907245837356e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35678688, + "step": 61505 + }, + { + "epoch": 9.161453678879953, + "grad_norm": 6.655673980712891, + "learning_rate": 3.2885823780900395e-05, + "loss": 0.01, + "num_input_tokens_seen": 35681536, + "step": 61510 + }, + { + "epoch": 9.162198391420912, + "grad_norm": 0.08265086263418198, + "learning_rate": 3.2882740182740466e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35684352, + "step": 61515 + }, + { + "epoch": 9.16294310396187, + "grad_norm": 0.051859691739082336, + "learning_rate": 3.2879656451409644e-05, + "loss": 0.1946, + "num_input_tokens_seen": 35687488, + "step": 61520 + }, + { + "epoch": 9.16368781650283, + "grad_norm": 0.008780024014413357, + "learning_rate": 3.287657258696004e-05, + "loss": 0.0209, + "num_input_tokens_seen": 35690304, + "step": 61525 + }, + { + "epoch": 9.16443252904379, + "grad_norm": 0.030654534697532654, + "learning_rate": 3.2873488589443747e-05, + "loss": 0.1241, + "num_input_tokens_seen": 35693472, + "step": 61530 + }, + { + "epoch": 9.165177241584749, + "grad_norm": 0.07099030166864395, + "learning_rate": 3.287040445891286e-05, + "loss": 0.1631, + "num_input_tokens_seen": 35696448, + "step": 61535 + }, + { + "epoch": 9.165921954125707, + "grad_norm": 0.1850900501012802, + "learning_rate": 3.28673201954195e-05, + "loss": 0.0382, + "num_input_tokens_seen": 35699264, + "step": 61540 + }, + { + "epoch": 9.166666666666666, + "grad_norm": 0.00446931691840291, + "learning_rate": 3.286423579901575e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35702048, + "step": 61545 + }, + { + "epoch": 9.167411379207627, + "grad_norm": 0.18868495523929596, + "learning_rate": 3.2861151269753745e-05, + "loss": 0.2229, + "num_input_tokens_seen": 35704928, + "step": 61550 + }, + { + "epoch": 9.168156091748585, + "grad_norm": 0.003471870208159089, + "learning_rate": 3.285806660768556e-05, + "loss": 0.0009, + "num_input_tokens_seen": 35707616, + "step": 61555 + }, + { + "epoch": 9.168900804289544, + "grad_norm": 0.00412776879966259, + "learning_rate": 3.285498181286334e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35710528, + "step": 61560 + }, + { + "epoch": 9.169645516830503, + "grad_norm": 0.029827028512954712, + "learning_rate": 3.285189688533917e-05, + "loss": 0.0037, + "num_input_tokens_seen": 35713216, + "step": 61565 + }, + { + "epoch": 9.170390229371463, + "grad_norm": 0.19457045197486877, + "learning_rate": 3.284881182516519e-05, + "loss": 0.1449, + "num_input_tokens_seen": 35716160, + "step": 61570 + }, + { + "epoch": 9.171134941912422, + "grad_norm": 0.006616971921175718, + "learning_rate": 3.2845726632393525e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35718944, + "step": 61575 + }, + { + "epoch": 9.171879654453381, + "grad_norm": 0.0011503578862175345, + "learning_rate": 3.284264130707627e-05, + "loss": 0.0302, + "num_input_tokens_seen": 35721792, + "step": 61580 + }, + { + "epoch": 9.17262436699434, + "grad_norm": 6.544009208679199, + "learning_rate": 3.283955584926557e-05, + "loss": 0.0034, + "num_input_tokens_seen": 35724608, + "step": 61585 + }, + { + "epoch": 9.1733690795353, + "grad_norm": 29.827781677246094, + "learning_rate": 3.283647025901353e-05, + "loss": 0.1912, + "num_input_tokens_seen": 35727712, + "step": 61590 + }, + { + "epoch": 9.174113792076259, + "grad_norm": 30.27085304260254, + "learning_rate": 3.283338453637229e-05, + "loss": 0.1715, + "num_input_tokens_seen": 35731072, + "step": 61595 + }, + { + "epoch": 9.174858504617218, + "grad_norm": 0.10458533465862274, + "learning_rate": 3.2830298681393985e-05, + "loss": 0.001, + "num_input_tokens_seen": 35733824, + "step": 61600 + }, + { + "epoch": 9.175603217158177, + "grad_norm": 0.08210945129394531, + "learning_rate": 3.2827212694130736e-05, + "loss": 0.099, + "num_input_tokens_seen": 35736704, + "step": 61605 + }, + { + "epoch": 9.176347929699135, + "grad_norm": 0.03173172101378441, + "learning_rate": 3.282412657463469e-05, + "loss": 0.0002, + "num_input_tokens_seen": 35739648, + "step": 61610 + }, + { + "epoch": 9.177092642240096, + "grad_norm": 0.006120791658759117, + "learning_rate": 3.282104032295798e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35742560, + "step": 61615 + }, + { + "epoch": 9.177837354781055, + "grad_norm": 0.015638407319784164, + "learning_rate": 3.281795393915275e-05, + "loss": 0.0016, + "num_input_tokens_seen": 35745440, + "step": 61620 + }, + { + "epoch": 9.178582067322013, + "grad_norm": 0.010867830365896225, + "learning_rate": 3.281486742327112e-05, + "loss": 0.1415, + "num_input_tokens_seen": 35748096, + "step": 61625 + }, + { + "epoch": 9.179326779862972, + "grad_norm": 0.004296315833926201, + "learning_rate": 3.281178077536525e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35751296, + "step": 61630 + }, + { + "epoch": 9.180071492403933, + "grad_norm": 0.0018731140298768878, + "learning_rate": 3.280869399548728e-05, + "loss": 0.1146, + "num_input_tokens_seen": 35754336, + "step": 61635 + }, + { + "epoch": 9.180816204944891, + "grad_norm": 10.057400703430176, + "learning_rate": 3.280560708368936e-05, + "loss": 0.4926, + "num_input_tokens_seen": 35757536, + "step": 61640 + }, + { + "epoch": 9.18156091748585, + "grad_norm": 0.09642976522445679, + "learning_rate": 3.2802520040023646e-05, + "loss": 0.3484, + "num_input_tokens_seen": 35760416, + "step": 61645 + }, + { + "epoch": 9.182305630026809, + "grad_norm": 0.010151605121791363, + "learning_rate": 3.279943286454229e-05, + "loss": 0.0268, + "num_input_tokens_seen": 35763424, + "step": 61650 + }, + { + "epoch": 9.18305034256777, + "grad_norm": 54.83135223388672, + "learning_rate": 3.2796345557297446e-05, + "loss": 0.0532, + "num_input_tokens_seen": 35766176, + "step": 61655 + }, + { + "epoch": 9.183795055108728, + "grad_norm": 0.014699055813252926, + "learning_rate": 3.2793258118341265e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35768896, + "step": 61660 + }, + { + "epoch": 9.184539767649687, + "grad_norm": 92.92189025878906, + "learning_rate": 3.2790170547725894e-05, + "loss": 0.0152, + "num_input_tokens_seen": 35772064, + "step": 61665 + }, + { + "epoch": 9.185284480190646, + "grad_norm": 0.04165671393275261, + "learning_rate": 3.2787082845503525e-05, + "loss": 0.0076, + "num_input_tokens_seen": 35774880, + "step": 61670 + }, + { + "epoch": 9.186029192731606, + "grad_norm": 0.014134572818875313, + "learning_rate": 3.27839950117263e-05, + "loss": 0.0008, + "num_input_tokens_seen": 35777664, + "step": 61675 + }, + { + "epoch": 9.186773905272565, + "grad_norm": 0.05508751422166824, + "learning_rate": 3.27809070464464e-05, + "loss": 0.1513, + "num_input_tokens_seen": 35780608, + "step": 61680 + }, + { + "epoch": 9.187518617813524, + "grad_norm": 0.44183579087257385, + "learning_rate": 3.2777818949715965e-05, + "loss": 0.0622, + "num_input_tokens_seen": 35783584, + "step": 61685 + }, + { + "epoch": 9.188263330354483, + "grad_norm": 0.08867640793323517, + "learning_rate": 3.27747307215872e-05, + "loss": 0.0025, + "num_input_tokens_seen": 35786592, + "step": 61690 + }, + { + "epoch": 9.189008042895443, + "grad_norm": 0.05397048965096474, + "learning_rate": 3.2771642362112255e-05, + "loss": 0.035, + "num_input_tokens_seen": 35789344, + "step": 61695 + }, + { + "epoch": 9.189752755436402, + "grad_norm": 0.008260921575129032, + "learning_rate": 3.276855387134331e-05, + "loss": 0.0232, + "num_input_tokens_seen": 35792224, + "step": 61700 + }, + { + "epoch": 9.19049746797736, + "grad_norm": 43.79813766479492, + "learning_rate": 3.2765465249332545e-05, + "loss": 0.525, + "num_input_tokens_seen": 35795360, + "step": 61705 + }, + { + "epoch": 9.19124218051832, + "grad_norm": 62.28101348876953, + "learning_rate": 3.276237649613214e-05, + "loss": 0.0271, + "num_input_tokens_seen": 35798848, + "step": 61710 + }, + { + "epoch": 9.19198689305928, + "grad_norm": 0.6312347054481506, + "learning_rate": 3.275928761179427e-05, + "loss": 0.0389, + "num_input_tokens_seen": 35801824, + "step": 61715 + }, + { + "epoch": 9.192731605600239, + "grad_norm": 0.04675648361444473, + "learning_rate": 3.2756198596371115e-05, + "loss": 0.3354, + "num_input_tokens_seen": 35804672, + "step": 61720 + }, + { + "epoch": 9.193476318141197, + "grad_norm": 11.56712532043457, + "learning_rate": 3.275310944991487e-05, + "loss": 0.0077, + "num_input_tokens_seen": 35807680, + "step": 61725 + }, + { + "epoch": 9.194221030682156, + "grad_norm": 25.739990234375, + "learning_rate": 3.275002017247773e-05, + "loss": 0.0945, + "num_input_tokens_seen": 35810272, + "step": 61730 + }, + { + "epoch": 9.194965743223117, + "grad_norm": 0.005526997614651918, + "learning_rate": 3.2746930764111876e-05, + "loss": 0.1054, + "num_input_tokens_seen": 35813056, + "step": 61735 + }, + { + "epoch": 9.195710455764075, + "grad_norm": 0.18072707951068878, + "learning_rate": 3.2743841224869496e-05, + "loss": 0.0011, + "num_input_tokens_seen": 35816160, + "step": 61740 + }, + { + "epoch": 9.196455168305034, + "grad_norm": 0.026787133887410164, + "learning_rate": 3.274075155480278e-05, + "loss": 0.1045, + "num_input_tokens_seen": 35819104, + "step": 61745 + }, + { + "epoch": 9.197199880845993, + "grad_norm": 0.022511687129735947, + "learning_rate": 3.273766175396395e-05, + "loss": 0.5907, + "num_input_tokens_seen": 35822080, + "step": 61750 + }, + { + "epoch": 9.197944593386953, + "grad_norm": 87.12591552734375, + "learning_rate": 3.273457182240518e-05, + "loss": 0.2216, + "num_input_tokens_seen": 35825088, + "step": 61755 + }, + { + "epoch": 9.198689305927912, + "grad_norm": 12.777581214904785, + "learning_rate": 3.273148176017868e-05, + "loss": 0.244, + "num_input_tokens_seen": 35828128, + "step": 61760 + }, + { + "epoch": 9.199434018468871, + "grad_norm": 0.03370992839336395, + "learning_rate": 3.2728391567336656e-05, + "loss": 0.0013, + "num_input_tokens_seen": 35831072, + "step": 61765 + }, + { + "epoch": 9.20017873100983, + "grad_norm": 0.25944992899894714, + "learning_rate": 3.272530124393131e-05, + "loss": 0.1015, + "num_input_tokens_seen": 35834080, + "step": 61770 + }, + { + "epoch": 9.200923443550789, + "grad_norm": 0.5017790794372559, + "learning_rate": 3.2722210790014854e-05, + "loss": 0.0426, + "num_input_tokens_seen": 35836768, + "step": 61775 + }, + { + "epoch": 9.201668156091749, + "grad_norm": 45.42161178588867, + "learning_rate": 3.271912020563949e-05, + "loss": 0.2879, + "num_input_tokens_seen": 35839520, + "step": 61780 + }, + { + "epoch": 9.202412868632708, + "grad_norm": 0.038708776235580444, + "learning_rate": 3.2716029490857445e-05, + "loss": 0.0736, + "num_input_tokens_seen": 35842528, + "step": 61785 + }, + { + "epoch": 9.203157581173667, + "grad_norm": 1.519643783569336, + "learning_rate": 3.271293864572092e-05, + "loss": 0.0015, + "num_input_tokens_seen": 35845664, + "step": 61790 + }, + { + "epoch": 9.203902293714625, + "grad_norm": 0.1612045168876648, + "learning_rate": 3.2709847670282126e-05, + "loss": 0.1107, + "num_input_tokens_seen": 35848704, + "step": 61795 + }, + { + "epoch": 9.204647006255586, + "grad_norm": 1.2424548864364624, + "learning_rate": 3.27067565645933e-05, + "loss": 0.0028, + "num_input_tokens_seen": 35851872, + "step": 61800 + }, + { + "epoch": 9.205391718796545, + "grad_norm": 0.05211475118994713, + "learning_rate": 3.2703665328706654e-05, + "loss": 0.052, + "num_input_tokens_seen": 35854592, + "step": 61805 + }, + { + "epoch": 9.206136431337503, + "grad_norm": 4.668890476226807, + "learning_rate": 3.270057396267441e-05, + "loss": 0.0212, + "num_input_tokens_seen": 35857440, + "step": 61810 + }, + { + "epoch": 9.206881143878462, + "grad_norm": 0.02428067848086357, + "learning_rate": 3.26974824665488e-05, + "loss": 0.0007, + "num_input_tokens_seen": 35860672, + "step": 61815 + }, + { + "epoch": 9.207625856419423, + "grad_norm": 7.482700347900391, + "learning_rate": 3.269439084038205e-05, + "loss": 0.0107, + "num_input_tokens_seen": 35863584, + "step": 61820 + }, + { + "epoch": 9.208370568960381, + "grad_norm": 0.04125766456127167, + "learning_rate": 3.2691299084226375e-05, + "loss": 0.0387, + "num_input_tokens_seen": 35866144, + "step": 61825 + }, + { + "epoch": 9.20911528150134, + "grad_norm": 0.014517229981720448, + "learning_rate": 3.2688207198134026e-05, + "loss": 0.0121, + "num_input_tokens_seen": 35869248, + "step": 61830 + }, + { + "epoch": 9.209859994042299, + "grad_norm": 0.4221482574939728, + "learning_rate": 3.2685115182157225e-05, + "loss": 0.0184, + "num_input_tokens_seen": 35872256, + "step": 61835 + }, + { + "epoch": 9.21060470658326, + "grad_norm": 0.00735011650249362, + "learning_rate": 3.2682023036348216e-05, + "loss": 0.3631, + "num_input_tokens_seen": 35875296, + "step": 61840 + }, + { + "epoch": 9.211349419124218, + "grad_norm": 2.1129488945007324, + "learning_rate": 3.267893076075924e-05, + "loss": 0.0007, + "num_input_tokens_seen": 35878048, + "step": 61845 + }, + { + "epoch": 9.212094131665177, + "grad_norm": 79.6133804321289, + "learning_rate": 3.267583835544253e-05, + "loss": 0.1855, + "num_input_tokens_seen": 35881056, + "step": 61850 + }, + { + "epoch": 9.212838844206136, + "grad_norm": 1.3976799249649048, + "learning_rate": 3.2672745820450336e-05, + "loss": 0.0008, + "num_input_tokens_seen": 35883648, + "step": 61855 + }, + { + "epoch": 9.213583556747096, + "grad_norm": 0.0020315926522016525, + "learning_rate": 3.2669653155834894e-05, + "loss": 0.1254, + "num_input_tokens_seen": 35886304, + "step": 61860 + }, + { + "epoch": 9.214328269288055, + "grad_norm": 0.07396155595779419, + "learning_rate": 3.2666560361648456e-05, + "loss": 0.0103, + "num_input_tokens_seen": 35889248, + "step": 61865 + }, + { + "epoch": 9.215072981829014, + "grad_norm": 0.0017593995435163379, + "learning_rate": 3.266346743794328e-05, + "loss": 0.0046, + "num_input_tokens_seen": 35892000, + "step": 61870 + }, + { + "epoch": 9.215817694369973, + "grad_norm": 22.388286590576172, + "learning_rate": 3.26603743847716e-05, + "loss": 0.284, + "num_input_tokens_seen": 35894880, + "step": 61875 + }, + { + "epoch": 9.216562406910933, + "grad_norm": 0.01609501801431179, + "learning_rate": 3.26572812021857e-05, + "loss": 0.0002, + "num_input_tokens_seen": 35897792, + "step": 61880 + }, + { + "epoch": 9.217307119451892, + "grad_norm": 0.2127581089735031, + "learning_rate": 3.2654187890237795e-05, + "loss": 0.1042, + "num_input_tokens_seen": 35900480, + "step": 61885 + }, + { + "epoch": 9.21805183199285, + "grad_norm": 0.08257487416267395, + "learning_rate": 3.2651094448980175e-05, + "loss": 0.0278, + "num_input_tokens_seen": 35903232, + "step": 61890 + }, + { + "epoch": 9.21879654453381, + "grad_norm": 0.004501881543546915, + "learning_rate": 3.264800087846509e-05, + "loss": 0.1306, + "num_input_tokens_seen": 35906144, + "step": 61895 + }, + { + "epoch": 9.21954125707477, + "grad_norm": 0.025092042982578278, + "learning_rate": 3.2644907178744805e-05, + "loss": 0.1146, + "num_input_tokens_seen": 35909184, + "step": 61900 + }, + { + "epoch": 9.220285969615729, + "grad_norm": 0.055233895778656006, + "learning_rate": 3.264181334987157e-05, + "loss": 0.0002, + "num_input_tokens_seen": 35911840, + "step": 61905 + }, + { + "epoch": 9.221030682156687, + "grad_norm": 0.00595329562202096, + "learning_rate": 3.2638719391897684e-05, + "loss": 0.1476, + "num_input_tokens_seen": 35914592, + "step": 61910 + }, + { + "epoch": 9.221775394697646, + "grad_norm": 0.005056809168308973, + "learning_rate": 3.2635625304875386e-05, + "loss": 0.0001, + "num_input_tokens_seen": 35917376, + "step": 61915 + }, + { + "epoch": 9.222520107238607, + "grad_norm": 0.030362773686647415, + "learning_rate": 3.263253108885696e-05, + "loss": 0.0887, + "num_input_tokens_seen": 35920320, + "step": 61920 + }, + { + "epoch": 9.223264819779565, + "grad_norm": 0.9754003286361694, + "learning_rate": 3.262943674389469e-05, + "loss": 0.0013, + "num_input_tokens_seen": 35923040, + "step": 61925 + }, + { + "epoch": 9.224009532320524, + "grad_norm": 0.0021138018928468227, + "learning_rate": 3.2626342270040823e-05, + "loss": 0.0964, + "num_input_tokens_seen": 35926304, + "step": 61930 + }, + { + "epoch": 9.224754244861483, + "grad_norm": 3.370438575744629, + "learning_rate": 3.262324766734766e-05, + "loss": 0.1888, + "num_input_tokens_seen": 35928960, + "step": 61935 + }, + { + "epoch": 9.225498957402444, + "grad_norm": 0.01455497182905674, + "learning_rate": 3.2620152935867484e-05, + "loss": 0.0013, + "num_input_tokens_seen": 35931904, + "step": 61940 + }, + { + "epoch": 9.226243669943402, + "grad_norm": 0.0030506784096360207, + "learning_rate": 3.261705807565256e-05, + "loss": 0.0017, + "num_input_tokens_seen": 35934848, + "step": 61945 + }, + { + "epoch": 9.226988382484361, + "grad_norm": 0.4887279272079468, + "learning_rate": 3.26139630867552e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35937376, + "step": 61950 + }, + { + "epoch": 9.22773309502532, + "grad_norm": 0.06104394793510437, + "learning_rate": 3.261086796922765e-05, + "loss": 0.147, + "num_input_tokens_seen": 35940160, + "step": 61955 + }, + { + "epoch": 9.228477807566279, + "grad_norm": 24.682886123657227, + "learning_rate": 3.260777272312222e-05, + "loss": 0.2388, + "num_input_tokens_seen": 35942976, + "step": 61960 + }, + { + "epoch": 9.229222520107239, + "grad_norm": 0.5027503967285156, + "learning_rate": 3.2604677348491215e-05, + "loss": 0.0038, + "num_input_tokens_seen": 35945888, + "step": 61965 + }, + { + "epoch": 9.229967232648198, + "grad_norm": 0.0016517853364348412, + "learning_rate": 3.260158184538691e-05, + "loss": 0.0003, + "num_input_tokens_seen": 35948864, + "step": 61970 + }, + { + "epoch": 9.230711945189157, + "grad_norm": 0.01787465065717697, + "learning_rate": 3.25984862138616e-05, + "loss": 0.0006, + "num_input_tokens_seen": 35951680, + "step": 61975 + }, + { + "epoch": 9.231456657730115, + "grad_norm": 0.07079396396875381, + "learning_rate": 3.25953904539676e-05, + "loss": 0.0195, + "num_input_tokens_seen": 35954336, + "step": 61980 + }, + { + "epoch": 9.232201370271076, + "grad_norm": 0.004328396636992693, + "learning_rate": 3.259229456575719e-05, + "loss": 0.0915, + "num_input_tokens_seen": 35956864, + "step": 61985 + }, + { + "epoch": 9.232946082812035, + "grad_norm": 0.0022276765666902065, + "learning_rate": 3.258919854928268e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35959616, + "step": 61990 + }, + { + "epoch": 9.233690795352993, + "grad_norm": 0.13434502482414246, + "learning_rate": 3.2586102404596375e-05, + "loss": 0.0765, + "num_input_tokens_seen": 35962368, + "step": 61995 + }, + { + "epoch": 9.234435507893952, + "grad_norm": 0.05074097216129303, + "learning_rate": 3.258300613175058e-05, + "loss": 0.0887, + "num_input_tokens_seen": 35965344, + "step": 62000 + }, + { + "epoch": 9.235180220434913, + "grad_norm": 21.13034439086914, + "learning_rate": 3.2579909730797605e-05, + "loss": 0.2185, + "num_input_tokens_seen": 35968224, + "step": 62005 + }, + { + "epoch": 9.235924932975871, + "grad_norm": 0.0248012263327837, + "learning_rate": 3.2576813201789755e-05, + "loss": 0.2005, + "num_input_tokens_seen": 35971200, + "step": 62010 + }, + { + "epoch": 9.23666964551683, + "grad_norm": 0.02308986522257328, + "learning_rate": 3.257371654477935e-05, + "loss": 0.1349, + "num_input_tokens_seen": 35974144, + "step": 62015 + }, + { + "epoch": 9.237414358057789, + "grad_norm": 0.005107569508254528, + "learning_rate": 3.257061975981871e-05, + "loss": 0.1021, + "num_input_tokens_seen": 35977056, + "step": 62020 + }, + { + "epoch": 9.23815907059875, + "grad_norm": 0.07696059346199036, + "learning_rate": 3.256752284696013e-05, + "loss": 0.0021, + "num_input_tokens_seen": 35980256, + "step": 62025 + }, + { + "epoch": 9.238903783139708, + "grad_norm": 0.23646855354309082, + "learning_rate": 3.256442580625595e-05, + "loss": 0.0044, + "num_input_tokens_seen": 35983200, + "step": 62030 + }, + { + "epoch": 9.239648495680667, + "grad_norm": 0.0027830980252474546, + "learning_rate": 3.2561328637758475e-05, + "loss": 0.0005, + "num_input_tokens_seen": 35986144, + "step": 62035 + }, + { + "epoch": 9.240393208221626, + "grad_norm": 0.01880059950053692, + "learning_rate": 3.2558231341520046e-05, + "loss": 0.366, + "num_input_tokens_seen": 35988896, + "step": 62040 + }, + { + "epoch": 9.241137920762586, + "grad_norm": 23.873838424682617, + "learning_rate": 3.255513391759299e-05, + "loss": 0.1261, + "num_input_tokens_seen": 35992320, + "step": 62045 + }, + { + "epoch": 9.241882633303545, + "grad_norm": 0.15277428925037384, + "learning_rate": 3.25520363660296e-05, + "loss": 0.0144, + "num_input_tokens_seen": 35995040, + "step": 62050 + }, + { + "epoch": 9.242627345844504, + "grad_norm": 13.045733451843262, + "learning_rate": 3.2548938686882246e-05, + "loss": 0.1032, + "num_input_tokens_seen": 35997856, + "step": 62055 + }, + { + "epoch": 9.243372058385463, + "grad_norm": 0.02555486559867859, + "learning_rate": 3.254584088020325e-05, + "loss": 0.0151, + "num_input_tokens_seen": 36000800, + "step": 62060 + }, + { + "epoch": 9.244116770926423, + "grad_norm": 9.132704734802246, + "learning_rate": 3.254274294604494e-05, + "loss": 0.2702, + "num_input_tokens_seen": 36003424, + "step": 62065 + }, + { + "epoch": 9.244861483467382, + "grad_norm": 0.02300955355167389, + "learning_rate": 3.253964488445964e-05, + "loss": 0.0752, + "num_input_tokens_seen": 36006400, + "step": 62070 + }, + { + "epoch": 9.24560619600834, + "grad_norm": 0.034372106194496155, + "learning_rate": 3.253654669549972e-05, + "loss": 0.0773, + "num_input_tokens_seen": 36009440, + "step": 62075 + }, + { + "epoch": 9.2463509085493, + "grad_norm": 1.972051978111267, + "learning_rate": 3.253344837921749e-05, + "loss": 0.0011, + "num_input_tokens_seen": 36012160, + "step": 62080 + }, + { + "epoch": 9.24709562109026, + "grad_norm": 0.0935383066534996, + "learning_rate": 3.253034993566532e-05, + "loss": 0.0972, + "num_input_tokens_seen": 36015072, + "step": 62085 + }, + { + "epoch": 9.247840333631219, + "grad_norm": 0.08427737653255463, + "learning_rate": 3.252725136489553e-05, + "loss": 0.0354, + "num_input_tokens_seen": 36018144, + "step": 62090 + }, + { + "epoch": 9.248585046172177, + "grad_norm": 0.0010433238931000233, + "learning_rate": 3.2524152666960476e-05, + "loss": 0.0004, + "num_input_tokens_seen": 36021184, + "step": 62095 + }, + { + "epoch": 9.249329758713136, + "grad_norm": 0.14185787737369537, + "learning_rate": 3.252105384191252e-05, + "loss": 0.1285, + "num_input_tokens_seen": 36023840, + "step": 62100 + }, + { + "epoch": 9.250074471254097, + "grad_norm": 0.09648901969194412, + "learning_rate": 3.2517954889803995e-05, + "loss": 0.0004, + "num_input_tokens_seen": 36027104, + "step": 62105 + }, + { + "epoch": 9.250819183795056, + "grad_norm": 62.30767822265625, + "learning_rate": 3.2514855810687265e-05, + "loss": 0.1336, + "num_input_tokens_seen": 36029984, + "step": 62110 + }, + { + "epoch": 9.251563896336014, + "grad_norm": 0.03733434900641441, + "learning_rate": 3.2511756604614695e-05, + "loss": 0.0009, + "num_input_tokens_seen": 36032960, + "step": 62115 + }, + { + "epoch": 9.252308608876973, + "grad_norm": 0.19850745797157288, + "learning_rate": 3.250865727163862e-05, + "loss": 0.2393, + "num_input_tokens_seen": 36035808, + "step": 62120 + }, + { + "epoch": 9.253053321417934, + "grad_norm": 0.015541533008217812, + "learning_rate": 3.250555781181142e-05, + "loss": 0.3182, + "num_input_tokens_seen": 36038592, + "step": 62125 + }, + { + "epoch": 9.253798033958892, + "grad_norm": 0.013773564249277115, + "learning_rate": 3.250245822518544e-05, + "loss": 0.0007, + "num_input_tokens_seen": 36041440, + "step": 62130 + }, + { + "epoch": 9.254542746499851, + "grad_norm": 34.5308837890625, + "learning_rate": 3.249935851181305e-05, + "loss": 0.0053, + "num_input_tokens_seen": 36044288, + "step": 62135 + }, + { + "epoch": 9.25528745904081, + "grad_norm": 22.323575973510742, + "learning_rate": 3.2496258671746636e-05, + "loss": 0.0314, + "num_input_tokens_seen": 36046848, + "step": 62140 + }, + { + "epoch": 9.256032171581769, + "grad_norm": 0.0039889756590127945, + "learning_rate": 3.249315870503854e-05, + "loss": 0.001, + "num_input_tokens_seen": 36049792, + "step": 62145 + }, + { + "epoch": 9.25677688412273, + "grad_norm": 0.019272591918706894, + "learning_rate": 3.249005861174115e-05, + "loss": 0.1355, + "num_input_tokens_seen": 36052576, + "step": 62150 + }, + { + "epoch": 9.257521596663688, + "grad_norm": 28.746736526489258, + "learning_rate": 3.2486958391906825e-05, + "loss": 0.0062, + "num_input_tokens_seen": 36055328, + "step": 62155 + }, + { + "epoch": 9.258266309204647, + "grad_norm": 0.015985412523150444, + "learning_rate": 3.2483858045587944e-05, + "loss": 0.1738, + "num_input_tokens_seen": 36058144, + "step": 62160 + }, + { + "epoch": 9.259011021745605, + "grad_norm": 0.0931846871972084, + "learning_rate": 3.2480757572836895e-05, + "loss": 0.0995, + "num_input_tokens_seen": 36061056, + "step": 62165 + }, + { + "epoch": 9.259755734286566, + "grad_norm": 0.034213095903396606, + "learning_rate": 3.247765697370604e-05, + "loss": 0.0921, + "num_input_tokens_seen": 36064160, + "step": 62170 + }, + { + "epoch": 9.260500446827525, + "grad_norm": 0.006884048227220774, + "learning_rate": 3.247455624824779e-05, + "loss": 0.1167, + "num_input_tokens_seen": 36067008, + "step": 62175 + }, + { + "epoch": 9.261245159368483, + "grad_norm": 35.05583572387695, + "learning_rate": 3.247145539651449e-05, + "loss": 0.1132, + "num_input_tokens_seen": 36069984, + "step": 62180 + }, + { + "epoch": 9.261989871909442, + "grad_norm": 0.00030005149892531335, + "learning_rate": 3.246835441855856e-05, + "loss": 0.0001, + "num_input_tokens_seen": 36073088, + "step": 62185 + }, + { + "epoch": 9.262734584450403, + "grad_norm": 0.10158396512269974, + "learning_rate": 3.2465253314432366e-05, + "loss": 0.0006, + "num_input_tokens_seen": 36075904, + "step": 62190 + }, + { + "epoch": 9.263479296991362, + "grad_norm": 17.51409912109375, + "learning_rate": 3.24621520841883e-05, + "loss": 0.1634, + "num_input_tokens_seen": 36078656, + "step": 62195 + }, + { + "epoch": 9.26422400953232, + "grad_norm": 0.004525644704699516, + "learning_rate": 3.245905072787876e-05, + "loss": 0.094, + "num_input_tokens_seen": 36081568, + "step": 62200 + }, + { + "epoch": 9.264968722073279, + "grad_norm": 0.015523405745625496, + "learning_rate": 3.245594924555614e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36084384, + "step": 62205 + }, + { + "epoch": 9.26571343461424, + "grad_norm": 0.49734964966773987, + "learning_rate": 3.2452847637272845e-05, + "loss": 0.0022, + "num_input_tokens_seen": 36087232, + "step": 62210 + }, + { + "epoch": 9.266458147155198, + "grad_norm": 0.012544921599328518, + "learning_rate": 3.244974590308125e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36090016, + "step": 62215 + }, + { + "epoch": 9.267202859696157, + "grad_norm": 0.0027270012069493532, + "learning_rate": 3.244664404303378e-05, + "loss": 0.0004, + "num_input_tokens_seen": 36092576, + "step": 62220 + }, + { + "epoch": 9.267947572237116, + "grad_norm": 4.309220790863037, + "learning_rate": 3.2443542057182825e-05, + "loss": 0.0309, + "num_input_tokens_seen": 36095264, + "step": 62225 + }, + { + "epoch": 9.268692284778076, + "grad_norm": 1.3386865854263306, + "learning_rate": 3.244043994558079e-05, + "loss": 0.1862, + "num_input_tokens_seen": 36098208, + "step": 62230 + }, + { + "epoch": 9.269436997319035, + "grad_norm": 0.020903989672660828, + "learning_rate": 3.243733770828008e-05, + "loss": 0.1256, + "num_input_tokens_seen": 36101152, + "step": 62235 + }, + { + "epoch": 9.270181709859994, + "grad_norm": 0.08799377083778381, + "learning_rate": 3.243423534533311e-05, + "loss": 0.0738, + "num_input_tokens_seen": 36103680, + "step": 62240 + }, + { + "epoch": 9.270926422400953, + "grad_norm": 2.6785366535186768, + "learning_rate": 3.2431132856792294e-05, + "loss": 0.0352, + "num_input_tokens_seen": 36106432, + "step": 62245 + }, + { + "epoch": 9.271671134941913, + "grad_norm": 0.438795268535614, + "learning_rate": 3.242803024271004e-05, + "loss": 0.0717, + "num_input_tokens_seen": 36109088, + "step": 62250 + }, + { + "epoch": 9.272415847482872, + "grad_norm": 0.1823982447385788, + "learning_rate": 3.2424927503138766e-05, + "loss": 0.3169, + "num_input_tokens_seen": 36112288, + "step": 62255 + }, + { + "epoch": 9.27316056002383, + "grad_norm": 101.5743179321289, + "learning_rate": 3.242182463813088e-05, + "loss": 0.0226, + "num_input_tokens_seen": 36115360, + "step": 62260 + }, + { + "epoch": 9.27390527256479, + "grad_norm": 23.12222671508789, + "learning_rate": 3.241872164773882e-05, + "loss": 0.4191, + "num_input_tokens_seen": 36118208, + "step": 62265 + }, + { + "epoch": 9.27464998510575, + "grad_norm": 59.573448181152344, + "learning_rate": 3.241561853201499e-05, + "loss": 0.1749, + "num_input_tokens_seen": 36121152, + "step": 62270 + }, + { + "epoch": 9.275394697646709, + "grad_norm": 0.01661032810807228, + "learning_rate": 3.2412515291011826e-05, + "loss": 0.0033, + "num_input_tokens_seen": 36124192, + "step": 62275 + }, + { + "epoch": 9.276139410187668, + "grad_norm": 0.016495345160365105, + "learning_rate": 3.2409411924781754e-05, + "loss": 0.0003, + "num_input_tokens_seen": 36127488, + "step": 62280 + }, + { + "epoch": 9.276884122728626, + "grad_norm": 0.17611482739448547, + "learning_rate": 3.2406308433377194e-05, + "loss": 0.1349, + "num_input_tokens_seen": 36130304, + "step": 62285 + }, + { + "epoch": 9.277628835269585, + "grad_norm": 11.997049331665039, + "learning_rate": 3.2403204816850574e-05, + "loss": 0.0095, + "num_input_tokens_seen": 36133568, + "step": 62290 + }, + { + "epoch": 9.278373547810546, + "grad_norm": 0.005328258033841848, + "learning_rate": 3.240010107525434e-05, + "loss": 0.0003, + "num_input_tokens_seen": 36136576, + "step": 62295 + }, + { + "epoch": 9.279118260351504, + "grad_norm": 0.022526033222675323, + "learning_rate": 3.2396997208640925e-05, + "loss": 0.1863, + "num_input_tokens_seen": 36139264, + "step": 62300 + }, + { + "epoch": 9.279862972892463, + "grad_norm": 0.007582252379506826, + "learning_rate": 3.2393893217062746e-05, + "loss": 0.1607, + "num_input_tokens_seen": 36142112, + "step": 62305 + }, + { + "epoch": 9.280607685433422, + "grad_norm": 0.08959871530532837, + "learning_rate": 3.239078910057226e-05, + "loss": 0.1533, + "num_input_tokens_seen": 36145120, + "step": 62310 + }, + { + "epoch": 9.281352397974382, + "grad_norm": 0.040391936898231506, + "learning_rate": 3.238768485922191e-05, + "loss": 0.0416, + "num_input_tokens_seen": 36147872, + "step": 62315 + }, + { + "epoch": 9.282097110515341, + "grad_norm": 0.01628122851252556, + "learning_rate": 3.238458049306413e-05, + "loss": 0.1814, + "num_input_tokens_seen": 36150720, + "step": 62320 + }, + { + "epoch": 9.2828418230563, + "grad_norm": 0.0006645202520303428, + "learning_rate": 3.2381476002151365e-05, + "loss": 0.0003, + "num_input_tokens_seen": 36153824, + "step": 62325 + }, + { + "epoch": 9.283586535597259, + "grad_norm": 0.08556429296731949, + "learning_rate": 3.2378371386536074e-05, + "loss": 0.0222, + "num_input_tokens_seen": 36156576, + "step": 62330 + }, + { + "epoch": 9.28433124813822, + "grad_norm": 291.7050476074219, + "learning_rate": 3.2375266646270684e-05, + "loss": 0.0388, + "num_input_tokens_seen": 36159200, + "step": 62335 + }, + { + "epoch": 9.285075960679178, + "grad_norm": 3.910892963409424, + "learning_rate": 3.2372161781407675e-05, + "loss": 0.1197, + "num_input_tokens_seen": 36162144, + "step": 62340 + }, + { + "epoch": 9.285820673220137, + "grad_norm": 0.012656701728701591, + "learning_rate": 3.2369056791999476e-05, + "loss": 0.0966, + "num_input_tokens_seen": 36164928, + "step": 62345 + }, + { + "epoch": 9.286565385761095, + "grad_norm": 17.940635681152344, + "learning_rate": 3.236595167809856e-05, + "loss": 0.2077, + "num_input_tokens_seen": 36167936, + "step": 62350 + }, + { + "epoch": 9.287310098302056, + "grad_norm": 22.405601501464844, + "learning_rate": 3.236284643975737e-05, + "loss": 0.1316, + "num_input_tokens_seen": 36171008, + "step": 62355 + }, + { + "epoch": 9.288054810843015, + "grad_norm": 0.02654130570590496, + "learning_rate": 3.235974107702837e-05, + "loss": 0.0006, + "num_input_tokens_seen": 36173888, + "step": 62360 + }, + { + "epoch": 9.288799523383974, + "grad_norm": 0.027881458401679993, + "learning_rate": 3.235663558996402e-05, + "loss": 0.0263, + "num_input_tokens_seen": 36176896, + "step": 62365 + }, + { + "epoch": 9.289544235924932, + "grad_norm": 0.0034816160332411528, + "learning_rate": 3.2353529978616806e-05, + "loss": 0.001, + "num_input_tokens_seen": 36179584, + "step": 62370 + }, + { + "epoch": 9.290288948465893, + "grad_norm": 1.2362329959869385, + "learning_rate": 3.235042424303917e-05, + "loss": 0.0008, + "num_input_tokens_seen": 36182656, + "step": 62375 + }, + { + "epoch": 9.291033661006852, + "grad_norm": 0.3714548349380493, + "learning_rate": 3.2347318383283585e-05, + "loss": 0.0079, + "num_input_tokens_seen": 36185600, + "step": 62380 + }, + { + "epoch": 9.29177837354781, + "grad_norm": 0.6838282942771912, + "learning_rate": 3.234421239940252e-05, + "loss": 0.1797, + "num_input_tokens_seen": 36188704, + "step": 62385 + }, + { + "epoch": 9.292523086088769, + "grad_norm": 0.051217880100011826, + "learning_rate": 3.2341106291448456e-05, + "loss": 0.2915, + "num_input_tokens_seen": 36191392, + "step": 62390 + }, + { + "epoch": 9.29326779862973, + "grad_norm": 0.005325939506292343, + "learning_rate": 3.233800005947386e-05, + "loss": 0.0003, + "num_input_tokens_seen": 36194208, + "step": 62395 + }, + { + "epoch": 9.294012511170688, + "grad_norm": 0.013078518211841583, + "learning_rate": 3.23348937035312e-05, + "loss": 0.0001, + "num_input_tokens_seen": 36196992, + "step": 62400 + }, + { + "epoch": 9.294757223711647, + "grad_norm": 5.4666008949279785, + "learning_rate": 3.233178722367298e-05, + "loss": 0.0395, + "num_input_tokens_seen": 36199648, + "step": 62405 + }, + { + "epoch": 9.295501936252606, + "grad_norm": 0.025406647473573685, + "learning_rate": 3.232868061995167e-05, + "loss": 0.0017, + "num_input_tokens_seen": 36202848, + "step": 62410 + }, + { + "epoch": 9.296246648793566, + "grad_norm": 0.011040259152650833, + "learning_rate": 3.2325573892419745e-05, + "loss": 0.0012, + "num_input_tokens_seen": 36205728, + "step": 62415 + }, + { + "epoch": 9.296991361334525, + "grad_norm": 0.006276775151491165, + "learning_rate": 3.232246704112969e-05, + "loss": 0.001, + "num_input_tokens_seen": 36208352, + "step": 62420 + }, + { + "epoch": 9.297736073875484, + "grad_norm": 0.5825095176696777, + "learning_rate": 3.2319360066134e-05, + "loss": 0.2491, + "num_input_tokens_seen": 36211648, + "step": 62425 + }, + { + "epoch": 9.298480786416443, + "grad_norm": 0.02713753655552864, + "learning_rate": 3.2316252967485155e-05, + "loss": 0.1192, + "num_input_tokens_seen": 36214272, + "step": 62430 + }, + { + "epoch": 9.299225498957403, + "grad_norm": 0.01116874348372221, + "learning_rate": 3.231314574523566e-05, + "loss": 0.056, + "num_input_tokens_seen": 36217376, + "step": 62435 + }, + { + "epoch": 9.299970211498362, + "grad_norm": 0.15127010643482208, + "learning_rate": 3.2310038399437995e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36220288, + "step": 62440 + }, + { + "epoch": 9.30071492403932, + "grad_norm": 0.016261355951428413, + "learning_rate": 3.230693093014466e-05, + "loss": 0.0025, + "num_input_tokens_seen": 36223200, + "step": 62445 + }, + { + "epoch": 9.30145963658028, + "grad_norm": 0.001137500279583037, + "learning_rate": 3.230382333740816e-05, + "loss": 0.0708, + "num_input_tokens_seen": 36226080, + "step": 62450 + }, + { + "epoch": 9.30220434912124, + "grad_norm": 0.0038572330959141254, + "learning_rate": 3.230071562128098e-05, + "loss": 0.0007, + "num_input_tokens_seen": 36228896, + "step": 62455 + }, + { + "epoch": 9.302949061662199, + "grad_norm": 0.0056962789967656136, + "learning_rate": 3.2297607781815645e-05, + "loss": 0.2285, + "num_input_tokens_seen": 36231872, + "step": 62460 + }, + { + "epoch": 9.303693774203158, + "grad_norm": 0.004757927265018225, + "learning_rate": 3.229449981906463e-05, + "loss": 0.0008, + "num_input_tokens_seen": 36234848, + "step": 62465 + }, + { + "epoch": 9.304438486744116, + "grad_norm": 0.007678629830479622, + "learning_rate": 3.229139173308045e-05, + "loss": 0.048, + "num_input_tokens_seen": 36237696, + "step": 62470 + }, + { + "epoch": 9.305183199285075, + "grad_norm": 9.341559410095215, + "learning_rate": 3.228828352391562e-05, + "loss": 0.0361, + "num_input_tokens_seen": 36240928, + "step": 62475 + }, + { + "epoch": 9.305927911826036, + "grad_norm": 0.20992256700992584, + "learning_rate": 3.2285175191622656e-05, + "loss": 0.0737, + "num_input_tokens_seen": 36243584, + "step": 62480 + }, + { + "epoch": 9.306672624366994, + "grad_norm": 0.0019946922548115253, + "learning_rate": 3.2282066736254056e-05, + "loss": 0.0797, + "num_input_tokens_seen": 36246720, + "step": 62485 + }, + { + "epoch": 9.307417336907953, + "grad_norm": 0.024990232661366463, + "learning_rate": 3.2278958157862336e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36249472, + "step": 62490 + }, + { + "epoch": 9.308162049448912, + "grad_norm": 150.3889923095703, + "learning_rate": 3.2275849456500026e-05, + "loss": 0.2142, + "num_input_tokens_seen": 36252224, + "step": 62495 + }, + { + "epoch": 9.308906761989872, + "grad_norm": 0.011371796019375324, + "learning_rate": 3.2272740632219635e-05, + "loss": 0.0494, + "num_input_tokens_seen": 36254912, + "step": 62500 + }, + { + "epoch": 9.309651474530831, + "grad_norm": 0.0026629099156707525, + "learning_rate": 3.226963168507367e-05, + "loss": 0.0001, + "num_input_tokens_seen": 36257824, + "step": 62505 + }, + { + "epoch": 9.31039618707179, + "grad_norm": 0.2523181140422821, + "learning_rate": 3.226652261511467e-05, + "loss": 0.0036, + "num_input_tokens_seen": 36261056, + "step": 62510 + }, + { + "epoch": 9.311140899612749, + "grad_norm": 36.419185638427734, + "learning_rate": 3.226341342239516e-05, + "loss": 0.4299, + "num_input_tokens_seen": 36263808, + "step": 62515 + }, + { + "epoch": 9.31188561215371, + "grad_norm": 0.02529222145676613, + "learning_rate": 3.226030410696766e-05, + "loss": 0.1037, + "num_input_tokens_seen": 36267072, + "step": 62520 + }, + { + "epoch": 9.312630324694668, + "grad_norm": 5.704070568084717, + "learning_rate": 3.2257194668884704e-05, + "loss": 0.1357, + "num_input_tokens_seen": 36269760, + "step": 62525 + }, + { + "epoch": 9.313375037235627, + "grad_norm": 0.01993824914097786, + "learning_rate": 3.2254085108198815e-05, + "loss": 0.2888, + "num_input_tokens_seen": 36272800, + "step": 62530 + }, + { + "epoch": 9.314119749776586, + "grad_norm": 38.924373626708984, + "learning_rate": 3.225097542496254e-05, + "loss": 0.2115, + "num_input_tokens_seen": 36275744, + "step": 62535 + }, + { + "epoch": 9.314864462317546, + "grad_norm": 15.823467254638672, + "learning_rate": 3.2247865619228394e-05, + "loss": 0.115, + "num_input_tokens_seen": 36278592, + "step": 62540 + }, + { + "epoch": 9.315609174858505, + "grad_norm": 65.93624114990234, + "learning_rate": 3.2244755691048933e-05, + "loss": 0.3331, + "num_input_tokens_seen": 36281344, + "step": 62545 + }, + { + "epoch": 9.316353887399464, + "grad_norm": 5.63358211517334, + "learning_rate": 3.224164564047669e-05, + "loss": 0.0026, + "num_input_tokens_seen": 36284192, + "step": 62550 + }, + { + "epoch": 9.317098599940422, + "grad_norm": 11.060127258300781, + "learning_rate": 3.223853546756419e-05, + "loss": 0.1575, + "num_input_tokens_seen": 36287296, + "step": 62555 + }, + { + "epoch": 9.317843312481383, + "grad_norm": 0.11752106249332428, + "learning_rate": 3.2235425172363996e-05, + "loss": 0.0188, + "num_input_tokens_seen": 36290240, + "step": 62560 + }, + { + "epoch": 9.318588025022342, + "grad_norm": 6.150876045227051, + "learning_rate": 3.223231475492865e-05, + "loss": 0.119, + "num_input_tokens_seen": 36293216, + "step": 62565 + }, + { + "epoch": 9.3193327375633, + "grad_norm": 0.07318974286317825, + "learning_rate": 3.222920421531069e-05, + "loss": 0.0007, + "num_input_tokens_seen": 36295808, + "step": 62570 + }, + { + "epoch": 9.32007745010426, + "grad_norm": 0.019172638654708862, + "learning_rate": 3.222609355356269e-05, + "loss": 0.0068, + "num_input_tokens_seen": 36298784, + "step": 62575 + }, + { + "epoch": 9.32082216264522, + "grad_norm": 0.006883095484226942, + "learning_rate": 3.222298276973717e-05, + "loss": 0.029, + "num_input_tokens_seen": 36302048, + "step": 62580 + }, + { + "epoch": 9.321566875186178, + "grad_norm": 0.10962723940610886, + "learning_rate": 3.22198718638867e-05, + "loss": 0.1047, + "num_input_tokens_seen": 36304672, + "step": 62585 + }, + { + "epoch": 9.322311587727137, + "grad_norm": 6.003509044647217, + "learning_rate": 3.2216760836063834e-05, + "loss": 0.2007, + "num_input_tokens_seen": 36307552, + "step": 62590 + }, + { + "epoch": 9.323056300268096, + "grad_norm": 0.034528035670518875, + "learning_rate": 3.2213649686321124e-05, + "loss": 0.231, + "num_input_tokens_seen": 36310496, + "step": 62595 + }, + { + "epoch": 9.323801012809056, + "grad_norm": 62.42786407470703, + "learning_rate": 3.2210538414711136e-05, + "loss": 0.1501, + "num_input_tokens_seen": 36313344, + "step": 62600 + }, + { + "epoch": 9.324545725350015, + "grad_norm": 127.77898406982422, + "learning_rate": 3.220742702128643e-05, + "loss": 0.6065, + "num_input_tokens_seen": 36316256, + "step": 62605 + }, + { + "epoch": 9.325290437890974, + "grad_norm": 2.846247434616089, + "learning_rate": 3.220431550609958e-05, + "loss": 0.089, + "num_input_tokens_seen": 36319168, + "step": 62610 + }, + { + "epoch": 9.326035150431933, + "grad_norm": 0.0015465427422896028, + "learning_rate": 3.220120386920313e-05, + "loss": 0.1506, + "num_input_tokens_seen": 36322016, + "step": 62615 + }, + { + "epoch": 9.326779862972893, + "grad_norm": 85.58799743652344, + "learning_rate": 3.219809211064966e-05, + "loss": 0.0259, + "num_input_tokens_seen": 36324768, + "step": 62620 + }, + { + "epoch": 9.327524575513852, + "grad_norm": 0.022815410047769547, + "learning_rate": 3.2194980230491744e-05, + "loss": 0.0636, + "num_input_tokens_seen": 36327456, + "step": 62625 + }, + { + "epoch": 9.32826928805481, + "grad_norm": 8.564966201782227, + "learning_rate": 3.2191868228781944e-05, + "loss": 0.3203, + "num_input_tokens_seen": 36330432, + "step": 62630 + }, + { + "epoch": 9.32901400059577, + "grad_norm": 158.09886169433594, + "learning_rate": 3.2188756105572844e-05, + "loss": 0.0874, + "num_input_tokens_seen": 36333600, + "step": 62635 + }, + { + "epoch": 9.32975871313673, + "grad_norm": 0.012894745916128159, + "learning_rate": 3.218564386091701e-05, + "loss": 0.2082, + "num_input_tokens_seen": 36336384, + "step": 62640 + }, + { + "epoch": 9.330503425677689, + "grad_norm": 0.03348316624760628, + "learning_rate": 3.218253149486704e-05, + "loss": 0.0125, + "num_input_tokens_seen": 36339488, + "step": 62645 + }, + { + "epoch": 9.331248138218648, + "grad_norm": 0.05180848762392998, + "learning_rate": 3.2179419007475483e-05, + "loss": 0.0471, + "num_input_tokens_seen": 36342368, + "step": 62650 + }, + { + "epoch": 9.331992850759606, + "grad_norm": 0.10388700664043427, + "learning_rate": 3.217630639879495e-05, + "loss": 0.1207, + "num_input_tokens_seen": 36345472, + "step": 62655 + }, + { + "epoch": 9.332737563300565, + "grad_norm": 96.30144500732422, + "learning_rate": 3.217319366887801e-05, + "loss": 0.0941, + "num_input_tokens_seen": 36348480, + "step": 62660 + }, + { + "epoch": 9.333482275841526, + "grad_norm": 0.0019441746408119798, + "learning_rate": 3.217008081777726e-05, + "loss": 0.0003, + "num_input_tokens_seen": 36351808, + "step": 62665 + }, + { + "epoch": 9.334226988382484, + "grad_norm": 41.21784591674805, + "learning_rate": 3.2166967845545275e-05, + "loss": 0.2977, + "num_input_tokens_seen": 36354560, + "step": 62670 + }, + { + "epoch": 9.334971700923443, + "grad_norm": 0.0003974924038629979, + "learning_rate": 3.216385475223465e-05, + "loss": 0.0918, + "num_input_tokens_seen": 36357600, + "step": 62675 + }, + { + "epoch": 9.335716413464402, + "grad_norm": 0.029538456350564957, + "learning_rate": 3.216074153789799e-05, + "loss": 0.0017, + "num_input_tokens_seen": 36360320, + "step": 62680 + }, + { + "epoch": 9.336461126005362, + "grad_norm": 18.198387145996094, + "learning_rate": 3.2157628202587874e-05, + "loss": 0.1315, + "num_input_tokens_seen": 36363168, + "step": 62685 + }, + { + "epoch": 9.337205838546321, + "grad_norm": 0.00423941295593977, + "learning_rate": 3.21545147463569e-05, + "loss": 0.0103, + "num_input_tokens_seen": 36366112, + "step": 62690 + }, + { + "epoch": 9.33795055108728, + "grad_norm": 0.002424095291644335, + "learning_rate": 3.2151401169257676e-05, + "loss": 0.0018, + "num_input_tokens_seen": 36368960, + "step": 62695 + }, + { + "epoch": 9.338695263628239, + "grad_norm": 0.00987142976373434, + "learning_rate": 3.2148287471342796e-05, + "loss": 0.256, + "num_input_tokens_seen": 36371872, + "step": 62700 + }, + { + "epoch": 9.3394399761692, + "grad_norm": 92.43510437011719, + "learning_rate": 3.2145173652664864e-05, + "loss": 0.0468, + "num_input_tokens_seen": 36374528, + "step": 62705 + }, + { + "epoch": 9.340184688710158, + "grad_norm": 0.04105469584465027, + "learning_rate": 3.21420597132765e-05, + "loss": 0.1316, + "num_input_tokens_seen": 36377792, + "step": 62710 + }, + { + "epoch": 9.340929401251117, + "grad_norm": 14.93399715423584, + "learning_rate": 3.213894565323027e-05, + "loss": 0.2862, + "num_input_tokens_seen": 36380800, + "step": 62715 + }, + { + "epoch": 9.341674113792076, + "grad_norm": 0.0015552297700196505, + "learning_rate": 3.213583147257883e-05, + "loss": 0.0358, + "num_input_tokens_seen": 36383360, + "step": 62720 + }, + { + "epoch": 9.342418826333036, + "grad_norm": 42.983848571777344, + "learning_rate": 3.213271717137475e-05, + "loss": 0.1045, + "num_input_tokens_seen": 36385984, + "step": 62725 + }, + { + "epoch": 9.343163538873995, + "grad_norm": 29.356813430786133, + "learning_rate": 3.2129602749670674e-05, + "loss": 0.0394, + "num_input_tokens_seen": 36388832, + "step": 62730 + }, + { + "epoch": 9.343908251414954, + "grad_norm": 5.27304220199585, + "learning_rate": 3.212648820751921e-05, + "loss": 0.0015, + "num_input_tokens_seen": 36391968, + "step": 62735 + }, + { + "epoch": 9.344652963955912, + "grad_norm": 0.005532593932002783, + "learning_rate": 3.212337354497296e-05, + "loss": 0.0336, + "num_input_tokens_seen": 36394688, + "step": 62740 + }, + { + "epoch": 9.345397676496873, + "grad_norm": 0.03722986951470375, + "learning_rate": 3.2120258762084565e-05, + "loss": 0.1442, + "num_input_tokens_seen": 36397408, + "step": 62745 + }, + { + "epoch": 9.346142389037832, + "grad_norm": 0.23119880259037018, + "learning_rate": 3.211714385890663e-05, + "loss": 0.1756, + "num_input_tokens_seen": 36400512, + "step": 62750 + }, + { + "epoch": 9.34688710157879, + "grad_norm": 0.026845529675483704, + "learning_rate": 3.2114028835491786e-05, + "loss": 0.0005, + "num_input_tokens_seen": 36403296, + "step": 62755 + }, + { + "epoch": 9.34763181411975, + "grad_norm": 0.031336672604084015, + "learning_rate": 3.211091369189265e-05, + "loss": 0.0048, + "num_input_tokens_seen": 36405984, + "step": 62760 + }, + { + "epoch": 9.34837652666071, + "grad_norm": 0.19073733687400818, + "learning_rate": 3.210779842816185e-05, + "loss": 0.0004, + "num_input_tokens_seen": 36408928, + "step": 62765 + }, + { + "epoch": 9.349121239201668, + "grad_norm": 0.011571747250854969, + "learning_rate": 3.2104683044352025e-05, + "loss": 0.248, + "num_input_tokens_seen": 36411744, + "step": 62770 + }, + { + "epoch": 9.349865951742627, + "grad_norm": 0.1771724969148636, + "learning_rate": 3.210156754051581e-05, + "loss": 0.0113, + "num_input_tokens_seen": 36414592, + "step": 62775 + }, + { + "epoch": 9.350610664283586, + "grad_norm": 2.80241060256958, + "learning_rate": 3.2098451916705815e-05, + "loss": 0.1739, + "num_input_tokens_seen": 36417696, + "step": 62780 + }, + { + "epoch": 9.351355376824547, + "grad_norm": 0.07542340457439423, + "learning_rate": 3.20953361729747e-05, + "loss": 0.1009, + "num_input_tokens_seen": 36420448, + "step": 62785 + }, + { + "epoch": 9.352100089365505, + "grad_norm": 1.4659940004348755, + "learning_rate": 3.209222030937509e-05, + "loss": 0.2356, + "num_input_tokens_seen": 36423424, + "step": 62790 + }, + { + "epoch": 9.352844801906464, + "grad_norm": 24.862865447998047, + "learning_rate": 3.208910432595962e-05, + "loss": 0.3329, + "num_input_tokens_seen": 36426336, + "step": 62795 + }, + { + "epoch": 9.353589514447423, + "grad_norm": 0.07764377444982529, + "learning_rate": 3.208598822278094e-05, + "loss": 0.1013, + "num_input_tokens_seen": 36429024, + "step": 62800 + }, + { + "epoch": 9.354334226988382, + "grad_norm": 0.017753634601831436, + "learning_rate": 3.208287199989169e-05, + "loss": 0.0694, + "num_input_tokens_seen": 36432160, + "step": 62805 + }, + { + "epoch": 9.355078939529342, + "grad_norm": 0.011210242286324501, + "learning_rate": 3.207975565734452e-05, + "loss": 0.0621, + "num_input_tokens_seen": 36435104, + "step": 62810 + }, + { + "epoch": 9.3558236520703, + "grad_norm": 0.0058076828718185425, + "learning_rate": 3.207663919519207e-05, + "loss": 0.0059, + "num_input_tokens_seen": 36438016, + "step": 62815 + }, + { + "epoch": 9.35656836461126, + "grad_norm": 4.936497688293457, + "learning_rate": 3.2073522613486994e-05, + "loss": 0.1515, + "num_input_tokens_seen": 36440864, + "step": 62820 + }, + { + "epoch": 9.357313077152218, + "grad_norm": 233.44143676757812, + "learning_rate": 3.207040591228194e-05, + "loss": 0.1225, + "num_input_tokens_seen": 36443680, + "step": 62825 + }, + { + "epoch": 9.358057789693179, + "grad_norm": 41.95170974731445, + "learning_rate": 3.206728909162957e-05, + "loss": 0.3299, + "num_input_tokens_seen": 36446496, + "step": 62830 + }, + { + "epoch": 9.358802502234138, + "grad_norm": 35.12147521972656, + "learning_rate": 3.206417215158253e-05, + "loss": 0.1744, + "num_input_tokens_seen": 36449248, + "step": 62835 + }, + { + "epoch": 9.359547214775096, + "grad_norm": 0.124161496758461, + "learning_rate": 3.206105509219348e-05, + "loss": 0.1154, + "num_input_tokens_seen": 36452096, + "step": 62840 + }, + { + "epoch": 9.360291927316055, + "grad_norm": 0.03298168629407883, + "learning_rate": 3.205793791351509e-05, + "loss": 0.1691, + "num_input_tokens_seen": 36455392, + "step": 62845 + }, + { + "epoch": 9.361036639857016, + "grad_norm": 0.040988706052303314, + "learning_rate": 3.2054820615600003e-05, + "loss": 0.0181, + "num_input_tokens_seen": 36458112, + "step": 62850 + }, + { + "epoch": 9.361781352397974, + "grad_norm": 2.086239814758301, + "learning_rate": 3.2051703198500896e-05, + "loss": 0.0695, + "num_input_tokens_seen": 36461088, + "step": 62855 + }, + { + "epoch": 9.362526064938933, + "grad_norm": 16.14040756225586, + "learning_rate": 3.2048585662270425e-05, + "loss": 0.3036, + "num_input_tokens_seen": 36464608, + "step": 62860 + }, + { + "epoch": 9.363270777479892, + "grad_norm": 0.009498645551502705, + "learning_rate": 3.204546800696127e-05, + "loss": 0.0052, + "num_input_tokens_seen": 36467488, + "step": 62865 + }, + { + "epoch": 9.364015490020853, + "grad_norm": 0.10843406617641449, + "learning_rate": 3.2042350232626086e-05, + "loss": 0.0026, + "num_input_tokens_seen": 36470592, + "step": 62870 + }, + { + "epoch": 9.364760202561811, + "grad_norm": 16.772119522094727, + "learning_rate": 3.203923233931757e-05, + "loss": 0.0443, + "num_input_tokens_seen": 36473856, + "step": 62875 + }, + { + "epoch": 9.36550491510277, + "grad_norm": 0.3675896227359772, + "learning_rate": 3.2036114327088354e-05, + "loss": 0.0024, + "num_input_tokens_seen": 36476768, + "step": 62880 + }, + { + "epoch": 9.366249627643729, + "grad_norm": 59.374637603759766, + "learning_rate": 3.203299619599115e-05, + "loss": 0.3919, + "num_input_tokens_seen": 36479840, + "step": 62885 + }, + { + "epoch": 9.36699434018469, + "grad_norm": 93.43061828613281, + "learning_rate": 3.2029877946078624e-05, + "loss": 0.1004, + "num_input_tokens_seen": 36483488, + "step": 62890 + }, + { + "epoch": 9.367739052725648, + "grad_norm": 9.342999458312988, + "learning_rate": 3.2026759577403445e-05, + "loss": 0.0518, + "num_input_tokens_seen": 36486080, + "step": 62895 + }, + { + "epoch": 9.368483765266607, + "grad_norm": 0.03331737965345383, + "learning_rate": 3.202364109001831e-05, + "loss": 0.1555, + "num_input_tokens_seen": 36489312, + "step": 62900 + }, + { + "epoch": 9.369228477807566, + "grad_norm": 142.13771057128906, + "learning_rate": 3.2020522483975906e-05, + "loss": 0.0612, + "num_input_tokens_seen": 36492256, + "step": 62905 + }, + { + "epoch": 9.369973190348526, + "grad_norm": 0.020192647352814674, + "learning_rate": 3.201740375932891e-05, + "loss": 0.0859, + "num_input_tokens_seen": 36495040, + "step": 62910 + }, + { + "epoch": 9.370717902889485, + "grad_norm": 12.599271774291992, + "learning_rate": 3.201428491613e-05, + "loss": 0.3662, + "num_input_tokens_seen": 36497952, + "step": 62915 + }, + { + "epoch": 9.371462615430444, + "grad_norm": 0.395588219165802, + "learning_rate": 3.2011165954431873e-05, + "loss": 0.0592, + "num_input_tokens_seen": 36500992, + "step": 62920 + }, + { + "epoch": 9.372207327971402, + "grad_norm": 0.05165427178144455, + "learning_rate": 3.200804687428724e-05, + "loss": 0.1694, + "num_input_tokens_seen": 36503616, + "step": 62925 + }, + { + "epoch": 9.372952040512363, + "grad_norm": 0.011426066048443317, + "learning_rate": 3.200492767574876e-05, + "loss": 0.1723, + "num_input_tokens_seen": 36506368, + "step": 62930 + }, + { + "epoch": 9.373696753053322, + "grad_norm": 0.052656736224889755, + "learning_rate": 3.200180835886915e-05, + "loss": 0.0366, + "num_input_tokens_seen": 36509216, + "step": 62935 + }, + { + "epoch": 9.37444146559428, + "grad_norm": 0.011575262062251568, + "learning_rate": 3.199868892370111e-05, + "loss": 0.0007, + "num_input_tokens_seen": 36511936, + "step": 62940 + }, + { + "epoch": 9.37518617813524, + "grad_norm": 0.026943983510136604, + "learning_rate": 3.199556937029734e-05, + "loss": 0.0032, + "num_input_tokens_seen": 36514848, + "step": 62945 + }, + { + "epoch": 9.3759308906762, + "grad_norm": 0.9306795001029968, + "learning_rate": 3.199244969871052e-05, + "loss": 0.1235, + "num_input_tokens_seen": 36518016, + "step": 62950 + }, + { + "epoch": 9.376675603217159, + "grad_norm": 33.646690368652344, + "learning_rate": 3.198932990899337e-05, + "loss": 0.0877, + "num_input_tokens_seen": 36520960, + "step": 62955 + }, + { + "epoch": 9.377420315758117, + "grad_norm": 0.761446475982666, + "learning_rate": 3.19862100011986e-05, + "loss": 0.0358, + "num_input_tokens_seen": 36523840, + "step": 62960 + }, + { + "epoch": 9.378165028299076, + "grad_norm": 35.62543869018555, + "learning_rate": 3.198308997537891e-05, + "loss": 0.0095, + "num_input_tokens_seen": 36526752, + "step": 62965 + }, + { + "epoch": 9.378909740840037, + "grad_norm": 38.53931427001953, + "learning_rate": 3.1979969831587014e-05, + "loss": 0.5146, + "num_input_tokens_seen": 36529984, + "step": 62970 + }, + { + "epoch": 9.379654453380995, + "grad_norm": 0.06507202237844467, + "learning_rate": 3.1976849569875624e-05, + "loss": 0.0758, + "num_input_tokens_seen": 36532864, + "step": 62975 + }, + { + "epoch": 9.380399165921954, + "grad_norm": 12.699991226196289, + "learning_rate": 3.197372919029745e-05, + "loss": 0.0028, + "num_input_tokens_seen": 36535520, + "step": 62980 + }, + { + "epoch": 9.381143878462913, + "grad_norm": 0.05716953054070473, + "learning_rate": 3.1970608692905216e-05, + "loss": 0.1045, + "num_input_tokens_seen": 36538368, + "step": 62985 + }, + { + "epoch": 9.381888591003872, + "grad_norm": 0.18302622437477112, + "learning_rate": 3.196748807775162e-05, + "loss": 0.1691, + "num_input_tokens_seen": 36541408, + "step": 62990 + }, + { + "epoch": 9.382633303544832, + "grad_norm": 95.70148468017578, + "learning_rate": 3.19643673448894e-05, + "loss": 0.1442, + "num_input_tokens_seen": 36544256, + "step": 62995 + }, + { + "epoch": 9.383378016085791, + "grad_norm": 0.013182485476136208, + "learning_rate": 3.1961246494371275e-05, + "loss": 0.0004, + "num_input_tokens_seen": 36547008, + "step": 63000 + }, + { + "epoch": 9.38412272862675, + "grad_norm": 0.040623221546411514, + "learning_rate": 3.195812552624996e-05, + "loss": 0.0012, + "num_input_tokens_seen": 36550240, + "step": 63005 + }, + { + "epoch": 9.384867441167708, + "grad_norm": 0.018209895119071007, + "learning_rate": 3.1955004440578196e-05, + "loss": 0.0005, + "num_input_tokens_seen": 36552992, + "step": 63010 + }, + { + "epoch": 9.385612153708669, + "grad_norm": 0.007504285778850317, + "learning_rate": 3.195188323740869e-05, + "loss": 0.0005, + "num_input_tokens_seen": 36555712, + "step": 63015 + }, + { + "epoch": 9.386356866249628, + "grad_norm": 38.330204010009766, + "learning_rate": 3.194876191679418e-05, + "loss": 0.4263, + "num_input_tokens_seen": 36558624, + "step": 63020 + }, + { + "epoch": 9.387101578790586, + "grad_norm": 0.11227139085531235, + "learning_rate": 3.194564047878742e-05, + "loss": 0.1537, + "num_input_tokens_seen": 36561344, + "step": 63025 + }, + { + "epoch": 9.387846291331545, + "grad_norm": 0.05100207030773163, + "learning_rate": 3.19425189234411e-05, + "loss": 0.2074, + "num_input_tokens_seen": 36564320, + "step": 63030 + }, + { + "epoch": 9.388591003872506, + "grad_norm": 0.02613593079149723, + "learning_rate": 3.193939725080799e-05, + "loss": 0.0019, + "num_input_tokens_seen": 36567200, + "step": 63035 + }, + { + "epoch": 9.389335716413465, + "grad_norm": 0.02825135365128517, + "learning_rate": 3.1936275460940815e-05, + "loss": 0.0005, + "num_input_tokens_seen": 36570080, + "step": 63040 + }, + { + "epoch": 9.390080428954423, + "grad_norm": 206.2106475830078, + "learning_rate": 3.193315355389231e-05, + "loss": 0.0573, + "num_input_tokens_seen": 36573056, + "step": 63045 + }, + { + "epoch": 9.390825141495382, + "grad_norm": 0.016868019476532936, + "learning_rate": 3.1930031529715234e-05, + "loss": 0.0922, + "num_input_tokens_seen": 36576096, + "step": 63050 + }, + { + "epoch": 9.391569854036343, + "grad_norm": 0.3673272430896759, + "learning_rate": 3.192690938846231e-05, + "loss": 0.0007, + "num_input_tokens_seen": 36578912, + "step": 63055 + }, + { + "epoch": 9.392314566577301, + "grad_norm": 0.002050848910585046, + "learning_rate": 3.1923787130186286e-05, + "loss": 0.0378, + "num_input_tokens_seen": 36581696, + "step": 63060 + }, + { + "epoch": 9.39305927911826, + "grad_norm": 0.38985979557037354, + "learning_rate": 3.1920664754939936e-05, + "loss": 0.1452, + "num_input_tokens_seen": 36584736, + "step": 63065 + }, + { + "epoch": 9.393803991659219, + "grad_norm": 0.7583010792732239, + "learning_rate": 3.1917542262775975e-05, + "loss": 0.0009, + "num_input_tokens_seen": 36587712, + "step": 63070 + }, + { + "epoch": 9.39454870420018, + "grad_norm": 42.98545837402344, + "learning_rate": 3.191441965374717e-05, + "loss": 0.3528, + "num_input_tokens_seen": 36590304, + "step": 63075 + }, + { + "epoch": 9.395293416741138, + "grad_norm": 77.93669128417969, + "learning_rate": 3.191129692790627e-05, + "loss": 0.1936, + "num_input_tokens_seen": 36593056, + "step": 63080 + }, + { + "epoch": 9.396038129282097, + "grad_norm": 0.09985777735710144, + "learning_rate": 3.190817408530604e-05, + "loss": 0.0253, + "num_input_tokens_seen": 36595808, + "step": 63085 + }, + { + "epoch": 9.396782841823056, + "grad_norm": 27.20440673828125, + "learning_rate": 3.190505112599922e-05, + "loss": 0.1276, + "num_input_tokens_seen": 36598784, + "step": 63090 + }, + { + "epoch": 9.397527554364016, + "grad_norm": 0.007996046915650368, + "learning_rate": 3.190192805003858e-05, + "loss": 0.0003, + "num_input_tokens_seen": 36601504, + "step": 63095 + }, + { + "epoch": 9.398272266904975, + "grad_norm": 61.07696533203125, + "learning_rate": 3.189880485747688e-05, + "loss": 0.1153, + "num_input_tokens_seen": 36604288, + "step": 63100 + }, + { + "epoch": 9.399016979445934, + "grad_norm": 3.544229030609131, + "learning_rate": 3.1895681548366896e-05, + "loss": 0.0079, + "num_input_tokens_seen": 36607104, + "step": 63105 + }, + { + "epoch": 9.399761691986892, + "grad_norm": 0.09275703132152557, + "learning_rate": 3.189255812276137e-05, + "loss": 0.0015, + "num_input_tokens_seen": 36609952, + "step": 63110 + }, + { + "epoch": 9.400506404527853, + "grad_norm": 0.015182212926447392, + "learning_rate": 3.188943458071308e-05, + "loss": 0.0005, + "num_input_tokens_seen": 36612704, + "step": 63115 + }, + { + "epoch": 9.401251117068812, + "grad_norm": 0.013137930072844028, + "learning_rate": 3.18863109222748e-05, + "loss": 0.3964, + "num_input_tokens_seen": 36615360, + "step": 63120 + }, + { + "epoch": 9.40199582960977, + "grad_norm": 0.0056013246066868305, + "learning_rate": 3.188318714749929e-05, + "loss": 0.0312, + "num_input_tokens_seen": 36618528, + "step": 63125 + }, + { + "epoch": 9.40274054215073, + "grad_norm": 0.007946492172777653, + "learning_rate": 3.188006325643934e-05, + "loss": 0.1138, + "num_input_tokens_seen": 36621472, + "step": 63130 + }, + { + "epoch": 9.40348525469169, + "grad_norm": 1.407841444015503, + "learning_rate": 3.1876939249147694e-05, + "loss": 0.0345, + "num_input_tokens_seen": 36624256, + "step": 63135 + }, + { + "epoch": 9.404229967232649, + "grad_norm": 59.686798095703125, + "learning_rate": 3.187381512567717e-05, + "loss": 0.2728, + "num_input_tokens_seen": 36627168, + "step": 63140 + }, + { + "epoch": 9.404974679773607, + "grad_norm": 0.031411848962306976, + "learning_rate": 3.1870690886080515e-05, + "loss": 0.0084, + "num_input_tokens_seen": 36630144, + "step": 63145 + }, + { + "epoch": 9.405719392314566, + "grad_norm": 0.012424707412719727, + "learning_rate": 3.186756653041053e-05, + "loss": 0.0599, + "num_input_tokens_seen": 36633024, + "step": 63150 + }, + { + "epoch": 9.406464104855527, + "grad_norm": 0.01476286817342043, + "learning_rate": 3.186444205871997e-05, + "loss": 0.0007, + "num_input_tokens_seen": 36635904, + "step": 63155 + }, + { + "epoch": 9.407208817396485, + "grad_norm": 21.848703384399414, + "learning_rate": 3.186131747106165e-05, + "loss": 0.0896, + "num_input_tokens_seen": 36638816, + "step": 63160 + }, + { + "epoch": 9.407953529937444, + "grad_norm": 0.0017484140116721392, + "learning_rate": 3.1858192767488345e-05, + "loss": 0.2503, + "num_input_tokens_seen": 36641600, + "step": 63165 + }, + { + "epoch": 9.408698242478403, + "grad_norm": 0.19741615653038025, + "learning_rate": 3.185506794805284e-05, + "loss": 0.1362, + "num_input_tokens_seen": 36644384, + "step": 63170 + }, + { + "epoch": 9.409442955019362, + "grad_norm": 0.0054203481413424015, + "learning_rate": 3.1851943012807934e-05, + "loss": 0.0039, + "num_input_tokens_seen": 36647104, + "step": 63175 + }, + { + "epoch": 9.410187667560322, + "grad_norm": 0.005524516571313143, + "learning_rate": 3.184881796180641e-05, + "loss": 0.0629, + "num_input_tokens_seen": 36649824, + "step": 63180 + }, + { + "epoch": 9.410932380101281, + "grad_norm": 0.06908900290727615, + "learning_rate": 3.184569279510107e-05, + "loss": 0.0006, + "num_input_tokens_seen": 36652800, + "step": 63185 + }, + { + "epoch": 9.41167709264224, + "grad_norm": 1.2835428714752197, + "learning_rate": 3.184256751274471e-05, + "loss": 0.0023, + "num_input_tokens_seen": 36655584, + "step": 63190 + }, + { + "epoch": 9.412421805183198, + "grad_norm": 0.03816170245409012, + "learning_rate": 3.183944211479012e-05, + "loss": 0.4247, + "num_input_tokens_seen": 36658368, + "step": 63195 + }, + { + "epoch": 9.413166517724159, + "grad_norm": 0.03641422465443611, + "learning_rate": 3.183631660129011e-05, + "loss": 0.1995, + "num_input_tokens_seen": 36661184, + "step": 63200 + }, + { + "epoch": 9.413911230265118, + "grad_norm": 0.06345541775226593, + "learning_rate": 3.183319097229748e-05, + "loss": 0.0067, + "num_input_tokens_seen": 36664160, + "step": 63205 + }, + { + "epoch": 9.414655942806077, + "grad_norm": 0.03754318132996559, + "learning_rate": 3.183006522786504e-05, + "loss": 0.0003, + "num_input_tokens_seen": 36666976, + "step": 63210 + }, + { + "epoch": 9.415400655347035, + "grad_norm": 14.485478401184082, + "learning_rate": 3.182693936804558e-05, + "loss": 0.3156, + "num_input_tokens_seen": 36669984, + "step": 63215 + }, + { + "epoch": 9.416145367887996, + "grad_norm": 62.3018798828125, + "learning_rate": 3.182381339289192e-05, + "loss": 0.0821, + "num_input_tokens_seen": 36673088, + "step": 63220 + }, + { + "epoch": 9.416890080428955, + "grad_norm": 1.7518199682235718, + "learning_rate": 3.182068730245686e-05, + "loss": 0.136, + "num_input_tokens_seen": 36675648, + "step": 63225 + }, + { + "epoch": 9.417634792969913, + "grad_norm": 1.3125332593917847, + "learning_rate": 3.181756109679324e-05, + "loss": 0.0257, + "num_input_tokens_seen": 36678368, + "step": 63230 + }, + { + "epoch": 9.418379505510872, + "grad_norm": 59.68495559692383, + "learning_rate": 3.1814434775953837e-05, + "loss": 0.1731, + "num_input_tokens_seen": 36681120, + "step": 63235 + }, + { + "epoch": 9.419124218051833, + "grad_norm": 0.004004232585430145, + "learning_rate": 3.18113083399915e-05, + "loss": 0.2148, + "num_input_tokens_seen": 36683840, + "step": 63240 + }, + { + "epoch": 9.419868930592791, + "grad_norm": 13.201123237609863, + "learning_rate": 3.180818178895901e-05, + "loss": 0.2301, + "num_input_tokens_seen": 36686880, + "step": 63245 + }, + { + "epoch": 9.42061364313375, + "grad_norm": 8.352620124816895, + "learning_rate": 3.180505512290922e-05, + "loss": 0.09, + "num_input_tokens_seen": 36689568, + "step": 63250 + }, + { + "epoch": 9.421358355674709, + "grad_norm": 0.019972506910562515, + "learning_rate": 3.1801928341894943e-05, + "loss": 0.0013, + "num_input_tokens_seen": 36692288, + "step": 63255 + }, + { + "epoch": 9.42210306821567, + "grad_norm": 0.06763890385627747, + "learning_rate": 3.1798801445968993e-05, + "loss": 0.125, + "num_input_tokens_seen": 36695392, + "step": 63260 + }, + { + "epoch": 9.422847780756628, + "grad_norm": 0.02028120495378971, + "learning_rate": 3.179567443518421e-05, + "loss": 0.1386, + "num_input_tokens_seen": 36698176, + "step": 63265 + }, + { + "epoch": 9.423592493297587, + "grad_norm": 0.01960582472383976, + "learning_rate": 3.1792547309593415e-05, + "loss": 0.0031, + "num_input_tokens_seen": 36700992, + "step": 63270 + }, + { + "epoch": 9.424337205838546, + "grad_norm": 0.22210465371608734, + "learning_rate": 3.178942006924943e-05, + "loss": 0.0128, + "num_input_tokens_seen": 36703680, + "step": 63275 + }, + { + "epoch": 9.425081918379506, + "grad_norm": 0.05864161252975464, + "learning_rate": 3.17862927142051e-05, + "loss": 0.0666, + "num_input_tokens_seen": 36706496, + "step": 63280 + }, + { + "epoch": 9.425826630920465, + "grad_norm": 0.0157152246683836, + "learning_rate": 3.178316524451325e-05, + "loss": 0.0067, + "num_input_tokens_seen": 36709408, + "step": 63285 + }, + { + "epoch": 9.426571343461424, + "grad_norm": 0.010737631469964981, + "learning_rate": 3.178003766022671e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36712256, + "step": 63290 + }, + { + "epoch": 9.427316056002383, + "grad_norm": 97.02487182617188, + "learning_rate": 3.177690996139833e-05, + "loss": 0.2137, + "num_input_tokens_seen": 36715168, + "step": 63295 + }, + { + "epoch": 9.428060768543343, + "grad_norm": 0.5335872173309326, + "learning_rate": 3.177378214808094e-05, + "loss": 0.1468, + "num_input_tokens_seen": 36718208, + "step": 63300 + }, + { + "epoch": 9.428805481084302, + "grad_norm": 0.015012111514806747, + "learning_rate": 3.177065422032739e-05, + "loss": 0.1704, + "num_input_tokens_seen": 36721344, + "step": 63305 + }, + { + "epoch": 9.42955019362526, + "grad_norm": 0.13720105588436127, + "learning_rate": 3.176752617819052e-05, + "loss": 0.0028, + "num_input_tokens_seen": 36724064, + "step": 63310 + }, + { + "epoch": 9.43029490616622, + "grad_norm": 0.02230824902653694, + "learning_rate": 3.1764398021723175e-05, + "loss": 0.2819, + "num_input_tokens_seen": 36727168, + "step": 63315 + }, + { + "epoch": 9.43103961870718, + "grad_norm": 0.005512113217264414, + "learning_rate": 3.17612697509782e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36729984, + "step": 63320 + }, + { + "epoch": 9.431784331248139, + "grad_norm": 0.03903729096055031, + "learning_rate": 3.1758141366008434e-05, + "loss": 0.0383, + "num_input_tokens_seen": 36732704, + "step": 63325 + }, + { + "epoch": 9.432529043789097, + "grad_norm": 62.72960662841797, + "learning_rate": 3.175501286686674e-05, + "loss": 0.0634, + "num_input_tokens_seen": 36735488, + "step": 63330 + }, + { + "epoch": 9.433273756330056, + "grad_norm": 0.012531782500445843, + "learning_rate": 3.1751884253605974e-05, + "loss": 0.1424, + "num_input_tokens_seen": 36738368, + "step": 63335 + }, + { + "epoch": 9.434018468871017, + "grad_norm": 0.012453502975404263, + "learning_rate": 3.174875552627899e-05, + "loss": 0.1044, + "num_input_tokens_seen": 36741152, + "step": 63340 + }, + { + "epoch": 9.434763181411975, + "grad_norm": 0.0014881249517202377, + "learning_rate": 3.174562668493863e-05, + "loss": 0.1243, + "num_input_tokens_seen": 36743872, + "step": 63345 + }, + { + "epoch": 9.435507893952934, + "grad_norm": 0.002089455258101225, + "learning_rate": 3.1742497729637774e-05, + "loss": 0.0, + "num_input_tokens_seen": 36746784, + "step": 63350 + }, + { + "epoch": 9.436252606493893, + "grad_norm": 11.775252342224121, + "learning_rate": 3.173936866042927e-05, + "loss": 0.1012, + "num_input_tokens_seen": 36749504, + "step": 63355 + }, + { + "epoch": 9.436997319034852, + "grad_norm": 0.0012424970045685768, + "learning_rate": 3.173623947736597e-05, + "loss": 0.0004, + "num_input_tokens_seen": 36752384, + "step": 63360 + }, + { + "epoch": 9.437742031575812, + "grad_norm": 1.005889892578125, + "learning_rate": 3.1733110180500766e-05, + "loss": 0.0044, + "num_input_tokens_seen": 36755104, + "step": 63365 + }, + { + "epoch": 9.438486744116771, + "grad_norm": 23.173276901245117, + "learning_rate": 3.17299807698865e-05, + "loss": 0.1982, + "num_input_tokens_seen": 36757664, + "step": 63370 + }, + { + "epoch": 9.43923145665773, + "grad_norm": 0.030493052676320076, + "learning_rate": 3.1726851245576056e-05, + "loss": 0.0012, + "num_input_tokens_seen": 36760704, + "step": 63375 + }, + { + "epoch": 9.439976169198689, + "grad_norm": 0.04596972465515137, + "learning_rate": 3.172372160762229e-05, + "loss": 0.1909, + "num_input_tokens_seen": 36763552, + "step": 63380 + }, + { + "epoch": 9.440720881739649, + "grad_norm": 0.00651154201477766, + "learning_rate": 3.172059185607808e-05, + "loss": 0.4192, + "num_input_tokens_seen": 36766176, + "step": 63385 + }, + { + "epoch": 9.441465594280608, + "grad_norm": 0.009724795818328857, + "learning_rate": 3.171746199099631e-05, + "loss": 0.5539, + "num_input_tokens_seen": 36768960, + "step": 63390 + }, + { + "epoch": 9.442210306821567, + "grad_norm": 0.010672515258193016, + "learning_rate": 3.171433201242984e-05, + "loss": 0.0007, + "num_input_tokens_seen": 36771840, + "step": 63395 + }, + { + "epoch": 9.442955019362525, + "grad_norm": 42.7931022644043, + "learning_rate": 3.1711201920431556e-05, + "loss": 0.3865, + "num_input_tokens_seen": 36774464, + "step": 63400 + }, + { + "epoch": 9.443699731903486, + "grad_norm": 0.006620747968554497, + "learning_rate": 3.170807171505434e-05, + "loss": 0.0996, + "num_input_tokens_seen": 36777408, + "step": 63405 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.05325470492243767, + "learning_rate": 3.1704941396351064e-05, + "loss": 0.0003, + "num_input_tokens_seen": 36780256, + "step": 63410 + }, + { + "epoch": 9.445189156985403, + "grad_norm": 10.142504692077637, + "learning_rate": 3.1701810964374626e-05, + "loss": 0.0029, + "num_input_tokens_seen": 36783296, + "step": 63415 + }, + { + "epoch": 9.445933869526362, + "grad_norm": 0.014220857992768288, + "learning_rate": 3.169868041917789e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36786176, + "step": 63420 + }, + { + "epoch": 9.446678582067323, + "grad_norm": 0.004192049149423838, + "learning_rate": 3.1695549760813764e-05, + "loss": 0.1195, + "num_input_tokens_seen": 36789152, + "step": 63425 + }, + { + "epoch": 9.447423294608281, + "grad_norm": 0.003938106819987297, + "learning_rate": 3.169241898933514e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36791840, + "step": 63430 + }, + { + "epoch": 9.44816800714924, + "grad_norm": 0.017790205776691437, + "learning_rate": 3.168928810479488e-05, + "loss": 0.0065, + "num_input_tokens_seen": 36794880, + "step": 63435 + }, + { + "epoch": 9.448912719690199, + "grad_norm": 0.09300550073385239, + "learning_rate": 3.1686157107245915e-05, + "loss": 0.0004, + "num_input_tokens_seen": 36797856, + "step": 63440 + }, + { + "epoch": 9.44965743223116, + "grad_norm": 0.007991093210875988, + "learning_rate": 3.1683025996741104e-05, + "loss": 0.0003, + "num_input_tokens_seen": 36800640, + "step": 63445 + }, + { + "epoch": 9.450402144772118, + "grad_norm": 0.055006738752126694, + "learning_rate": 3.167989477333337e-05, + "loss": 0.1793, + "num_input_tokens_seen": 36803552, + "step": 63450 + }, + { + "epoch": 9.451146857313077, + "grad_norm": 18.456228256225586, + "learning_rate": 3.16767634370756e-05, + "loss": 0.4074, + "num_input_tokens_seen": 36806432, + "step": 63455 + }, + { + "epoch": 9.451891569854036, + "grad_norm": 6.009599208831787, + "learning_rate": 3.16736319880207e-05, + "loss": 0.2325, + "num_input_tokens_seen": 36809536, + "step": 63460 + }, + { + "epoch": 9.452636282394996, + "grad_norm": 0.1914140284061432, + "learning_rate": 3.1670500426221566e-05, + "loss": 0.1518, + "num_input_tokens_seen": 36812672, + "step": 63465 + }, + { + "epoch": 9.453380994935955, + "grad_norm": 0.036058343946933746, + "learning_rate": 3.1667368751731116e-05, + "loss": 0.2117, + "num_input_tokens_seen": 36815424, + "step": 63470 + }, + { + "epoch": 9.454125707476914, + "grad_norm": 0.06183118000626564, + "learning_rate": 3.1664236964602244e-05, + "loss": 0.0006, + "num_input_tokens_seen": 36818304, + "step": 63475 + }, + { + "epoch": 9.454870420017873, + "grad_norm": 0.01023353636264801, + "learning_rate": 3.166110506488786e-05, + "loss": 0.22, + "num_input_tokens_seen": 36821088, + "step": 63480 + }, + { + "epoch": 9.455615132558833, + "grad_norm": 0.12897410988807678, + "learning_rate": 3.165797305264087e-05, + "loss": 0.0006, + "num_input_tokens_seen": 36823904, + "step": 63485 + }, + { + "epoch": 9.456359845099792, + "grad_norm": 0.003948806319385767, + "learning_rate": 3.1654840927914196e-05, + "loss": 0.0042, + "num_input_tokens_seen": 36826464, + "step": 63490 + }, + { + "epoch": 9.45710455764075, + "grad_norm": 0.011692414060235023, + "learning_rate": 3.165170869076075e-05, + "loss": 0.0518, + "num_input_tokens_seen": 36829088, + "step": 63495 + }, + { + "epoch": 9.45784927018171, + "grad_norm": 0.021255619823932648, + "learning_rate": 3.164857634123345e-05, + "loss": 0.0004, + "num_input_tokens_seen": 36831968, + "step": 63500 + }, + { + "epoch": 9.458593982722668, + "grad_norm": 0.009595086798071861, + "learning_rate": 3.1645443879385206e-05, + "loss": 0.0714, + "num_input_tokens_seen": 36835040, + "step": 63505 + }, + { + "epoch": 9.459338695263629, + "grad_norm": 0.04302860051393509, + "learning_rate": 3.164231130526894e-05, + "loss": 0.2326, + "num_input_tokens_seen": 36838336, + "step": 63510 + }, + { + "epoch": 9.460083407804587, + "grad_norm": 0.0016784792533144355, + "learning_rate": 3.163917861893758e-05, + "loss": 0.107, + "num_input_tokens_seen": 36841056, + "step": 63515 + }, + { + "epoch": 9.460828120345546, + "grad_norm": 0.019458286464214325, + "learning_rate": 3.1636045820444044e-05, + "loss": 0.0061, + "num_input_tokens_seen": 36843904, + "step": 63520 + }, + { + "epoch": 9.461572832886505, + "grad_norm": 8.9280366897583, + "learning_rate": 3.163291290984125e-05, + "loss": 0.0835, + "num_input_tokens_seen": 36846752, + "step": 63525 + }, + { + "epoch": 9.462317545427466, + "grad_norm": 0.008742385543882847, + "learning_rate": 3.162977988718214e-05, + "loss": 0.0004, + "num_input_tokens_seen": 36849824, + "step": 63530 + }, + { + "epoch": 9.463062257968424, + "grad_norm": 0.009719249792397022, + "learning_rate": 3.162664675251965e-05, + "loss": 0.3322, + "num_input_tokens_seen": 36852704, + "step": 63535 + }, + { + "epoch": 9.463806970509383, + "grad_norm": 0.2581998109817505, + "learning_rate": 3.162351350590668e-05, + "loss": 0.104, + "num_input_tokens_seen": 36855584, + "step": 63540 + }, + { + "epoch": 9.464551683050342, + "grad_norm": 0.011531653814017773, + "learning_rate": 3.1620380147396186e-05, + "loss": 0.3144, + "num_input_tokens_seen": 36858272, + "step": 63545 + }, + { + "epoch": 9.465296395591302, + "grad_norm": 0.02806927263736725, + "learning_rate": 3.1617246677041104e-05, + "loss": 0.0948, + "num_input_tokens_seen": 36861728, + "step": 63550 + }, + { + "epoch": 9.466041108132261, + "grad_norm": 0.06987467408180237, + "learning_rate": 3.161411309489436e-05, + "loss": 0.0928, + "num_input_tokens_seen": 36864896, + "step": 63555 + }, + { + "epoch": 9.46678582067322, + "grad_norm": 6.171281814575195, + "learning_rate": 3.161097940100889e-05, + "loss": 0.1129, + "num_input_tokens_seen": 36867872, + "step": 63560 + }, + { + "epoch": 9.467530533214179, + "grad_norm": 19.92437171936035, + "learning_rate": 3.160784559543765e-05, + "loss": 0.0436, + "num_input_tokens_seen": 36870944, + "step": 63565 + }, + { + "epoch": 9.46827524575514, + "grad_norm": 15.050707817077637, + "learning_rate": 3.160471167823358e-05, + "loss": 0.0024, + "num_input_tokens_seen": 36873856, + "step": 63570 + }, + { + "epoch": 9.469019958296098, + "grad_norm": 2.392271041870117, + "learning_rate": 3.1601577649449606e-05, + "loss": 0.0016, + "num_input_tokens_seen": 36876384, + "step": 63575 + }, + { + "epoch": 9.469764670837057, + "grad_norm": 0.015791986137628555, + "learning_rate": 3.15984435091387e-05, + "loss": 0.0376, + "num_input_tokens_seen": 36878976, + "step": 63580 + }, + { + "epoch": 9.470509383378015, + "grad_norm": 0.009187326766550541, + "learning_rate": 3.159530925735379e-05, + "loss": 0.0328, + "num_input_tokens_seen": 36881920, + "step": 63585 + }, + { + "epoch": 9.471254095918976, + "grad_norm": 4.469796180725098, + "learning_rate": 3.1592174894147835e-05, + "loss": 0.0118, + "num_input_tokens_seen": 36884736, + "step": 63590 + }, + { + "epoch": 9.471998808459935, + "grad_norm": 0.0642758384346962, + "learning_rate": 3.158904041957379e-05, + "loss": 0.0171, + "num_input_tokens_seen": 36887488, + "step": 63595 + }, + { + "epoch": 9.472743521000893, + "grad_norm": 0.001137027284130454, + "learning_rate": 3.1585905833684595e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36890752, + "step": 63600 + }, + { + "epoch": 9.473488233541852, + "grad_norm": 0.005295178387314081, + "learning_rate": 3.158277113653322e-05, + "loss": 0.0741, + "num_input_tokens_seen": 36893504, + "step": 63605 + }, + { + "epoch": 9.474232946082813, + "grad_norm": 0.018807770684361458, + "learning_rate": 3.157963632817261e-05, + "loss": 0.0005, + "num_input_tokens_seen": 36896192, + "step": 63610 + }, + { + "epoch": 9.474977658623772, + "grad_norm": 28.58445167541504, + "learning_rate": 3.157650140865574e-05, + "loss": 0.034, + "num_input_tokens_seen": 36899072, + "step": 63615 + }, + { + "epoch": 9.47572237116473, + "grad_norm": 0.5805764198303223, + "learning_rate": 3.157336637803556e-05, + "loss": 0.1743, + "num_input_tokens_seen": 36901888, + "step": 63620 + }, + { + "epoch": 9.476467083705689, + "grad_norm": 0.022402537986636162, + "learning_rate": 3.1570231236365035e-05, + "loss": 0.0071, + "num_input_tokens_seen": 36904544, + "step": 63625 + }, + { + "epoch": 9.47721179624665, + "grad_norm": 0.0029563126154243946, + "learning_rate": 3.156709598369713e-05, + "loss": 0.0952, + "num_input_tokens_seen": 36907584, + "step": 63630 + }, + { + "epoch": 9.477956508787608, + "grad_norm": 28.45254135131836, + "learning_rate": 3.1563960620084816e-05, + "loss": 0.1186, + "num_input_tokens_seen": 36910336, + "step": 63635 + }, + { + "epoch": 9.478701221328567, + "grad_norm": 0.05737672001123428, + "learning_rate": 3.1560825145581056e-05, + "loss": 0.3242, + "num_input_tokens_seen": 36912928, + "step": 63640 + }, + { + "epoch": 9.479445933869526, + "grad_norm": 0.03276189789175987, + "learning_rate": 3.155768956023882e-05, + "loss": 0.0949, + "num_input_tokens_seen": 36915744, + "step": 63645 + }, + { + "epoch": 9.480190646410486, + "grad_norm": 0.14425432682037354, + "learning_rate": 3.155455386411109e-05, + "loss": 0.0005, + "num_input_tokens_seen": 36918496, + "step": 63650 + }, + { + "epoch": 9.480935358951445, + "grad_norm": 0.016896329820156097, + "learning_rate": 3.1551418057250835e-05, + "loss": 0.088, + "num_input_tokens_seen": 36921664, + "step": 63655 + }, + { + "epoch": 9.481680071492404, + "grad_norm": 6.430524826049805, + "learning_rate": 3.1548282139711025e-05, + "loss": 0.2676, + "num_input_tokens_seen": 36924608, + "step": 63660 + }, + { + "epoch": 9.482424784033363, + "grad_norm": 0.0869205892086029, + "learning_rate": 3.154514611154464e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36927680, + "step": 63665 + }, + { + "epoch": 9.483169496574323, + "grad_norm": 0.04457123205065727, + "learning_rate": 3.154200997280468e-05, + "loss": 0.277, + "num_input_tokens_seen": 36930528, + "step": 63670 + }, + { + "epoch": 9.483914209115282, + "grad_norm": 0.029565032571554184, + "learning_rate": 3.1538873723544105e-05, + "loss": 0.0093, + "num_input_tokens_seen": 36933440, + "step": 63675 + }, + { + "epoch": 9.48465892165624, + "grad_norm": 0.0073043471202254295, + "learning_rate": 3.1535737363815896e-05, + "loss": 0.0006, + "num_input_tokens_seen": 36936000, + "step": 63680 + }, + { + "epoch": 9.4854036341972, + "grad_norm": 2.454864978790283, + "learning_rate": 3.1532600893673045e-05, + "loss": 0.001, + "num_input_tokens_seen": 36939424, + "step": 63685 + }, + { + "epoch": 9.486148346738158, + "grad_norm": 114.26742553710938, + "learning_rate": 3.152946431316855e-05, + "loss": 0.1099, + "num_input_tokens_seen": 36942240, + "step": 63690 + }, + { + "epoch": 9.486893059279119, + "grad_norm": 0.017618872225284576, + "learning_rate": 3.152632762235539e-05, + "loss": 0.1725, + "num_input_tokens_seen": 36944960, + "step": 63695 + }, + { + "epoch": 9.487637771820078, + "grad_norm": 88.98099517822266, + "learning_rate": 3.152319082128656e-05, + "loss": 0.2072, + "num_input_tokens_seen": 36947904, + "step": 63700 + }, + { + "epoch": 9.488382484361036, + "grad_norm": 247.96441650390625, + "learning_rate": 3.1520053910015046e-05, + "loss": 0.1305, + "num_input_tokens_seen": 36950912, + "step": 63705 + }, + { + "epoch": 9.489127196901995, + "grad_norm": 0.022176578640937805, + "learning_rate": 3.151691688859385e-05, + "loss": 0.1037, + "num_input_tokens_seen": 36953568, + "step": 63710 + }, + { + "epoch": 9.489871909442956, + "grad_norm": 2.454802989959717, + "learning_rate": 3.151377975707597e-05, + "loss": 0.453, + "num_input_tokens_seen": 36956672, + "step": 63715 + }, + { + "epoch": 9.490616621983914, + "grad_norm": 0.03736196830868721, + "learning_rate": 3.1510642515514393e-05, + "loss": 0.1501, + "num_input_tokens_seen": 36959776, + "step": 63720 + }, + { + "epoch": 9.491361334524873, + "grad_norm": 0.0473368838429451, + "learning_rate": 3.150750516396213e-05, + "loss": 0.0149, + "num_input_tokens_seen": 36962592, + "step": 63725 + }, + { + "epoch": 9.492106047065832, + "grad_norm": 142.093017578125, + "learning_rate": 3.1504367702472185e-05, + "loss": 0.5056, + "num_input_tokens_seen": 36965216, + "step": 63730 + }, + { + "epoch": 9.492850759606792, + "grad_norm": 0.474693238735199, + "learning_rate": 3.150123013109756e-05, + "loss": 0.0218, + "num_input_tokens_seen": 36968064, + "step": 63735 + }, + { + "epoch": 9.493595472147751, + "grad_norm": 0.03581438586115837, + "learning_rate": 3.149809244989125e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36970784, + "step": 63740 + }, + { + "epoch": 9.49434018468871, + "grad_norm": 0.05685648322105408, + "learning_rate": 3.149495465890628e-05, + "loss": 0.0011, + "num_input_tokens_seen": 36973504, + "step": 63745 + }, + { + "epoch": 9.495084897229669, + "grad_norm": 0.03892504423856735, + "learning_rate": 3.149181675819565e-05, + "loss": 0.0003, + "num_input_tokens_seen": 36976256, + "step": 63750 + }, + { + "epoch": 9.49582960977063, + "grad_norm": 34.99245834350586, + "learning_rate": 3.148867874781238e-05, + "loss": 0.2656, + "num_input_tokens_seen": 36979424, + "step": 63755 + }, + { + "epoch": 9.496574322311588, + "grad_norm": 48.623844146728516, + "learning_rate": 3.148554062780947e-05, + "loss": 0.2741, + "num_input_tokens_seen": 36982464, + "step": 63760 + }, + { + "epoch": 9.497319034852547, + "grad_norm": 0.07075916230678558, + "learning_rate": 3.148240239823994e-05, + "loss": 0.0049, + "num_input_tokens_seen": 36984992, + "step": 63765 + }, + { + "epoch": 9.498063747393505, + "grad_norm": 0.004004338290542364, + "learning_rate": 3.147926405915682e-05, + "loss": 0.2409, + "num_input_tokens_seen": 36988160, + "step": 63770 + }, + { + "epoch": 9.498808459934466, + "grad_norm": 0.0011087176389992237, + "learning_rate": 3.147612561061312e-05, + "loss": 0.4051, + "num_input_tokens_seen": 36990816, + "step": 63775 + }, + { + "epoch": 9.499553172475425, + "grad_norm": 0.007436871062964201, + "learning_rate": 3.147298705266185e-05, + "loss": 0.0007, + "num_input_tokens_seen": 36993408, + "step": 63780 + }, + { + "epoch": 9.500297885016384, + "grad_norm": 0.0025258329696953297, + "learning_rate": 3.146984838535604e-05, + "loss": 0.0002, + "num_input_tokens_seen": 36995968, + "step": 63785 + }, + { + "epoch": 9.501042597557342, + "grad_norm": 9.86292839050293, + "learning_rate": 3.146670960874872e-05, + "loss": 0.0133, + "num_input_tokens_seen": 36998784, + "step": 63790 + }, + { + "epoch": 9.501787310098303, + "grad_norm": 0.03563648462295532, + "learning_rate": 3.146357072289292e-05, + "loss": 0.222, + "num_input_tokens_seen": 37001888, + "step": 63795 + }, + { + "epoch": 9.502532022639262, + "grad_norm": 0.0881372019648552, + "learning_rate": 3.146043172784166e-05, + "loss": 0.0008, + "num_input_tokens_seen": 37004576, + "step": 63800 + }, + { + "epoch": 9.50327673518022, + "grad_norm": 58.89942169189453, + "learning_rate": 3.1457292623647976e-05, + "loss": 0.3816, + "num_input_tokens_seen": 37007392, + "step": 63805 + }, + { + "epoch": 9.504021447721179, + "grad_norm": 49.6550407409668, + "learning_rate": 3.145415341036489e-05, + "loss": 0.2143, + "num_input_tokens_seen": 37010240, + "step": 63810 + }, + { + "epoch": 9.50476616026214, + "grad_norm": 0.04320527985692024, + "learning_rate": 3.1451014088045435e-05, + "loss": 0.009, + "num_input_tokens_seen": 37012928, + "step": 63815 + }, + { + "epoch": 9.505510872803098, + "grad_norm": 0.4312768876552582, + "learning_rate": 3.144787465674266e-05, + "loss": 0.0423, + "num_input_tokens_seen": 37015584, + "step": 63820 + }, + { + "epoch": 9.506255585344057, + "grad_norm": 0.06956643611192703, + "learning_rate": 3.14447351165096e-05, + "loss": 0.1172, + "num_input_tokens_seen": 37018368, + "step": 63825 + }, + { + "epoch": 9.507000297885016, + "grad_norm": 0.06448614597320557, + "learning_rate": 3.1441595467399286e-05, + "loss": 0.1105, + "num_input_tokens_seen": 37021152, + "step": 63830 + }, + { + "epoch": 9.507745010425975, + "grad_norm": 0.010374938137829304, + "learning_rate": 3.143845570946477e-05, + "loss": 0.0007, + "num_input_tokens_seen": 37024032, + "step": 63835 + }, + { + "epoch": 9.508489722966935, + "grad_norm": 0.3883788585662842, + "learning_rate": 3.143531584275909e-05, + "loss": 0.0049, + "num_input_tokens_seen": 37026944, + "step": 63840 + }, + { + "epoch": 9.509234435507894, + "grad_norm": 0.6055896878242493, + "learning_rate": 3.1432175867335275e-05, + "loss": 0.1943, + "num_input_tokens_seen": 37029920, + "step": 63845 + }, + { + "epoch": 9.509979148048853, + "grad_norm": 0.11449368298053741, + "learning_rate": 3.1429035783246395e-05, + "loss": 0.0034, + "num_input_tokens_seen": 37032608, + "step": 63850 + }, + { + "epoch": 9.510723860589813, + "grad_norm": 0.3202882409095764, + "learning_rate": 3.142589559054549e-05, + "loss": 0.0395, + "num_input_tokens_seen": 37035360, + "step": 63855 + }, + { + "epoch": 9.511468573130772, + "grad_norm": 0.02764161303639412, + "learning_rate": 3.142275528928561e-05, + "loss": 0.1817, + "num_input_tokens_seen": 37038048, + "step": 63860 + }, + { + "epoch": 9.51221328567173, + "grad_norm": 0.002286735223606229, + "learning_rate": 3.141961487951981e-05, + "loss": 0.0003, + "num_input_tokens_seen": 37040544, + "step": 63865 + }, + { + "epoch": 9.51295799821269, + "grad_norm": 0.013774980790913105, + "learning_rate": 3.141647436130113e-05, + "loss": 0.0034, + "num_input_tokens_seen": 37043264, + "step": 63870 + }, + { + "epoch": 9.513702710753648, + "grad_norm": 16.433427810668945, + "learning_rate": 3.1413333734682656e-05, + "loss": 0.192, + "num_input_tokens_seen": 37046240, + "step": 63875 + }, + { + "epoch": 9.514447423294609, + "grad_norm": 0.002808150602504611, + "learning_rate": 3.141019299971741e-05, + "loss": 0.0006, + "num_input_tokens_seen": 37049408, + "step": 63880 + }, + { + "epoch": 9.515192135835568, + "grad_norm": 0.03249788284301758, + "learning_rate": 3.140705215645847e-05, + "loss": 0.0299, + "num_input_tokens_seen": 37052448, + "step": 63885 + }, + { + "epoch": 9.515936848376526, + "grad_norm": 0.1753132939338684, + "learning_rate": 3.14039112049589e-05, + "loss": 0.3684, + "num_input_tokens_seen": 37055104, + "step": 63890 + }, + { + "epoch": 9.516681560917485, + "grad_norm": 0.008163836784660816, + "learning_rate": 3.140077014527176e-05, + "loss": 0.0647, + "num_input_tokens_seen": 37057792, + "step": 63895 + }, + { + "epoch": 9.517426273458446, + "grad_norm": 0.07643365114927292, + "learning_rate": 3.139762897745011e-05, + "loss": 0.0039, + "num_input_tokens_seen": 37060896, + "step": 63900 + }, + { + "epoch": 9.518170985999404, + "grad_norm": 0.010741706937551498, + "learning_rate": 3.139448770154702e-05, + "loss": 0.2098, + "num_input_tokens_seen": 37063744, + "step": 63905 + }, + { + "epoch": 9.518915698540363, + "grad_norm": 43.630062103271484, + "learning_rate": 3.139134631761557e-05, + "loss": 0.355, + "num_input_tokens_seen": 37066624, + "step": 63910 + }, + { + "epoch": 9.519660411081322, + "grad_norm": 0.15537871420383453, + "learning_rate": 3.1388204825708815e-05, + "loss": 0.124, + "num_input_tokens_seen": 37069216, + "step": 63915 + }, + { + "epoch": 9.520405123622282, + "grad_norm": 0.020046144723892212, + "learning_rate": 3.138506322587982e-05, + "loss": 0.0001, + "num_input_tokens_seen": 37071936, + "step": 63920 + }, + { + "epoch": 9.521149836163241, + "grad_norm": 0.22928555309772491, + "learning_rate": 3.138192151818168e-05, + "loss": 0.2016, + "num_input_tokens_seen": 37075072, + "step": 63925 + }, + { + "epoch": 9.5218945487042, + "grad_norm": 0.04048159718513489, + "learning_rate": 3.137877970266746e-05, + "loss": 0.0015, + "num_input_tokens_seen": 37077952, + "step": 63930 + }, + { + "epoch": 9.522639261245159, + "grad_norm": 0.008224575780332088, + "learning_rate": 3.1375637779390244e-05, + "loss": 0.1115, + "num_input_tokens_seen": 37081024, + "step": 63935 + }, + { + "epoch": 9.52338397378612, + "grad_norm": 0.007206294219940901, + "learning_rate": 3.137249574840311e-05, + "loss": 0.1068, + "num_input_tokens_seen": 37083936, + "step": 63940 + }, + { + "epoch": 9.524128686327078, + "grad_norm": 0.010181523859500885, + "learning_rate": 3.136935360975913e-05, + "loss": 0.0006, + "num_input_tokens_seen": 37086592, + "step": 63945 + }, + { + "epoch": 9.524873398868037, + "grad_norm": 0.064422108232975, + "learning_rate": 3.1366211363511394e-05, + "loss": 0.436, + "num_input_tokens_seen": 37089120, + "step": 63950 + }, + { + "epoch": 9.525618111408996, + "grad_norm": 0.008335834369063377, + "learning_rate": 3.1363069009712994e-05, + "loss": 0.0761, + "num_input_tokens_seen": 37092352, + "step": 63955 + }, + { + "epoch": 9.526362823949956, + "grad_norm": 0.02665039151906967, + "learning_rate": 3.1359926548417007e-05, + "loss": 0.0172, + "num_input_tokens_seen": 37095424, + "step": 63960 + }, + { + "epoch": 9.527107536490915, + "grad_norm": 0.09324803203344345, + "learning_rate": 3.135678397967652e-05, + "loss": 0.0006, + "num_input_tokens_seen": 37098304, + "step": 63965 + }, + { + "epoch": 9.527852249031874, + "grad_norm": 33.16094207763672, + "learning_rate": 3.135364130354464e-05, + "loss": 0.1538, + "num_input_tokens_seen": 37101088, + "step": 63970 + }, + { + "epoch": 9.528596961572832, + "grad_norm": 0.05880922079086304, + "learning_rate": 3.135049852007444e-05, + "loss": 0.0004, + "num_input_tokens_seen": 37103744, + "step": 63975 + }, + { + "epoch": 9.529341674113793, + "grad_norm": 0.021088192239403725, + "learning_rate": 3.134735562931902e-05, + "loss": 0.2644, + "num_input_tokens_seen": 37106528, + "step": 63980 + }, + { + "epoch": 9.530086386654752, + "grad_norm": 0.12988698482513428, + "learning_rate": 3.1344212631331484e-05, + "loss": 0.0004, + "num_input_tokens_seen": 37109632, + "step": 63985 + }, + { + "epoch": 9.53083109919571, + "grad_norm": 0.005215099547058344, + "learning_rate": 3.134106952616491e-05, + "loss": 0.0003, + "num_input_tokens_seen": 37112384, + "step": 63990 + }, + { + "epoch": 9.53157581173667, + "grad_norm": 0.13813690841197968, + "learning_rate": 3.133792631387243e-05, + "loss": 0.0017, + "num_input_tokens_seen": 37115136, + "step": 63995 + }, + { + "epoch": 9.53232052427763, + "grad_norm": 0.028619663789868355, + "learning_rate": 3.133478299450712e-05, + "loss": 0.1668, + "num_input_tokens_seen": 37117920, + "step": 64000 + }, + { + "epoch": 9.533065236818588, + "grad_norm": 53.95813751220703, + "learning_rate": 3.1331639568122084e-05, + "loss": 0.1596, + "num_input_tokens_seen": 37121024, + "step": 64005 + }, + { + "epoch": 9.533809949359547, + "grad_norm": 0.05245603993535042, + "learning_rate": 3.132849603477044e-05, + "loss": 0.0078, + "num_input_tokens_seen": 37123936, + "step": 64010 + }, + { + "epoch": 9.534554661900506, + "grad_norm": 0.003393783699721098, + "learning_rate": 3.132535239450528e-05, + "loss": 0.0058, + "num_input_tokens_seen": 37126944, + "step": 64015 + }, + { + "epoch": 9.535299374441465, + "grad_norm": 41.56221389770508, + "learning_rate": 3.1322208647379724e-05, + "loss": 0.0258, + "num_input_tokens_seen": 37129888, + "step": 64020 + }, + { + "epoch": 9.536044086982425, + "grad_norm": 10.436169624328613, + "learning_rate": 3.1319064793446876e-05, + "loss": 0.0116, + "num_input_tokens_seen": 37132768, + "step": 64025 + }, + { + "epoch": 9.536788799523384, + "grad_norm": 0.022183138877153397, + "learning_rate": 3.131592083275986e-05, + "loss": 0.0041, + "num_input_tokens_seen": 37135584, + "step": 64030 + }, + { + "epoch": 9.537533512064343, + "grad_norm": 0.003329101949930191, + "learning_rate": 3.1312776765371765e-05, + "loss": 0.0004, + "num_input_tokens_seen": 37138560, + "step": 64035 + }, + { + "epoch": 9.538278224605303, + "grad_norm": 0.002905237954109907, + "learning_rate": 3.1309632591335734e-05, + "loss": 0.1283, + "num_input_tokens_seen": 37141440, + "step": 64040 + }, + { + "epoch": 9.539022937146262, + "grad_norm": 15.670912742614746, + "learning_rate": 3.1306488310704875e-05, + "loss": 0.1394, + "num_input_tokens_seen": 37144384, + "step": 64045 + }, + { + "epoch": 9.53976764968722, + "grad_norm": 0.029699452221393585, + "learning_rate": 3.1303343923532294e-05, + "loss": 0.1543, + "num_input_tokens_seen": 37147744, + "step": 64050 + }, + { + "epoch": 9.54051236222818, + "grad_norm": 29.23155975341797, + "learning_rate": 3.130019942987114e-05, + "loss": 0.2346, + "num_input_tokens_seen": 37150624, + "step": 64055 + }, + { + "epoch": 9.541257074769138, + "grad_norm": 0.30848824977874756, + "learning_rate": 3.1297054829774505e-05, + "loss": 0.0012, + "num_input_tokens_seen": 37153600, + "step": 64060 + }, + { + "epoch": 9.542001787310099, + "grad_norm": 28.384540557861328, + "learning_rate": 3.1293910123295535e-05, + "loss": 0.084, + "num_input_tokens_seen": 37156768, + "step": 64065 + }, + { + "epoch": 9.542746499851058, + "grad_norm": 0.003565198043361306, + "learning_rate": 3.1290765310487346e-05, + "loss": 0.0317, + "num_input_tokens_seen": 37159552, + "step": 64070 + }, + { + "epoch": 9.543491212392016, + "grad_norm": 0.011965462006628513, + "learning_rate": 3.1287620391403086e-05, + "loss": 0.3677, + "num_input_tokens_seen": 37162592, + "step": 64075 + }, + { + "epoch": 9.544235924932975, + "grad_norm": 0.016545427963137627, + "learning_rate": 3.128447536609585e-05, + "loss": 0.0029, + "num_input_tokens_seen": 37165344, + "step": 64080 + }, + { + "epoch": 9.544980637473936, + "grad_norm": 0.01696769706904888, + "learning_rate": 3.12813302346188e-05, + "loss": 0.046, + "num_input_tokens_seen": 37168192, + "step": 64085 + }, + { + "epoch": 9.545725350014894, + "grad_norm": 1.5828484296798706, + "learning_rate": 3.127818499702506e-05, + "loss": 0.4196, + "num_input_tokens_seen": 37171424, + "step": 64090 + }, + { + "epoch": 9.546470062555853, + "grad_norm": 0.024002982303500175, + "learning_rate": 3.127503965336776e-05, + "loss": 0.0003, + "num_input_tokens_seen": 37174240, + "step": 64095 + }, + { + "epoch": 9.547214775096812, + "grad_norm": 1.7787576913833618, + "learning_rate": 3.1271894203700045e-05, + "loss": 0.0004, + "num_input_tokens_seen": 37177024, + "step": 64100 + }, + { + "epoch": 9.547959487637772, + "grad_norm": 0.0024128130171447992, + "learning_rate": 3.126874864807505e-05, + "loss": 0.283, + "num_input_tokens_seen": 37179776, + "step": 64105 + }, + { + "epoch": 9.548704200178731, + "grad_norm": 4.598048210144043, + "learning_rate": 3.126560298654593e-05, + "loss": 0.0057, + "num_input_tokens_seen": 37182592, + "step": 64110 + }, + { + "epoch": 9.54944891271969, + "grad_norm": 26.759374618530273, + "learning_rate": 3.126245721916581e-05, + "loss": 0.2403, + "num_input_tokens_seen": 37185696, + "step": 64115 + }, + { + "epoch": 9.550193625260649, + "grad_norm": 0.46865329146385193, + "learning_rate": 3.125931134598783e-05, + "loss": 0.021, + "num_input_tokens_seen": 37188160, + "step": 64120 + }, + { + "epoch": 9.55093833780161, + "grad_norm": 17.029733657836914, + "learning_rate": 3.1256165367065155e-05, + "loss": 0.2003, + "num_input_tokens_seen": 37191040, + "step": 64125 + }, + { + "epoch": 9.551683050342568, + "grad_norm": 6.727047920227051, + "learning_rate": 3.125301928245092e-05, + "loss": 0.3961, + "num_input_tokens_seen": 37193984, + "step": 64130 + }, + { + "epoch": 9.552427762883527, + "grad_norm": 0.2097603678703308, + "learning_rate": 3.124987309219828e-05, + "loss": 0.1442, + "num_input_tokens_seen": 37196864, + "step": 64135 + }, + { + "epoch": 9.553172475424486, + "grad_norm": 0.035233691334724426, + "learning_rate": 3.124672679636039e-05, + "loss": 0.0265, + "num_input_tokens_seen": 37199776, + "step": 64140 + }, + { + "epoch": 9.553917187965446, + "grad_norm": 130.8999786376953, + "learning_rate": 3.12435803949904e-05, + "loss": 0.0243, + "num_input_tokens_seen": 37202656, + "step": 64145 + }, + { + "epoch": 9.554661900506405, + "grad_norm": 0.006338362116366625, + "learning_rate": 3.1240433888141466e-05, + "loss": 0.0053, + "num_input_tokens_seen": 37205408, + "step": 64150 + }, + { + "epoch": 9.555406613047364, + "grad_norm": 103.46914672851562, + "learning_rate": 3.1237287275866736e-05, + "loss": 0.0413, + "num_input_tokens_seen": 37208576, + "step": 64155 + }, + { + "epoch": 9.556151325588322, + "grad_norm": 0.6859825253486633, + "learning_rate": 3.123414055821938e-05, + "loss": 0.189, + "num_input_tokens_seen": 37211360, + "step": 64160 + }, + { + "epoch": 9.556896038129283, + "grad_norm": 0.03886369988322258, + "learning_rate": 3.1230993735252564e-05, + "loss": 0.1247, + "num_input_tokens_seen": 37214400, + "step": 64165 + }, + { + "epoch": 9.557640750670242, + "grad_norm": 73.19637298583984, + "learning_rate": 3.1227846807019435e-05, + "loss": 0.1794, + "num_input_tokens_seen": 37217376, + "step": 64170 + }, + { + "epoch": 9.5583854632112, + "grad_norm": 0.06183328106999397, + "learning_rate": 3.1224699773573164e-05, + "loss": 0.0005, + "num_input_tokens_seen": 37220224, + "step": 64175 + }, + { + "epoch": 9.55913017575216, + "grad_norm": 16.716842651367188, + "learning_rate": 3.1221552634966914e-05, + "loss": 0.1929, + "num_input_tokens_seen": 37223200, + "step": 64180 + }, + { + "epoch": 9.55987488829312, + "grad_norm": 0.009012441150844097, + "learning_rate": 3.1218405391253856e-05, + "loss": 0.057, + "num_input_tokens_seen": 37225888, + "step": 64185 + }, + { + "epoch": 9.560619600834078, + "grad_norm": 3.1360111236572266, + "learning_rate": 3.121525804248716e-05, + "loss": 0.1118, + "num_input_tokens_seen": 37228832, + "step": 64190 + }, + { + "epoch": 9.561364313375037, + "grad_norm": 4.272585391998291, + "learning_rate": 3.1212110588720004e-05, + "loss": 0.1386, + "num_input_tokens_seen": 37231680, + "step": 64195 + }, + { + "epoch": 9.562109025915996, + "grad_norm": 0.05283956602215767, + "learning_rate": 3.1208963030005543e-05, + "loss": 0.0423, + "num_input_tokens_seen": 37234432, + "step": 64200 + }, + { + "epoch": 9.562853738456955, + "grad_norm": 67.15515899658203, + "learning_rate": 3.120581536639697e-05, + "loss": 0.0951, + "num_input_tokens_seen": 37237216, + "step": 64205 + }, + { + "epoch": 9.563598450997915, + "grad_norm": 88.72726440429688, + "learning_rate": 3.120266759794745e-05, + "loss": 0.0923, + "num_input_tokens_seen": 37239840, + "step": 64210 + }, + { + "epoch": 9.564343163538874, + "grad_norm": 0.07776243984699249, + "learning_rate": 3.119951972471016e-05, + "loss": 0.0416, + "num_input_tokens_seen": 37242752, + "step": 64215 + }, + { + "epoch": 9.565087876079833, + "grad_norm": 0.006213050335645676, + "learning_rate": 3.119637174673829e-05, + "loss": 0.0422, + "num_input_tokens_seen": 37246048, + "step": 64220 + }, + { + "epoch": 9.565832588620792, + "grad_norm": 0.002655158983543515, + "learning_rate": 3.119322366408501e-05, + "loss": 0.0023, + "num_input_tokens_seen": 37249024, + "step": 64225 + }, + { + "epoch": 9.566577301161752, + "grad_norm": 0.004771808162331581, + "learning_rate": 3.119007547680353e-05, + "loss": 0.0019, + "num_input_tokens_seen": 37251808, + "step": 64230 + }, + { + "epoch": 9.56732201370271, + "grad_norm": 0.0004844948125537485, + "learning_rate": 3.1186927184947e-05, + "loss": 0.2098, + "num_input_tokens_seen": 37254784, + "step": 64235 + }, + { + "epoch": 9.56806672624367, + "grad_norm": 0.030863502994179726, + "learning_rate": 3.118377878856863e-05, + "loss": 0.0002, + "num_input_tokens_seen": 37257760, + "step": 64240 + }, + { + "epoch": 9.568811438784628, + "grad_norm": 7.818873405456543, + "learning_rate": 3.1180630287721595e-05, + "loss": 0.2264, + "num_input_tokens_seen": 37260640, + "step": 64245 + }, + { + "epoch": 9.569556151325589, + "grad_norm": 0.13315145671367645, + "learning_rate": 3.11774816824591e-05, + "loss": 0.1651, + "num_input_tokens_seen": 37263648, + "step": 64250 + }, + { + "epoch": 9.570300863866548, + "grad_norm": 0.01179435569792986, + "learning_rate": 3.1174332972834326e-05, + "loss": 0.073, + "num_input_tokens_seen": 37266624, + "step": 64255 + }, + { + "epoch": 9.571045576407506, + "grad_norm": 12.669751167297363, + "learning_rate": 3.117118415890047e-05, + "loss": 0.1793, + "num_input_tokens_seen": 37269568, + "step": 64260 + }, + { + "epoch": 9.571790288948465, + "grad_norm": 0.07820672541856766, + "learning_rate": 3.116803524071074e-05, + "loss": 0.0008, + "num_input_tokens_seen": 37272480, + "step": 64265 + }, + { + "epoch": 9.572535001489426, + "grad_norm": 30.371793746948242, + "learning_rate": 3.116488621831831e-05, + "loss": 0.3007, + "num_input_tokens_seen": 37275200, + "step": 64270 + }, + { + "epoch": 9.573279714030384, + "grad_norm": 0.6714649796485901, + "learning_rate": 3.1161737091776404e-05, + "loss": 0.1191, + "num_input_tokens_seen": 37278208, + "step": 64275 + }, + { + "epoch": 9.574024426571343, + "grad_norm": 0.05018419399857521, + "learning_rate": 3.115858786113821e-05, + "loss": 0.1243, + "num_input_tokens_seen": 37281280, + "step": 64280 + }, + { + "epoch": 9.574769139112302, + "grad_norm": 0.011452356353402138, + "learning_rate": 3.115543852645693e-05, + "loss": 0.0038, + "num_input_tokens_seen": 37284224, + "step": 64285 + }, + { + "epoch": 9.575513851653263, + "grad_norm": 0.011468921788036823, + "learning_rate": 3.1152289087785776e-05, + "loss": 0.1566, + "num_input_tokens_seen": 37287808, + "step": 64290 + }, + { + "epoch": 9.576258564194221, + "grad_norm": 0.0032161283306777477, + "learning_rate": 3.114913954517794e-05, + "loss": 0.001, + "num_input_tokens_seen": 37290336, + "step": 64295 + }, + { + "epoch": 9.57700327673518, + "grad_norm": 0.0075767613016068935, + "learning_rate": 3.1145989898686656e-05, + "loss": 0.001, + "num_input_tokens_seen": 37293344, + "step": 64300 + }, + { + "epoch": 9.577747989276139, + "grad_norm": 0.022307777777314186, + "learning_rate": 3.114284014836512e-05, + "loss": 0.0002, + "num_input_tokens_seen": 37296192, + "step": 64305 + }, + { + "epoch": 9.5784927018171, + "grad_norm": 13.362765312194824, + "learning_rate": 3.1139690294266526e-05, + "loss": 0.2116, + "num_input_tokens_seen": 37298784, + "step": 64310 + }, + { + "epoch": 9.579237414358058, + "grad_norm": 10.331716537475586, + "learning_rate": 3.1136540336444114e-05, + "loss": 0.0695, + "num_input_tokens_seen": 37301824, + "step": 64315 + }, + { + "epoch": 9.579982126899017, + "grad_norm": 0.003303606528788805, + "learning_rate": 3.11333902749511e-05, + "loss": 0.2468, + "num_input_tokens_seen": 37304640, + "step": 64320 + }, + { + "epoch": 9.580726839439976, + "grad_norm": 75.97328186035156, + "learning_rate": 3.1130240109840676e-05, + "loss": 0.6241, + "num_input_tokens_seen": 37307328, + "step": 64325 + }, + { + "epoch": 9.581471551980936, + "grad_norm": 1.2960087060928345, + "learning_rate": 3.112708984116608e-05, + "loss": 0.3106, + "num_input_tokens_seen": 37310464, + "step": 64330 + }, + { + "epoch": 9.582216264521895, + "grad_norm": 0.5288209319114685, + "learning_rate": 3.1123939468980535e-05, + "loss": 0.0014, + "num_input_tokens_seen": 37313408, + "step": 64335 + }, + { + "epoch": 9.582960977062854, + "grad_norm": 0.0045568132773041725, + "learning_rate": 3.1120788993337244e-05, + "loss": 0.0004, + "num_input_tokens_seen": 37316288, + "step": 64340 + }, + { + "epoch": 9.583705689603812, + "grad_norm": 0.3800255358219147, + "learning_rate": 3.111763841428945e-05, + "loss": 0.0075, + "num_input_tokens_seen": 37318816, + "step": 64345 + }, + { + "epoch": 9.584450402144771, + "grad_norm": 0.027131952345371246, + "learning_rate": 3.111448773189037e-05, + "loss": 0.0119, + "num_input_tokens_seen": 37322080, + "step": 64350 + }, + { + "epoch": 9.585195114685732, + "grad_norm": 33.74758529663086, + "learning_rate": 3.1111336946193244e-05, + "loss": 0.1489, + "num_input_tokens_seen": 37324800, + "step": 64355 + }, + { + "epoch": 9.58593982722669, + "grad_norm": 0.0375768318772316, + "learning_rate": 3.1108186057251285e-05, + "loss": 0.0004, + "num_input_tokens_seen": 37327552, + "step": 64360 + }, + { + "epoch": 9.58668453976765, + "grad_norm": 0.03942205756902695, + "learning_rate": 3.1105035065117735e-05, + "loss": 0.0007, + "num_input_tokens_seen": 37330304, + "step": 64365 + }, + { + "epoch": 9.58742925230861, + "grad_norm": 0.05276770889759064, + "learning_rate": 3.110188396984582e-05, + "loss": 0.0076, + "num_input_tokens_seen": 37333056, + "step": 64370 + }, + { + "epoch": 9.588173964849569, + "grad_norm": 0.016384437680244446, + "learning_rate": 3.109873277148878e-05, + "loss": 0.2411, + "num_input_tokens_seen": 37335840, + "step": 64375 + }, + { + "epoch": 9.588918677390527, + "grad_norm": 5.249032974243164, + "learning_rate": 3.109558147009984e-05, + "loss": 0.1319, + "num_input_tokens_seen": 37338784, + "step": 64380 + }, + { + "epoch": 9.589663389931486, + "grad_norm": 0.005731728859245777, + "learning_rate": 3.1092430065732246e-05, + "loss": 0.037, + "num_input_tokens_seen": 37341664, + "step": 64385 + }, + { + "epoch": 9.590408102472445, + "grad_norm": 12.31067180633545, + "learning_rate": 3.108927855843924e-05, + "loss": 0.1775, + "num_input_tokens_seen": 37344608, + "step": 64390 + }, + { + "epoch": 9.591152815013405, + "grad_norm": 0.03479394689202309, + "learning_rate": 3.108612694827407e-05, + "loss": 0.0252, + "num_input_tokens_seen": 37347328, + "step": 64395 + }, + { + "epoch": 9.591897527554364, + "grad_norm": 0.10814578086137772, + "learning_rate": 3.108297523528997e-05, + "loss": 0.1934, + "num_input_tokens_seen": 37349952, + "step": 64400 + }, + { + "epoch": 9.592642240095323, + "grad_norm": 0.7218517661094666, + "learning_rate": 3.107982341954018e-05, + "loss": 0.0482, + "num_input_tokens_seen": 37353152, + "step": 64405 + }, + { + "epoch": 9.593386952636282, + "grad_norm": 0.14841295778751373, + "learning_rate": 3.1076671501077946e-05, + "loss": 0.1817, + "num_input_tokens_seen": 37355968, + "step": 64410 + }, + { + "epoch": 9.594131665177242, + "grad_norm": 2.1161277294158936, + "learning_rate": 3.1073519479956534e-05, + "loss": 0.0013, + "num_input_tokens_seen": 37358688, + "step": 64415 + }, + { + "epoch": 9.594876377718201, + "grad_norm": 4.543317794799805, + "learning_rate": 3.107036735622918e-05, + "loss": 0.0333, + "num_input_tokens_seen": 37361632, + "step": 64420 + }, + { + "epoch": 9.59562109025916, + "grad_norm": 0.03446027263998985, + "learning_rate": 3.106721512994913e-05, + "loss": 0.0441, + "num_input_tokens_seen": 37364352, + "step": 64425 + }, + { + "epoch": 9.596365802800118, + "grad_norm": 51.20167922973633, + "learning_rate": 3.1064062801169666e-05, + "loss": 0.0968, + "num_input_tokens_seen": 37367424, + "step": 64430 + }, + { + "epoch": 9.597110515341079, + "grad_norm": 0.09940924495458603, + "learning_rate": 3.106091036994401e-05, + "loss": 0.0584, + "num_input_tokens_seen": 37370624, + "step": 64435 + }, + { + "epoch": 9.597855227882038, + "grad_norm": 0.01513950526714325, + "learning_rate": 3.105775783632544e-05, + "loss": 0.0005, + "num_input_tokens_seen": 37373376, + "step": 64440 + }, + { + "epoch": 9.598599940422996, + "grad_norm": 0.07724787294864655, + "learning_rate": 3.10546052003672e-05, + "loss": 0.4678, + "num_input_tokens_seen": 37376352, + "step": 64445 + }, + { + "epoch": 9.599344652963955, + "grad_norm": 0.007274876814335585, + "learning_rate": 3.105145246212257e-05, + "loss": 0.0007, + "num_input_tokens_seen": 37379232, + "step": 64450 + }, + { + "epoch": 9.600089365504916, + "grad_norm": 0.05782948434352875, + "learning_rate": 3.1048299621644794e-05, + "loss": 0.2691, + "num_input_tokens_seen": 37382240, + "step": 64455 + }, + { + "epoch": 9.600834078045875, + "grad_norm": 24.709182739257812, + "learning_rate": 3.1045146678987144e-05, + "loss": 0.2395, + "num_input_tokens_seen": 37385024, + "step": 64460 + }, + { + "epoch": 9.601578790586833, + "grad_norm": 0.007196122780442238, + "learning_rate": 3.104199363420289e-05, + "loss": 0.0818, + "num_input_tokens_seen": 37388032, + "step": 64465 + }, + { + "epoch": 9.602323503127792, + "grad_norm": 47.0089111328125, + "learning_rate": 3.1038840487345286e-05, + "loss": 0.3232, + "num_input_tokens_seen": 37390976, + "step": 64470 + }, + { + "epoch": 9.603068215668753, + "grad_norm": 0.23767243325710297, + "learning_rate": 3.103568723846761e-05, + "loss": 0.0028, + "num_input_tokens_seen": 37393920, + "step": 64475 + }, + { + "epoch": 9.603812928209711, + "grad_norm": 0.07205722481012344, + "learning_rate": 3.103253388762314e-05, + "loss": 0.0343, + "num_input_tokens_seen": 37396768, + "step": 64480 + }, + { + "epoch": 9.60455764075067, + "grad_norm": 9.458548545837402, + "learning_rate": 3.102938043486513e-05, + "loss": 0.0554, + "num_input_tokens_seen": 37399712, + "step": 64485 + }, + { + "epoch": 9.605302353291629, + "grad_norm": 0.04608383774757385, + "learning_rate": 3.102622688024688e-05, + "loss": 0.0409, + "num_input_tokens_seen": 37402720, + "step": 64490 + }, + { + "epoch": 9.60604706583259, + "grad_norm": 0.007724236696958542, + "learning_rate": 3.1023073223821643e-05, + "loss": 0.0014, + "num_input_tokens_seen": 37405440, + "step": 64495 + }, + { + "epoch": 9.606791778373548, + "grad_norm": 13.284041404724121, + "learning_rate": 3.101991946564271e-05, + "loss": 0.1702, + "num_input_tokens_seen": 37408512, + "step": 64500 + }, + { + "epoch": 9.607536490914507, + "grad_norm": 23.131641387939453, + "learning_rate": 3.1016765605763346e-05, + "loss": 0.2979, + "num_input_tokens_seen": 37412064, + "step": 64505 + }, + { + "epoch": 9.608281203455466, + "grad_norm": 0.1618782877922058, + "learning_rate": 3.101361164423685e-05, + "loss": 0.0006, + "num_input_tokens_seen": 37415040, + "step": 64510 + }, + { + "epoch": 9.609025915996426, + "grad_norm": 0.007154768332839012, + "learning_rate": 3.1010457581116494e-05, + "loss": 0.0007, + "num_input_tokens_seen": 37417792, + "step": 64515 + }, + { + "epoch": 9.609770628537385, + "grad_norm": 0.17307980358600616, + "learning_rate": 3.100730341645557e-05, + "loss": 0.0322, + "num_input_tokens_seen": 37420768, + "step": 64520 + }, + { + "epoch": 9.610515341078344, + "grad_norm": 0.016959236934781075, + "learning_rate": 3.100414915030736e-05, + "loss": 0.0013, + "num_input_tokens_seen": 37423424, + "step": 64525 + }, + { + "epoch": 9.611260053619302, + "grad_norm": 0.004617162514477968, + "learning_rate": 3.100099478272515e-05, + "loss": 0.0005, + "num_input_tokens_seen": 37426560, + "step": 64530 + }, + { + "epoch": 9.612004766160261, + "grad_norm": 0.027666302397847176, + "learning_rate": 3.099784031376224e-05, + "loss": 0.0003, + "num_input_tokens_seen": 37429152, + "step": 64535 + }, + { + "epoch": 9.612749478701222, + "grad_norm": 0.004627354443073273, + "learning_rate": 3.09946857434719e-05, + "loss": 0.1818, + "num_input_tokens_seen": 37432224, + "step": 64540 + }, + { + "epoch": 9.61349419124218, + "grad_norm": 13.601637840270996, + "learning_rate": 3.099153107190744e-05, + "loss": 0.0044, + "num_input_tokens_seen": 37434976, + "step": 64545 + }, + { + "epoch": 9.61423890378314, + "grad_norm": 0.001680441084317863, + "learning_rate": 3.0988376299122154e-05, + "loss": 0.0002, + "num_input_tokens_seen": 37437952, + "step": 64550 + }, + { + "epoch": 9.6149836163241, + "grad_norm": 0.003000563010573387, + "learning_rate": 3.098522142516934e-05, + "loss": 0.0017, + "num_input_tokens_seen": 37440736, + "step": 64555 + }, + { + "epoch": 9.615728328865059, + "grad_norm": 0.0017120224656537175, + "learning_rate": 3.09820664501023e-05, + "loss": 0.1893, + "num_input_tokens_seen": 37443744, + "step": 64560 + }, + { + "epoch": 9.616473041406017, + "grad_norm": 0.005741037428379059, + "learning_rate": 3.097891137397432e-05, + "loss": 0.0003, + "num_input_tokens_seen": 37446432, + "step": 64565 + }, + { + "epoch": 9.617217753946976, + "grad_norm": 0.003241653088480234, + "learning_rate": 3.097575619683871e-05, + "loss": 0.0022, + "num_input_tokens_seen": 37449664, + "step": 64570 + }, + { + "epoch": 9.617962466487935, + "grad_norm": 0.0014264219207689166, + "learning_rate": 3.097260091874877e-05, + "loss": 0.1259, + "num_input_tokens_seen": 37452608, + "step": 64575 + }, + { + "epoch": 9.618707179028895, + "grad_norm": 43.05754470825195, + "learning_rate": 3.0969445539757805e-05, + "loss": 0.2639, + "num_input_tokens_seen": 37455360, + "step": 64580 + }, + { + "epoch": 9.619451891569854, + "grad_norm": 0.009400030598044395, + "learning_rate": 3.0966290059919126e-05, + "loss": 0.1291, + "num_input_tokens_seen": 37458528, + "step": 64585 + }, + { + "epoch": 9.620196604110813, + "grad_norm": 0.11570287495851517, + "learning_rate": 3.096313447928604e-05, + "loss": 0.0016, + "num_input_tokens_seen": 37461408, + "step": 64590 + }, + { + "epoch": 9.620941316651772, + "grad_norm": 0.33196353912353516, + "learning_rate": 3.095997879791187e-05, + "loss": 0.0675, + "num_input_tokens_seen": 37464384, + "step": 64595 + }, + { + "epoch": 9.621686029192732, + "grad_norm": 0.004335871897637844, + "learning_rate": 3.095682301584991e-05, + "loss": 0.1085, + "num_input_tokens_seen": 37467456, + "step": 64600 + }, + { + "epoch": 9.622430741733691, + "grad_norm": 19.763525009155273, + "learning_rate": 3.095366713315347e-05, + "loss": 0.0737, + "num_input_tokens_seen": 37470368, + "step": 64605 + }, + { + "epoch": 9.62317545427465, + "grad_norm": 8.52121353149414, + "learning_rate": 3.095051114987588e-05, + "loss": 0.0379, + "num_input_tokens_seen": 37473152, + "step": 64610 + }, + { + "epoch": 9.623920166815608, + "grad_norm": 0.017765048891305923, + "learning_rate": 3.094735506607045e-05, + "loss": 0.2758, + "num_input_tokens_seen": 37476096, + "step": 64615 + }, + { + "epoch": 9.624664879356569, + "grad_norm": 52.523189544677734, + "learning_rate": 3.09441988817905e-05, + "loss": 0.2612, + "num_input_tokens_seen": 37478784, + "step": 64620 + }, + { + "epoch": 9.625409591897528, + "grad_norm": 0.003287161700427532, + "learning_rate": 3.0941042597089356e-05, + "loss": 0.0443, + "num_input_tokens_seen": 37481632, + "step": 64625 + }, + { + "epoch": 9.626154304438487, + "grad_norm": 0.030414868146181107, + "learning_rate": 3.093788621202033e-05, + "loss": 0.2514, + "num_input_tokens_seen": 37484608, + "step": 64630 + }, + { + "epoch": 9.626899016979445, + "grad_norm": 103.38446807861328, + "learning_rate": 3.0934729726636755e-05, + "loss": 0.1257, + "num_input_tokens_seen": 37487904, + "step": 64635 + }, + { + "epoch": 9.627643729520406, + "grad_norm": 0.0023992513306438923, + "learning_rate": 3.093157314099196e-05, + "loss": 0.2294, + "num_input_tokens_seen": 37490752, + "step": 64640 + }, + { + "epoch": 9.628388442061365, + "grad_norm": 11.579872131347656, + "learning_rate": 3.092841645513925e-05, + "loss": 0.1377, + "num_input_tokens_seen": 37493600, + "step": 64645 + }, + { + "epoch": 9.629133154602323, + "grad_norm": 0.18910308182239532, + "learning_rate": 3.092525966913198e-05, + "loss": 0.0317, + "num_input_tokens_seen": 37496480, + "step": 64650 + }, + { + "epoch": 9.629877867143282, + "grad_norm": 0.23145949840545654, + "learning_rate": 3.0922102783023466e-05, + "loss": 0.0484, + "num_input_tokens_seen": 37499520, + "step": 64655 + }, + { + "epoch": 9.630622579684243, + "grad_norm": 0.07410906255245209, + "learning_rate": 3.0918945796867044e-05, + "loss": 0.1613, + "num_input_tokens_seen": 37502432, + "step": 64660 + }, + { + "epoch": 9.631367292225201, + "grad_norm": 0.07314538210630417, + "learning_rate": 3.091578871071605e-05, + "loss": 0.0082, + "num_input_tokens_seen": 37505248, + "step": 64665 + }, + { + "epoch": 9.63211200476616, + "grad_norm": 0.16173966228961945, + "learning_rate": 3.0912631524623826e-05, + "loss": 0.0579, + "num_input_tokens_seen": 37508544, + "step": 64670 + }, + { + "epoch": 9.632856717307119, + "grad_norm": 0.05182607099413872, + "learning_rate": 3.0909474238643694e-05, + "loss": 0.1225, + "num_input_tokens_seen": 37511680, + "step": 64675 + }, + { + "epoch": 9.63360142984808, + "grad_norm": 0.031703632324934006, + "learning_rate": 3.090631685282901e-05, + "loss": 0.2129, + "num_input_tokens_seen": 37514464, + "step": 64680 + }, + { + "epoch": 9.634346142389038, + "grad_norm": 9.732566833496094, + "learning_rate": 3.0903159367233086e-05, + "loss": 0.1164, + "num_input_tokens_seen": 37517408, + "step": 64685 + }, + { + "epoch": 9.635090854929997, + "grad_norm": 0.12853851914405823, + "learning_rate": 3.09000017819093e-05, + "loss": 0.0414, + "num_input_tokens_seen": 37520480, + "step": 64690 + }, + { + "epoch": 9.635835567470956, + "grad_norm": 0.0016698422841727734, + "learning_rate": 3.089684409691097e-05, + "loss": 0.0016, + "num_input_tokens_seen": 37523456, + "step": 64695 + }, + { + "epoch": 9.636580280011916, + "grad_norm": 0.01869489811360836, + "learning_rate": 3.0893686312291466e-05, + "loss": 0.0056, + "num_input_tokens_seen": 37526592, + "step": 64700 + }, + { + "epoch": 9.637324992552875, + "grad_norm": 0.028430966660380363, + "learning_rate": 3.089052842810411e-05, + "loss": 0.0403, + "num_input_tokens_seen": 37529728, + "step": 64705 + }, + { + "epoch": 9.638069705093834, + "grad_norm": 2.013805866241455, + "learning_rate": 3.088737044440226e-05, + "loss": 0.0022, + "num_input_tokens_seen": 37532480, + "step": 64710 + }, + { + "epoch": 9.638814417634793, + "grad_norm": 0.010026146657764912, + "learning_rate": 3.088421236123928e-05, + "loss": 0.0102, + "num_input_tokens_seen": 37535680, + "step": 64715 + }, + { + "epoch": 9.639559130175751, + "grad_norm": 0.004649209789931774, + "learning_rate": 3.0881054178668514e-05, + "loss": 0.1256, + "num_input_tokens_seen": 37538240, + "step": 64720 + }, + { + "epoch": 9.640303842716712, + "grad_norm": 0.07104430347681046, + "learning_rate": 3.087789589674331e-05, + "loss": 0.0003, + "num_input_tokens_seen": 37542336, + "step": 64725 + }, + { + "epoch": 9.64104855525767, + "grad_norm": 42.92940139770508, + "learning_rate": 3.087473751551703e-05, + "loss": 0.3008, + "num_input_tokens_seen": 37545600, + "step": 64730 + }, + { + "epoch": 9.64179326779863, + "grad_norm": 0.13124173879623413, + "learning_rate": 3.087157903504303e-05, + "loss": 0.2262, + "num_input_tokens_seen": 37548352, + "step": 64735 + }, + { + "epoch": 9.642537980339588, + "grad_norm": 4.773364543914795, + "learning_rate": 3.086842045537467e-05, + "loss": 0.0748, + "num_input_tokens_seen": 37551072, + "step": 64740 + }, + { + "epoch": 9.643282692880549, + "grad_norm": 0.010397279635071754, + "learning_rate": 3.0865261776565306e-05, + "loss": 0.1371, + "num_input_tokens_seen": 37554080, + "step": 64745 + }, + { + "epoch": 9.644027405421507, + "grad_norm": 38.470741271972656, + "learning_rate": 3.0862102998668314e-05, + "loss": 0.2646, + "num_input_tokens_seen": 37557024, + "step": 64750 + }, + { + "epoch": 9.644772117962466, + "grad_norm": 0.06500719487667084, + "learning_rate": 3.085894412173704e-05, + "loss": 0.0008, + "num_input_tokens_seen": 37559776, + "step": 64755 + }, + { + "epoch": 9.645516830503425, + "grad_norm": 1.0653609037399292, + "learning_rate": 3.085578514582487e-05, + "loss": 0.115, + "num_input_tokens_seen": 37562464, + "step": 64760 + }, + { + "epoch": 9.646261543044385, + "grad_norm": 51.57623291015625, + "learning_rate": 3.0852626070985164e-05, + "loss": 0.1773, + "num_input_tokens_seen": 37565408, + "step": 64765 + }, + { + "epoch": 9.647006255585344, + "grad_norm": 0.008190271444618702, + "learning_rate": 3.084946689727128e-05, + "loss": 0.1259, + "num_input_tokens_seen": 37568256, + "step": 64770 + }, + { + "epoch": 9.647750968126303, + "grad_norm": 0.007817968726158142, + "learning_rate": 3.08463076247366e-05, + "loss": 0.0001, + "num_input_tokens_seen": 37571488, + "step": 64775 + }, + { + "epoch": 9.648495680667262, + "grad_norm": 12.931422233581543, + "learning_rate": 3.084314825343449e-05, + "loss": 0.0548, + "num_input_tokens_seen": 37574368, + "step": 64780 + }, + { + "epoch": 9.649240393208222, + "grad_norm": 0.005580745171755552, + "learning_rate": 3.083998878341833e-05, + "loss": 0.1436, + "num_input_tokens_seen": 37577120, + "step": 64785 + }, + { + "epoch": 9.649985105749181, + "grad_norm": 12.256474494934082, + "learning_rate": 3.0836829214741496e-05, + "loss": 0.0181, + "num_input_tokens_seen": 37580352, + "step": 64790 + }, + { + "epoch": 9.65072981829014, + "grad_norm": 112.64588928222656, + "learning_rate": 3.0833669547457375e-05, + "loss": 0.0391, + "num_input_tokens_seen": 37583136, + "step": 64795 + }, + { + "epoch": 9.651474530831099, + "grad_norm": 0.0669071227312088, + "learning_rate": 3.083050978161933e-05, + "loss": 0.0004, + "num_input_tokens_seen": 37586208, + "step": 64800 + }, + { + "epoch": 9.652219243372059, + "grad_norm": 0.003183458698913455, + "learning_rate": 3.082734991728075e-05, + "loss": 0.0014, + "num_input_tokens_seen": 37589152, + "step": 64805 + }, + { + "epoch": 9.652963955913018, + "grad_norm": 0.0009241256630048156, + "learning_rate": 3.0824189954495006e-05, + "loss": 0.0007, + "num_input_tokens_seen": 37592128, + "step": 64810 + }, + { + "epoch": 9.653708668453977, + "grad_norm": 0.031171953305602074, + "learning_rate": 3.08210298933155e-05, + "loss": 0.0379, + "num_input_tokens_seen": 37594944, + "step": 64815 + }, + { + "epoch": 9.654453380994935, + "grad_norm": 0.00860791839659214, + "learning_rate": 3.081786973379561e-05, + "loss": 0.2584, + "num_input_tokens_seen": 37598016, + "step": 64820 + }, + { + "epoch": 9.655198093535896, + "grad_norm": 0.1259409338235855, + "learning_rate": 3.081470947598872e-05, + "loss": 0.1256, + "num_input_tokens_seen": 37601344, + "step": 64825 + }, + { + "epoch": 9.655942806076855, + "grad_norm": 0.6277720332145691, + "learning_rate": 3.081154911994822e-05, + "loss": 0.0016, + "num_input_tokens_seen": 37604448, + "step": 64830 + }, + { + "epoch": 9.656687518617813, + "grad_norm": 41.75716018676758, + "learning_rate": 3.080838866572752e-05, + "loss": 0.4168, + "num_input_tokens_seen": 37607232, + "step": 64835 + }, + { + "epoch": 9.657432231158772, + "grad_norm": 0.0031193161848932505, + "learning_rate": 3.0805228113379986e-05, + "loss": 0.0025, + "num_input_tokens_seen": 37610144, + "step": 64840 + }, + { + "epoch": 9.658176943699733, + "grad_norm": 41.624149322509766, + "learning_rate": 3.080206746295902e-05, + "loss": 0.3413, + "num_input_tokens_seen": 37613184, + "step": 64845 + }, + { + "epoch": 9.658921656240691, + "grad_norm": 0.0457330159842968, + "learning_rate": 3.079890671451802e-05, + "loss": 0.0872, + "num_input_tokens_seen": 37615840, + "step": 64850 + }, + { + "epoch": 9.65966636878165, + "grad_norm": 0.06571748852729797, + "learning_rate": 3.079574586811039e-05, + "loss": 0.3008, + "num_input_tokens_seen": 37618656, + "step": 64855 + }, + { + "epoch": 9.660411081322609, + "grad_norm": 0.09818363189697266, + "learning_rate": 3.0792584923789525e-05, + "loss": 0.1675, + "num_input_tokens_seen": 37621696, + "step": 64860 + }, + { + "epoch": 9.66115579386357, + "grad_norm": 0.2946775257587433, + "learning_rate": 3.078942388160883e-05, + "loss": 0.0443, + "num_input_tokens_seen": 37624672, + "step": 64865 + }, + { + "epoch": 9.661900506404528, + "grad_norm": 61.37101745605469, + "learning_rate": 3.07862627416217e-05, + "loss": 0.2975, + "num_input_tokens_seen": 37627840, + "step": 64870 + }, + { + "epoch": 9.662645218945487, + "grad_norm": 93.75867462158203, + "learning_rate": 3.0783101503881526e-05, + "loss": 0.0621, + "num_input_tokens_seen": 37631264, + "step": 64875 + }, + { + "epoch": 9.663389931486446, + "grad_norm": 0.17809553444385529, + "learning_rate": 3.0779940168441754e-05, + "loss": 0.4263, + "num_input_tokens_seen": 37634240, + "step": 64880 + }, + { + "epoch": 9.664134644027406, + "grad_norm": 0.2877313196659088, + "learning_rate": 3.077677873535575e-05, + "loss": 0.0036, + "num_input_tokens_seen": 37637312, + "step": 64885 + }, + { + "epoch": 9.664879356568365, + "grad_norm": 0.06504776328802109, + "learning_rate": 3.0773617204676946e-05, + "loss": 0.2544, + "num_input_tokens_seen": 37640416, + "step": 64890 + }, + { + "epoch": 9.665624069109324, + "grad_norm": 0.010479412041604519, + "learning_rate": 3.077045557645875e-05, + "loss": 0.1042, + "num_input_tokens_seen": 37643552, + "step": 64895 + }, + { + "epoch": 9.666368781650283, + "grad_norm": 0.0731324702501297, + "learning_rate": 3.0767293850754566e-05, + "loss": 0.0002, + "num_input_tokens_seen": 37646208, + "step": 64900 + }, + { + "epoch": 9.667113494191241, + "grad_norm": 0.0050960080698132515, + "learning_rate": 3.076413202761782e-05, + "loss": 0.0005, + "num_input_tokens_seen": 37649088, + "step": 64905 + }, + { + "epoch": 9.667858206732202, + "grad_norm": 0.1515999138355255, + "learning_rate": 3.076097010710192e-05, + "loss": 0.1202, + "num_input_tokens_seen": 37651904, + "step": 64910 + }, + { + "epoch": 9.66860291927316, + "grad_norm": 0.02812884747982025, + "learning_rate": 3.075780808926028e-05, + "loss": 0.0053, + "num_input_tokens_seen": 37654624, + "step": 64915 + }, + { + "epoch": 9.66934763181412, + "grad_norm": 0.010872808285057545, + "learning_rate": 3.075464597414632e-05, + "loss": 0.0083, + "num_input_tokens_seen": 37657312, + "step": 64920 + }, + { + "epoch": 9.670092344355078, + "grad_norm": 0.00879353005439043, + "learning_rate": 3.075148376181348e-05, + "loss": 0.0002, + "num_input_tokens_seen": 37660288, + "step": 64925 + }, + { + "epoch": 9.670837056896039, + "grad_norm": 0.038145512342453, + "learning_rate": 3.074832145231517e-05, + "loss": 0.047, + "num_input_tokens_seen": 37663392, + "step": 64930 + }, + { + "epoch": 9.671581769436997, + "grad_norm": 7.282992839813232, + "learning_rate": 3.07451590457048e-05, + "loss": 0.1557, + "num_input_tokens_seen": 37666272, + "step": 64935 + }, + { + "epoch": 9.672326481977956, + "grad_norm": 26.698606491088867, + "learning_rate": 3.0741996542035804e-05, + "loss": 0.0187, + "num_input_tokens_seen": 37669312, + "step": 64940 + }, + { + "epoch": 9.673071194518915, + "grad_norm": 0.0634426698088646, + "learning_rate": 3.073883394136162e-05, + "loss": 0.0007, + "num_input_tokens_seen": 37672064, + "step": 64945 + }, + { + "epoch": 9.673815907059875, + "grad_norm": 123.43009948730469, + "learning_rate": 3.073567124373567e-05, + "loss": 0.0434, + "num_input_tokens_seen": 37674720, + "step": 64950 + }, + { + "epoch": 9.674560619600834, + "grad_norm": 0.07780859619379044, + "learning_rate": 3.0732508449211373e-05, + "loss": 0.2096, + "num_input_tokens_seen": 37677536, + "step": 64955 + }, + { + "epoch": 9.675305332141793, + "grad_norm": 0.0022733507212251425, + "learning_rate": 3.0729345557842184e-05, + "loss": 0.0035, + "num_input_tokens_seen": 37680192, + "step": 64960 + }, + { + "epoch": 9.676050044682752, + "grad_norm": 0.8627985119819641, + "learning_rate": 3.072618256968153e-05, + "loss": 0.0537, + "num_input_tokens_seen": 37684064, + "step": 64965 + }, + { + "epoch": 9.676794757223712, + "grad_norm": 0.026601234450936317, + "learning_rate": 3.072301948478283e-05, + "loss": 0.109, + "num_input_tokens_seen": 37686848, + "step": 64970 + }, + { + "epoch": 9.677539469764671, + "grad_norm": 0.05738958343863487, + "learning_rate": 3.0719856303199526e-05, + "loss": 0.2544, + "num_input_tokens_seen": 37689824, + "step": 64975 + }, + { + "epoch": 9.67828418230563, + "grad_norm": 0.04345190152525902, + "learning_rate": 3.071669302498508e-05, + "loss": 0.2268, + "num_input_tokens_seen": 37693184, + "step": 64980 + }, + { + "epoch": 9.679028894846589, + "grad_norm": 0.003988323733210564, + "learning_rate": 3.07135296501929e-05, + "loss": 0.1011, + "num_input_tokens_seen": 37696256, + "step": 64985 + }, + { + "epoch": 9.679773607387549, + "grad_norm": 0.004646615125238895, + "learning_rate": 3.071036617887645e-05, + "loss": 0.0007, + "num_input_tokens_seen": 37698912, + "step": 64990 + }, + { + "epoch": 9.680518319928508, + "grad_norm": 0.001306200516410172, + "learning_rate": 3.070720261108917e-05, + "loss": 0.1419, + "num_input_tokens_seen": 37702016, + "step": 64995 + }, + { + "epoch": 9.681263032469467, + "grad_norm": 0.006554764695465565, + "learning_rate": 3.07040389468845e-05, + "loss": 0.01, + "num_input_tokens_seen": 37704768, + "step": 65000 + }, + { + "epoch": 9.682007745010425, + "grad_norm": 0.17377568781375885, + "learning_rate": 3.070087518631589e-05, + "loss": 0.0011, + "num_input_tokens_seen": 37707424, + "step": 65005 + }, + { + "epoch": 9.682752457551386, + "grad_norm": 10.871663093566895, + "learning_rate": 3.069771132943679e-05, + "loss": 0.4985, + "num_input_tokens_seen": 37710400, + "step": 65010 + }, + { + "epoch": 9.683497170092345, + "grad_norm": 25.97916030883789, + "learning_rate": 3.069454737630064e-05, + "loss": 0.1944, + "num_input_tokens_seen": 37712928, + "step": 65015 + }, + { + "epoch": 9.684241882633303, + "grad_norm": 18.74098014831543, + "learning_rate": 3.069138332696091e-05, + "loss": 0.1286, + "num_input_tokens_seen": 37716128, + "step": 65020 + }, + { + "epoch": 9.684986595174262, + "grad_norm": 6.533641815185547, + "learning_rate": 3.0688219181471036e-05, + "loss": 0.2476, + "num_input_tokens_seen": 37718912, + "step": 65025 + }, + { + "epoch": 9.685731307715223, + "grad_norm": 0.03230040892958641, + "learning_rate": 3.0685054939884485e-05, + "loss": 0.0978, + "num_input_tokens_seen": 37722304, + "step": 65030 + }, + { + "epoch": 9.686476020256181, + "grad_norm": 0.121390700340271, + "learning_rate": 3.0681890602254704e-05, + "loss": 0.1371, + "num_input_tokens_seen": 37725248, + "step": 65035 + }, + { + "epoch": 9.68722073279714, + "grad_norm": 2.839351177215576, + "learning_rate": 3.067872616863516e-05, + "loss": 0.0031, + "num_input_tokens_seen": 37728224, + "step": 65040 + }, + { + "epoch": 9.687965445338099, + "grad_norm": 0.054248958826065063, + "learning_rate": 3.0675561639079306e-05, + "loss": 0.0124, + "num_input_tokens_seen": 37731072, + "step": 65045 + }, + { + "epoch": 9.688710157879058, + "grad_norm": 0.13029949367046356, + "learning_rate": 3.0672397013640605e-05, + "loss": 0.0493, + "num_input_tokens_seen": 37734240, + "step": 65050 + }, + { + "epoch": 9.689454870420018, + "grad_norm": 0.03639528527855873, + "learning_rate": 3.066923229237253e-05, + "loss": 0.0249, + "num_input_tokens_seen": 37736992, + "step": 65055 + }, + { + "epoch": 9.690199582960977, + "grad_norm": 0.0028020513709634542, + "learning_rate": 3.0666067475328534e-05, + "loss": 0.1383, + "num_input_tokens_seen": 37739712, + "step": 65060 + }, + { + "epoch": 9.690944295501936, + "grad_norm": 0.008756624534726143, + "learning_rate": 3.066290256256208e-05, + "loss": 0.0317, + "num_input_tokens_seen": 37742720, + "step": 65065 + }, + { + "epoch": 9.691689008042896, + "grad_norm": 1.7925971746444702, + "learning_rate": 3.065973755412665e-05, + "loss": 0.1526, + "num_input_tokens_seen": 37745696, + "step": 65070 + }, + { + "epoch": 9.692433720583855, + "grad_norm": 0.14447146654129028, + "learning_rate": 3.06565724500757e-05, + "loss": 0.023, + "num_input_tokens_seen": 37748416, + "step": 65075 + }, + { + "epoch": 9.693178433124814, + "grad_norm": 13.541803359985352, + "learning_rate": 3.0653407250462716e-05, + "loss": 0.083, + "num_input_tokens_seen": 37751360, + "step": 65080 + }, + { + "epoch": 9.693923145665773, + "grad_norm": 35.81514358520508, + "learning_rate": 3.065024195534116e-05, + "loss": 0.0371, + "num_input_tokens_seen": 37754272, + "step": 65085 + }, + { + "epoch": 9.694667858206731, + "grad_norm": 0.016976764425635338, + "learning_rate": 3.06470765647645e-05, + "loss": 0.1675, + "num_input_tokens_seen": 37757088, + "step": 65090 + }, + { + "epoch": 9.695412570747692, + "grad_norm": 0.003033509012311697, + "learning_rate": 3.064391107878623e-05, + "loss": 0.212, + "num_input_tokens_seen": 37760128, + "step": 65095 + }, + { + "epoch": 9.69615728328865, + "grad_norm": 126.42503356933594, + "learning_rate": 3.064074549745982e-05, + "loss": 0.1031, + "num_input_tokens_seen": 37762976, + "step": 65100 + }, + { + "epoch": 9.69690199582961, + "grad_norm": 21.391620635986328, + "learning_rate": 3.063757982083874e-05, + "loss": 0.3123, + "num_input_tokens_seen": 37765920, + "step": 65105 + }, + { + "epoch": 9.697646708370568, + "grad_norm": 0.07115144282579422, + "learning_rate": 3.063441404897648e-05, + "loss": 0.0008, + "num_input_tokens_seen": 37768864, + "step": 65110 + }, + { + "epoch": 9.698391420911529, + "grad_norm": 63.78565216064453, + "learning_rate": 3.063124818192652e-05, + "loss": 0.2397, + "num_input_tokens_seen": 37772000, + "step": 65115 + }, + { + "epoch": 9.699136133452487, + "grad_norm": 0.17754113674163818, + "learning_rate": 3.062808221974235e-05, + "loss": 0.1906, + "num_input_tokens_seen": 37774624, + "step": 65120 + }, + { + "epoch": 9.699880845993446, + "grad_norm": 0.12051302194595337, + "learning_rate": 3.062491616247745e-05, + "loss": 0.0012, + "num_input_tokens_seen": 37777536, + "step": 65125 + }, + { + "epoch": 9.700625558534405, + "grad_norm": 0.004894267302006483, + "learning_rate": 3.0621750010185316e-05, + "loss": 0.0842, + "num_input_tokens_seen": 37780480, + "step": 65130 + }, + { + "epoch": 9.701370271075366, + "grad_norm": 0.009168146178126335, + "learning_rate": 3.0618583762919417e-05, + "loss": 0.3229, + "num_input_tokens_seen": 37783552, + "step": 65135 + }, + { + "epoch": 9.702114983616324, + "grad_norm": 1.0049036741256714, + "learning_rate": 3.0615417420733264e-05, + "loss": 0.0022, + "num_input_tokens_seen": 37786432, + "step": 65140 + }, + { + "epoch": 9.702859696157283, + "grad_norm": 122.65001678466797, + "learning_rate": 3.0612250983680336e-05, + "loss": 0.0393, + "num_input_tokens_seen": 37789408, + "step": 65145 + }, + { + "epoch": 9.703604408698242, + "grad_norm": 0.008294561877846718, + "learning_rate": 3.060908445181413e-05, + "loss": 0.1251, + "num_input_tokens_seen": 37792448, + "step": 65150 + }, + { + "epoch": 9.704349121239202, + "grad_norm": 12.381872177124023, + "learning_rate": 3.060591782518815e-05, + "loss": 0.181, + "num_input_tokens_seen": 37795424, + "step": 65155 + }, + { + "epoch": 9.705093833780161, + "grad_norm": 0.08691179007291794, + "learning_rate": 3.060275110385588e-05, + "loss": 0.1098, + "num_input_tokens_seen": 37798208, + "step": 65160 + }, + { + "epoch": 9.70583854632112, + "grad_norm": 0.009751315228641033, + "learning_rate": 3.059958428787083e-05, + "loss": 0.1815, + "num_input_tokens_seen": 37801024, + "step": 65165 + }, + { + "epoch": 9.706583258862079, + "grad_norm": 0.14737434685230255, + "learning_rate": 3.059641737728649e-05, + "loss": 0.0936, + "num_input_tokens_seen": 37804160, + "step": 65170 + }, + { + "epoch": 9.70732797140304, + "grad_norm": 0.045496128499507904, + "learning_rate": 3.059325037215637e-05, + "loss": 0.0532, + "num_input_tokens_seen": 37807168, + "step": 65175 + }, + { + "epoch": 9.708072683943998, + "grad_norm": 0.11401446908712387, + "learning_rate": 3.059008327253396e-05, + "loss": 0.0014, + "num_input_tokens_seen": 37809760, + "step": 65180 + }, + { + "epoch": 9.708817396484957, + "grad_norm": 0.04910478740930557, + "learning_rate": 3.0586916078472785e-05, + "loss": 0.0438, + "num_input_tokens_seen": 37812800, + "step": 65185 + }, + { + "epoch": 9.709562109025915, + "grad_norm": 22.31898307800293, + "learning_rate": 3.058374879002634e-05, + "loss": 0.1321, + "num_input_tokens_seen": 37815584, + "step": 65190 + }, + { + "epoch": 9.710306821566876, + "grad_norm": 0.00041943779797293246, + "learning_rate": 3.0580581407248126e-05, + "loss": 0.0013, + "num_input_tokens_seen": 37818336, + "step": 65195 + }, + { + "epoch": 9.711051534107835, + "grad_norm": 17.04181480407715, + "learning_rate": 3.0577413930191666e-05, + "loss": 0.1871, + "num_input_tokens_seen": 37821120, + "step": 65200 + }, + { + "epoch": 9.711796246648793, + "grad_norm": 0.03840041160583496, + "learning_rate": 3.0574246358910474e-05, + "loss": 0.0647, + "num_input_tokens_seen": 37823872, + "step": 65205 + }, + { + "epoch": 9.712540959189752, + "grad_norm": 0.007996618747711182, + "learning_rate": 3.057107869345804e-05, + "loss": 0.1763, + "num_input_tokens_seen": 37826880, + "step": 65210 + }, + { + "epoch": 9.713285671730713, + "grad_norm": 0.21719056367874146, + "learning_rate": 3.0567910933887905e-05, + "loss": 0.0022, + "num_input_tokens_seen": 37829664, + "step": 65215 + }, + { + "epoch": 9.714030384271672, + "grad_norm": 61.17611312866211, + "learning_rate": 3.056474308025357e-05, + "loss": 0.0127, + "num_input_tokens_seen": 37832576, + "step": 65220 + }, + { + "epoch": 9.71477509681263, + "grad_norm": 0.8496868014335632, + "learning_rate": 3.056157513260856e-05, + "loss": 0.0368, + "num_input_tokens_seen": 37835424, + "step": 65225 + }, + { + "epoch": 9.715519809353589, + "grad_norm": 0.006399418227374554, + "learning_rate": 3.055840709100639e-05, + "loss": 0.1119, + "num_input_tokens_seen": 37838272, + "step": 65230 + }, + { + "epoch": 9.716264521894548, + "grad_norm": 5.604520797729492, + "learning_rate": 3.055523895550058e-05, + "loss": 0.1284, + "num_input_tokens_seen": 37841184, + "step": 65235 + }, + { + "epoch": 9.717009234435508, + "grad_norm": 0.04663851112127304, + "learning_rate": 3.055207072614465e-05, + "loss": 0.1271, + "num_input_tokens_seen": 37844192, + "step": 65240 + }, + { + "epoch": 9.717753946976467, + "grad_norm": 0.029965443536639214, + "learning_rate": 3.0548902402992134e-05, + "loss": 0.0002, + "num_input_tokens_seen": 37846848, + "step": 65245 + }, + { + "epoch": 9.718498659517426, + "grad_norm": 0.001798527780920267, + "learning_rate": 3.0545733986096545e-05, + "loss": 0.0004, + "num_input_tokens_seen": 37850144, + "step": 65250 + }, + { + "epoch": 9.719243372058386, + "grad_norm": 0.6064496040344238, + "learning_rate": 3.054256547551142e-05, + "loss": 0.0474, + "num_input_tokens_seen": 37852800, + "step": 65255 + }, + { + "epoch": 9.719988084599345, + "grad_norm": 0.005523071624338627, + "learning_rate": 3.0539396871290294e-05, + "loss": 0.0738, + "num_input_tokens_seen": 37855520, + "step": 65260 + }, + { + "epoch": 9.720732797140304, + "grad_norm": 0.003193669719621539, + "learning_rate": 3.053622817348668e-05, + "loss": 0.0001, + "num_input_tokens_seen": 37858112, + "step": 65265 + }, + { + "epoch": 9.721477509681263, + "grad_norm": 0.0070395153015851974, + "learning_rate": 3.053305938215411e-05, + "loss": 0.1591, + "num_input_tokens_seen": 37861152, + "step": 65270 + }, + { + "epoch": 9.722222222222221, + "grad_norm": 0.004723514895886183, + "learning_rate": 3.052989049734613e-05, + "loss": 0.0068, + "num_input_tokens_seen": 37864064, + "step": 65275 + }, + { + "epoch": 9.722966934763182, + "grad_norm": 28.208040237426758, + "learning_rate": 3.052672151911627e-05, + "loss": 0.0067, + "num_input_tokens_seen": 37866880, + "step": 65280 + }, + { + "epoch": 9.72371164730414, + "grad_norm": 4.680602073669434, + "learning_rate": 3.052355244751807e-05, + "loss": 0.0053, + "num_input_tokens_seen": 37870208, + "step": 65285 + }, + { + "epoch": 9.7244563598451, + "grad_norm": 0.1645456701517105, + "learning_rate": 3.052038328260507e-05, + "loss": 0.0003, + "num_input_tokens_seen": 37872960, + "step": 65290 + }, + { + "epoch": 9.725201072386058, + "grad_norm": 0.019075218588113785, + "learning_rate": 3.05172140244308e-05, + "loss": 0.0017, + "num_input_tokens_seen": 37875936, + "step": 65295 + }, + { + "epoch": 9.725945784927019, + "grad_norm": 0.09069263190031052, + "learning_rate": 3.051404467304881e-05, + "loss": 0.2027, + "num_input_tokens_seen": 37878816, + "step": 65300 + }, + { + "epoch": 9.726690497467978, + "grad_norm": 0.0017443826654925942, + "learning_rate": 3.051087522851263e-05, + "loss": 0.0005, + "num_input_tokens_seen": 37881536, + "step": 65305 + }, + { + "epoch": 9.727435210008936, + "grad_norm": 8.019673347473145, + "learning_rate": 3.050770569087582e-05, + "loss": 0.0055, + "num_input_tokens_seen": 37884448, + "step": 65310 + }, + { + "epoch": 9.728179922549895, + "grad_norm": 24.79522132873535, + "learning_rate": 3.0504536060191917e-05, + "loss": 0.1133, + "num_input_tokens_seen": 37887296, + "step": 65315 + }, + { + "epoch": 9.728924635090856, + "grad_norm": 0.028006507083773613, + "learning_rate": 3.0501366336514477e-05, + "loss": 0.0634, + "num_input_tokens_seen": 37890016, + "step": 65320 + }, + { + "epoch": 9.729669347631814, + "grad_norm": 22.658018112182617, + "learning_rate": 3.0498196519897044e-05, + "loss": 0.2536, + "num_input_tokens_seen": 37892832, + "step": 65325 + }, + { + "epoch": 9.730414060172773, + "grad_norm": 0.6370254755020142, + "learning_rate": 3.0495026610393168e-05, + "loss": 0.0039, + "num_input_tokens_seen": 37895712, + "step": 65330 + }, + { + "epoch": 9.731158772713732, + "grad_norm": 5.355196952819824, + "learning_rate": 3.0491856608056403e-05, + "loss": 0.0459, + "num_input_tokens_seen": 37898496, + "step": 65335 + }, + { + "epoch": 9.731903485254692, + "grad_norm": 0.002608593087643385, + "learning_rate": 3.0488686512940297e-05, + "loss": 0.0001, + "num_input_tokens_seen": 37901248, + "step": 65340 + }, + { + "epoch": 9.732648197795651, + "grad_norm": 0.014685526490211487, + "learning_rate": 3.0485516325098413e-05, + "loss": 0.3757, + "num_input_tokens_seen": 37904352, + "step": 65345 + }, + { + "epoch": 9.73339291033661, + "grad_norm": 0.007068927399814129, + "learning_rate": 3.0482346044584305e-05, + "loss": 0.0064, + "num_input_tokens_seen": 37907296, + "step": 65350 + }, + { + "epoch": 9.734137622877569, + "grad_norm": 0.004812837578356266, + "learning_rate": 3.047917567145153e-05, + "loss": 0.1041, + "num_input_tokens_seen": 37909920, + "step": 65355 + }, + { + "epoch": 9.73488233541853, + "grad_norm": 0.01529551763087511, + "learning_rate": 3.0476005205753666e-05, + "loss": 0.305, + "num_input_tokens_seen": 37912832, + "step": 65360 + }, + { + "epoch": 9.735627047959488, + "grad_norm": 0.008831209503114223, + "learning_rate": 3.047283464754425e-05, + "loss": 0.0977, + "num_input_tokens_seen": 37915392, + "step": 65365 + }, + { + "epoch": 9.736371760500447, + "grad_norm": 0.03469571843743324, + "learning_rate": 3.0469663996876853e-05, + "loss": 0.1233, + "num_input_tokens_seen": 37918368, + "step": 65370 + }, + { + "epoch": 9.737116473041405, + "grad_norm": 0.0006953368429094553, + "learning_rate": 3.046649325380504e-05, + "loss": 0.0018, + "num_input_tokens_seen": 37920992, + "step": 65375 + }, + { + "epoch": 9.737861185582366, + "grad_norm": 85.65571594238281, + "learning_rate": 3.0463322418382384e-05, + "loss": 0.512, + "num_input_tokens_seen": 37923968, + "step": 65380 + }, + { + "epoch": 9.738605898123325, + "grad_norm": 0.01867721416056156, + "learning_rate": 3.0460151490662442e-05, + "loss": 0.0004, + "num_input_tokens_seen": 37926752, + "step": 65385 + }, + { + "epoch": 9.739350610664284, + "grad_norm": 15.315540313720703, + "learning_rate": 3.0456980470698803e-05, + "loss": 0.1825, + "num_input_tokens_seen": 37929760, + "step": 65390 + }, + { + "epoch": 9.740095323205242, + "grad_norm": 4.188833713531494, + "learning_rate": 3.0453809358545016e-05, + "loss": 0.104, + "num_input_tokens_seen": 37932416, + "step": 65395 + }, + { + "epoch": 9.740840035746203, + "grad_norm": 0.05183049663901329, + "learning_rate": 3.0450638154254664e-05, + "loss": 0.0266, + "num_input_tokens_seen": 37935104, + "step": 65400 + }, + { + "epoch": 9.741584748287162, + "grad_norm": 11.769088745117188, + "learning_rate": 3.0447466857881318e-05, + "loss": 0.0155, + "num_input_tokens_seen": 37937920, + "step": 65405 + }, + { + "epoch": 9.74232946082812, + "grad_norm": 3.828272581100464, + "learning_rate": 3.0444295469478557e-05, + "loss": 0.0055, + "num_input_tokens_seen": 37940960, + "step": 65410 + }, + { + "epoch": 9.743074173369079, + "grad_norm": 0.010304938070476055, + "learning_rate": 3.0441123989099958e-05, + "loss": 0.1038, + "num_input_tokens_seen": 37943872, + "step": 65415 + }, + { + "epoch": 9.743818885910038, + "grad_norm": 0.2859608232975006, + "learning_rate": 3.0437952416799097e-05, + "loss": 0.0044, + "num_input_tokens_seen": 37946784, + "step": 65420 + }, + { + "epoch": 9.744563598450998, + "grad_norm": 0.006256405729800463, + "learning_rate": 3.0434780752629567e-05, + "loss": 0.0786, + "num_input_tokens_seen": 37949696, + "step": 65425 + }, + { + "epoch": 9.745308310991957, + "grad_norm": 0.06152963638305664, + "learning_rate": 3.043160899664493e-05, + "loss": 0.1423, + "num_input_tokens_seen": 37952736, + "step": 65430 + }, + { + "epoch": 9.746053023532916, + "grad_norm": 23.24112319946289, + "learning_rate": 3.042843714889878e-05, + "loss": 0.0052, + "num_input_tokens_seen": 37955616, + "step": 65435 + }, + { + "epoch": 9.746797736073875, + "grad_norm": 86.29308319091797, + "learning_rate": 3.0425265209444704e-05, + "loss": 0.209, + "num_input_tokens_seen": 37958496, + "step": 65440 + }, + { + "epoch": 9.747542448614835, + "grad_norm": 0.2754695415496826, + "learning_rate": 3.0422093178336287e-05, + "loss": 0.0878, + "num_input_tokens_seen": 37961376, + "step": 65445 + }, + { + "epoch": 9.748287161155794, + "grad_norm": 0.004871981684118509, + "learning_rate": 3.0418921055627115e-05, + "loss": 0.0856, + "num_input_tokens_seen": 37964288, + "step": 65450 + }, + { + "epoch": 9.749031873696753, + "grad_norm": 1.4056442975997925, + "learning_rate": 3.0415748841370778e-05, + "loss": 0.0024, + "num_input_tokens_seen": 37967200, + "step": 65455 + }, + { + "epoch": 9.749776586237711, + "grad_norm": 136.77418518066406, + "learning_rate": 3.0412576535620873e-05, + "loss": 0.3626, + "num_input_tokens_seen": 37969952, + "step": 65460 + }, + { + "epoch": 9.750521298778672, + "grad_norm": 0.04388931766152382, + "learning_rate": 3.0409404138430986e-05, + "loss": 0.0052, + "num_input_tokens_seen": 37973088, + "step": 65465 + }, + { + "epoch": 9.75126601131963, + "grad_norm": 0.012837350368499756, + "learning_rate": 3.040623164985471e-05, + "loss": 0.0381, + "num_input_tokens_seen": 37976416, + "step": 65470 + }, + { + "epoch": 9.75201072386059, + "grad_norm": 0.004668270703405142, + "learning_rate": 3.040305906994565e-05, + "loss": 0.0015, + "num_input_tokens_seen": 37979424, + "step": 65475 + }, + { + "epoch": 9.752755436401548, + "grad_norm": 0.009088058955967426, + "learning_rate": 3.0399886398757394e-05, + "loss": 0.033, + "num_input_tokens_seen": 37982432, + "step": 65480 + }, + { + "epoch": 9.753500148942509, + "grad_norm": 0.06923465430736542, + "learning_rate": 3.0396713636343553e-05, + "loss": 0.1102, + "num_input_tokens_seen": 37985632, + "step": 65485 + }, + { + "epoch": 9.754244861483468, + "grad_norm": 0.0015517983119934797, + "learning_rate": 3.039354078275771e-05, + "loss": 0.2025, + "num_input_tokens_seen": 37988320, + "step": 65490 + }, + { + "epoch": 9.754989574024426, + "grad_norm": 0.009555641561746597, + "learning_rate": 3.039036783805349e-05, + "loss": 0.0324, + "num_input_tokens_seen": 37991168, + "step": 65495 + }, + { + "epoch": 9.755734286565385, + "grad_norm": 15.411211013793945, + "learning_rate": 3.0387194802284474e-05, + "loss": 0.0027, + "num_input_tokens_seen": 37994272, + "step": 65500 + }, + { + "epoch": 9.756478999106346, + "grad_norm": 42.87919235229492, + "learning_rate": 3.0384021675504283e-05, + "loss": 0.1845, + "num_input_tokens_seen": 37996960, + "step": 65505 + }, + { + "epoch": 9.757223711647304, + "grad_norm": 0.015682918950915337, + "learning_rate": 3.038084845776651e-05, + "loss": 0.1419, + "num_input_tokens_seen": 37999808, + "step": 65510 + }, + { + "epoch": 9.757968424188263, + "grad_norm": 1.2968723773956299, + "learning_rate": 3.0377675149124772e-05, + "loss": 0.2834, + "num_input_tokens_seen": 38002784, + "step": 65515 + }, + { + "epoch": 9.758713136729222, + "grad_norm": 172.1160888671875, + "learning_rate": 3.0374501749632684e-05, + "loss": 0.4596, + "num_input_tokens_seen": 38005632, + "step": 65520 + }, + { + "epoch": 9.759457849270182, + "grad_norm": 0.017762448638677597, + "learning_rate": 3.037132825934385e-05, + "loss": 0.0287, + "num_input_tokens_seen": 38008288, + "step": 65525 + }, + { + "epoch": 9.760202561811141, + "grad_norm": 0.35387787222862244, + "learning_rate": 3.0368154678311888e-05, + "loss": 0.5838, + "num_input_tokens_seen": 38011712, + "step": 65530 + }, + { + "epoch": 9.7609472743521, + "grad_norm": 0.006728148087859154, + "learning_rate": 3.0364981006590404e-05, + "loss": 0.0005, + "num_input_tokens_seen": 38014784, + "step": 65535 + }, + { + "epoch": 9.761691986893059, + "grad_norm": 0.001692909630946815, + "learning_rate": 3.0361807244233016e-05, + "loss": 0.2302, + "num_input_tokens_seen": 38017472, + "step": 65540 + }, + { + "epoch": 9.76243669943402, + "grad_norm": 0.09973200410604477, + "learning_rate": 3.0358633391293346e-05, + "loss": 0.0091, + "num_input_tokens_seen": 38020064, + "step": 65545 + }, + { + "epoch": 9.763181411974978, + "grad_norm": 23.02698516845703, + "learning_rate": 3.0355459447825014e-05, + "loss": 0.5625, + "num_input_tokens_seen": 38023008, + "step": 65550 + }, + { + "epoch": 9.763926124515937, + "grad_norm": 3.7659685611724854, + "learning_rate": 3.0352285413881636e-05, + "loss": 0.095, + "num_input_tokens_seen": 38026016, + "step": 65555 + }, + { + "epoch": 9.764670837056896, + "grad_norm": 0.01149707194417715, + "learning_rate": 3.0349111289516834e-05, + "loss": 0.1425, + "num_input_tokens_seen": 38028992, + "step": 65560 + }, + { + "epoch": 9.765415549597854, + "grad_norm": 155.2217254638672, + "learning_rate": 3.0345937074784235e-05, + "loss": 0.2733, + "num_input_tokens_seen": 38032064, + "step": 65565 + }, + { + "epoch": 9.766160262138815, + "grad_norm": 0.030757736414670944, + "learning_rate": 3.0342762769737464e-05, + "loss": 0.0978, + "num_input_tokens_seen": 38035296, + "step": 65570 + }, + { + "epoch": 9.766904974679774, + "grad_norm": 34.08521270751953, + "learning_rate": 3.0339588374430146e-05, + "loss": 0.1047, + "num_input_tokens_seen": 38038208, + "step": 65575 + }, + { + "epoch": 9.767649687220732, + "grad_norm": 0.040358759462833405, + "learning_rate": 3.0336413888915903e-05, + "loss": 0.1504, + "num_input_tokens_seen": 38040992, + "step": 65580 + }, + { + "epoch": 9.768394399761693, + "grad_norm": 0.0551491342484951, + "learning_rate": 3.0333239313248372e-05, + "loss": 0.0008, + "num_input_tokens_seen": 38044352, + "step": 65585 + }, + { + "epoch": 9.769139112302652, + "grad_norm": 0.00341291562654078, + "learning_rate": 3.0330064647481195e-05, + "loss": 0.0175, + "num_input_tokens_seen": 38047136, + "step": 65590 + }, + { + "epoch": 9.76988382484361, + "grad_norm": 0.08795361965894699, + "learning_rate": 3.032688989166798e-05, + "loss": 0.1242, + "num_input_tokens_seen": 38049888, + "step": 65595 + }, + { + "epoch": 9.77062853738457, + "grad_norm": 0.010806486010551453, + "learning_rate": 3.0323715045862382e-05, + "loss": 0.0474, + "num_input_tokens_seen": 38052864, + "step": 65600 + }, + { + "epoch": 9.771373249925528, + "grad_norm": 0.07494901865720749, + "learning_rate": 3.0320540110118022e-05, + "loss": 0.0007, + "num_input_tokens_seen": 38055744, + "step": 65605 + }, + { + "epoch": 9.772117962466488, + "grad_norm": 0.0009399144328199327, + "learning_rate": 3.031736508448855e-05, + "loss": 0.0008, + "num_input_tokens_seen": 38058528, + "step": 65610 + }, + { + "epoch": 9.772862675007447, + "grad_norm": 26.8101806640625, + "learning_rate": 3.03141899690276e-05, + "loss": 0.1298, + "num_input_tokens_seen": 38061440, + "step": 65615 + }, + { + "epoch": 9.773607387548406, + "grad_norm": 0.06557509303092957, + "learning_rate": 3.031101476378881e-05, + "loss": 0.0109, + "num_input_tokens_seen": 38064512, + "step": 65620 + }, + { + "epoch": 9.774352100089365, + "grad_norm": 0.0441187284886837, + "learning_rate": 3.0307839468825826e-05, + "loss": 0.0285, + "num_input_tokens_seen": 38067360, + "step": 65625 + }, + { + "epoch": 9.775096812630325, + "grad_norm": 32.23395538330078, + "learning_rate": 3.0304664084192286e-05, + "loss": 0.1799, + "num_input_tokens_seen": 38070528, + "step": 65630 + }, + { + "epoch": 9.775841525171284, + "grad_norm": 0.10231956839561462, + "learning_rate": 3.0301488609941837e-05, + "loss": 0.1171, + "num_input_tokens_seen": 38073536, + "step": 65635 + }, + { + "epoch": 9.776586237712243, + "grad_norm": 0.04691478610038757, + "learning_rate": 3.0298313046128123e-05, + "loss": 0.0002, + "num_input_tokens_seen": 38076288, + "step": 65640 + }, + { + "epoch": 9.777330950253202, + "grad_norm": 0.0022152734454721212, + "learning_rate": 3.0295137392804796e-05, + "loss": 0.2135, + "num_input_tokens_seen": 38079264, + "step": 65645 + }, + { + "epoch": 9.778075662794162, + "grad_norm": 0.10194437950849533, + "learning_rate": 3.0291961650025512e-05, + "loss": 0.1342, + "num_input_tokens_seen": 38082400, + "step": 65650 + }, + { + "epoch": 9.77882037533512, + "grad_norm": 0.014948057942092419, + "learning_rate": 3.0288785817843907e-05, + "loss": 0.1411, + "num_input_tokens_seen": 38085280, + "step": 65655 + }, + { + "epoch": 9.77956508787608, + "grad_norm": 0.06825580447912216, + "learning_rate": 3.028560989631365e-05, + "loss": 0.0003, + "num_input_tokens_seen": 38088288, + "step": 65660 + }, + { + "epoch": 9.780309800417038, + "grad_norm": 0.32815060019493103, + "learning_rate": 3.0282433885488375e-05, + "loss": 0.185, + "num_input_tokens_seen": 38090976, + "step": 65665 + }, + { + "epoch": 9.781054512957999, + "grad_norm": 0.014522423036396503, + "learning_rate": 3.0279257785421755e-05, + "loss": 0.2164, + "num_input_tokens_seen": 38093824, + "step": 65670 + }, + { + "epoch": 9.781799225498958, + "grad_norm": 0.01736290194094181, + "learning_rate": 3.0276081596167434e-05, + "loss": 0.1814, + "num_input_tokens_seen": 38097024, + "step": 65675 + }, + { + "epoch": 9.782543938039916, + "grad_norm": 0.7850860357284546, + "learning_rate": 3.027290531777908e-05, + "loss": 0.1397, + "num_input_tokens_seen": 38099936, + "step": 65680 + }, + { + "epoch": 9.783288650580875, + "grad_norm": 0.3985779583454132, + "learning_rate": 3.026972895031035e-05, + "loss": 0.1367, + "num_input_tokens_seen": 38102560, + "step": 65685 + }, + { + "epoch": 9.784033363121836, + "grad_norm": 0.08733303099870682, + "learning_rate": 3.026655249381491e-05, + "loss": 0.0003, + "num_input_tokens_seen": 38105600, + "step": 65690 + }, + { + "epoch": 9.784778075662794, + "grad_norm": 0.23240339756011963, + "learning_rate": 3.0263375948346416e-05, + "loss": 0.0012, + "num_input_tokens_seen": 38108672, + "step": 65695 + }, + { + "epoch": 9.785522788203753, + "grad_norm": 0.02151951938867569, + "learning_rate": 3.026019931395853e-05, + "loss": 0.1467, + "num_input_tokens_seen": 38111520, + "step": 65700 + }, + { + "epoch": 9.786267500744712, + "grad_norm": 0.004965507425367832, + "learning_rate": 3.0257022590704926e-05, + "loss": 0.0041, + "num_input_tokens_seen": 38114432, + "step": 65705 + }, + { + "epoch": 9.787012213285673, + "grad_norm": 0.06937293708324432, + "learning_rate": 3.0253845778639267e-05, + "loss": 0.0177, + "num_input_tokens_seen": 38116992, + "step": 65710 + }, + { + "epoch": 9.787756925826631, + "grad_norm": 0.0166781023144722, + "learning_rate": 3.0250668877815226e-05, + "loss": 0.0486, + "num_input_tokens_seen": 38120032, + "step": 65715 + }, + { + "epoch": 9.78850163836759, + "grad_norm": 0.02356860600411892, + "learning_rate": 3.024749188828647e-05, + "loss": 0.0871, + "num_input_tokens_seen": 38122976, + "step": 65720 + }, + { + "epoch": 9.789246350908549, + "grad_norm": 22.696809768676758, + "learning_rate": 3.024431481010667e-05, + "loss": 0.1311, + "num_input_tokens_seen": 38126016, + "step": 65725 + }, + { + "epoch": 9.78999106344951, + "grad_norm": 0.02867545560002327, + "learning_rate": 3.0241137643329508e-05, + "loss": 0.1174, + "num_input_tokens_seen": 38128992, + "step": 65730 + }, + { + "epoch": 9.790735775990468, + "grad_norm": 0.025309601798653603, + "learning_rate": 3.0237960388008647e-05, + "loss": 0.3134, + "num_input_tokens_seen": 38132000, + "step": 65735 + }, + { + "epoch": 9.791480488531427, + "grad_norm": 0.10961078852415085, + "learning_rate": 3.0234783044197767e-05, + "loss": 0.0008, + "num_input_tokens_seen": 38135104, + "step": 65740 + }, + { + "epoch": 9.792225201072386, + "grad_norm": 0.010147323831915855, + "learning_rate": 3.0231605611950548e-05, + "loss": 0.1601, + "num_input_tokens_seen": 38137952, + "step": 65745 + }, + { + "epoch": 9.792969913613344, + "grad_norm": 0.010746494866907597, + "learning_rate": 3.0228428091320672e-05, + "loss": 0.1289, + "num_input_tokens_seen": 38140672, + "step": 65750 + }, + { + "epoch": 9.793714626154305, + "grad_norm": 11.667522430419922, + "learning_rate": 3.0225250482361818e-05, + "loss": 0.1004, + "num_input_tokens_seen": 38143520, + "step": 65755 + }, + { + "epoch": 9.794459338695264, + "grad_norm": 5.157482147216797, + "learning_rate": 3.0222072785127663e-05, + "loss": 0.0076, + "num_input_tokens_seen": 38146336, + "step": 65760 + }, + { + "epoch": 9.795204051236222, + "grad_norm": 136.49087524414062, + "learning_rate": 3.02188949996719e-05, + "loss": 0.2944, + "num_input_tokens_seen": 38149152, + "step": 65765 + }, + { + "epoch": 9.795948763777183, + "grad_norm": 31.17504119873047, + "learning_rate": 3.021571712604821e-05, + "loss": 0.1618, + "num_input_tokens_seen": 38152064, + "step": 65770 + }, + { + "epoch": 9.796693476318142, + "grad_norm": 19.10595703125, + "learning_rate": 3.0212539164310276e-05, + "loss": 0.0963, + "num_input_tokens_seen": 38154976, + "step": 65775 + }, + { + "epoch": 9.7974381888591, + "grad_norm": 77.70594024658203, + "learning_rate": 3.0209361114511796e-05, + "loss": 0.2607, + "num_input_tokens_seen": 38157888, + "step": 65780 + }, + { + "epoch": 9.79818290140006, + "grad_norm": 0.06805060803890228, + "learning_rate": 3.0206182976706447e-05, + "loss": 0.0976, + "num_input_tokens_seen": 38160928, + "step": 65785 + }, + { + "epoch": 9.798927613941018, + "grad_norm": 0.022048640996217728, + "learning_rate": 3.0203004750947938e-05, + "loss": 0.0156, + "num_input_tokens_seen": 38163648, + "step": 65790 + }, + { + "epoch": 9.799672326481979, + "grad_norm": 0.2945498824119568, + "learning_rate": 3.0199826437289947e-05, + "loss": 0.0003, + "num_input_tokens_seen": 38166624, + "step": 65795 + }, + { + "epoch": 9.800417039022937, + "grad_norm": 51.25508117675781, + "learning_rate": 3.0196648035786173e-05, + "loss": 0.1171, + "num_input_tokens_seen": 38169600, + "step": 65800 + }, + { + "epoch": 9.801161751563896, + "grad_norm": 0.6140754222869873, + "learning_rate": 3.019346954649031e-05, + "loss": 0.2844, + "num_input_tokens_seen": 38172480, + "step": 65805 + }, + { + "epoch": 9.801906464104855, + "grad_norm": 0.053977347910404205, + "learning_rate": 3.0190290969456063e-05, + "loss": 0.0006, + "num_input_tokens_seen": 38175136, + "step": 65810 + }, + { + "epoch": 9.802651176645815, + "grad_norm": 48.88047790527344, + "learning_rate": 3.0187112304737125e-05, + "loss": 0.1366, + "num_input_tokens_seen": 38178112, + "step": 65815 + }, + { + "epoch": 9.803395889186774, + "grad_norm": 0.009500222280621529, + "learning_rate": 3.0183933552387188e-05, + "loss": 0.1045, + "num_input_tokens_seen": 38180864, + "step": 65820 + }, + { + "epoch": 9.804140601727733, + "grad_norm": 138.33441162109375, + "learning_rate": 3.0180754712459973e-05, + "loss": 0.1198, + "num_input_tokens_seen": 38183904, + "step": 65825 + }, + { + "epoch": 9.804885314268692, + "grad_norm": 0.013105213642120361, + "learning_rate": 3.0177575785009172e-05, + "loss": 0.0846, + "num_input_tokens_seen": 38186752, + "step": 65830 + }, + { + "epoch": 9.805630026809652, + "grad_norm": 0.0002147908671759069, + "learning_rate": 3.017439677008848e-05, + "loss": 0.0869, + "num_input_tokens_seen": 38189632, + "step": 65835 + }, + { + "epoch": 9.80637473935061, + "grad_norm": 0.12298328429460526, + "learning_rate": 3.0171217667751617e-05, + "loss": 0.0833, + "num_input_tokens_seen": 38192608, + "step": 65840 + }, + { + "epoch": 9.80711945189157, + "grad_norm": 0.0006175932358019054, + "learning_rate": 3.016803847805229e-05, + "loss": 0.0549, + "num_input_tokens_seen": 38195584, + "step": 65845 + }, + { + "epoch": 9.807864164432528, + "grad_norm": 0.0037202953826636076, + "learning_rate": 3.016485920104421e-05, + "loss": 0.0705, + "num_input_tokens_seen": 38198624, + "step": 65850 + }, + { + "epoch": 9.808608876973489, + "grad_norm": 0.004901621490716934, + "learning_rate": 3.0161679836781076e-05, + "loss": 0.0016, + "num_input_tokens_seen": 38201408, + "step": 65855 + }, + { + "epoch": 9.809353589514448, + "grad_norm": 0.0009160630870610476, + "learning_rate": 3.0158500385316612e-05, + "loss": 0.2248, + "num_input_tokens_seen": 38203968, + "step": 65860 + }, + { + "epoch": 9.810098302055406, + "grad_norm": 0.024708019569516182, + "learning_rate": 3.0155320846704526e-05, + "loss": 0.135, + "num_input_tokens_seen": 38207008, + "step": 65865 + }, + { + "epoch": 9.810843014596365, + "grad_norm": 0.0030300291255116463, + "learning_rate": 3.015214122099853e-05, + "loss": 0.0006, + "num_input_tokens_seen": 38209600, + "step": 65870 + }, + { + "epoch": 9.811587727137326, + "grad_norm": 0.005912299733608961, + "learning_rate": 3.0148961508252347e-05, + "loss": 0.0067, + "num_input_tokens_seen": 38212416, + "step": 65875 + }, + { + "epoch": 9.812332439678285, + "grad_norm": 23.597949981689453, + "learning_rate": 3.0145781708519692e-05, + "loss": 0.337, + "num_input_tokens_seen": 38215456, + "step": 65880 + }, + { + "epoch": 9.813077152219243, + "grad_norm": 53.9227294921875, + "learning_rate": 3.0142601821854288e-05, + "loss": 0.3686, + "num_input_tokens_seen": 38218624, + "step": 65885 + }, + { + "epoch": 9.813821864760202, + "grad_norm": 0.04203371703624725, + "learning_rate": 3.0139421848309852e-05, + "loss": 0.0004, + "num_input_tokens_seen": 38221440, + "step": 65890 + }, + { + "epoch": 9.814566577301163, + "grad_norm": 0.7058846950531006, + "learning_rate": 3.0136241787940107e-05, + "loss": 0.0008, + "num_input_tokens_seen": 38224608, + "step": 65895 + }, + { + "epoch": 9.815311289842121, + "grad_norm": 0.9300472736358643, + "learning_rate": 3.0133061640798776e-05, + "loss": 0.0012, + "num_input_tokens_seen": 38227520, + "step": 65900 + }, + { + "epoch": 9.81605600238308, + "grad_norm": 23.9614200592041, + "learning_rate": 3.0129881406939587e-05, + "loss": 0.148, + "num_input_tokens_seen": 38230176, + "step": 65905 + }, + { + "epoch": 9.816800714924039, + "grad_norm": 0.03881038352847099, + "learning_rate": 3.012670108641626e-05, + "loss": 0.1304, + "num_input_tokens_seen": 38233376, + "step": 65910 + }, + { + "epoch": 9.817545427465, + "grad_norm": 51.40338897705078, + "learning_rate": 3.012352067928253e-05, + "loss": 0.2071, + "num_input_tokens_seen": 38236416, + "step": 65915 + }, + { + "epoch": 9.818290140005958, + "grad_norm": 104.34991455078125, + "learning_rate": 3.0120340185592132e-05, + "loss": 0.2797, + "num_input_tokens_seen": 38239232, + "step": 65920 + }, + { + "epoch": 9.819034852546917, + "grad_norm": 0.006837480701506138, + "learning_rate": 3.0117159605398786e-05, + "loss": 0.1658, + "num_input_tokens_seen": 38242176, + "step": 65925 + }, + { + "epoch": 9.819779565087876, + "grad_norm": 0.2051246464252472, + "learning_rate": 3.0113978938756237e-05, + "loss": 0.0426, + "num_input_tokens_seen": 38245120, + "step": 65930 + }, + { + "epoch": 9.820524277628834, + "grad_norm": 0.43064141273498535, + "learning_rate": 3.0110798185718202e-05, + "loss": 0.0008, + "num_input_tokens_seen": 38248128, + "step": 65935 + }, + { + "epoch": 9.821268990169795, + "grad_norm": 12.544380187988281, + "learning_rate": 3.0107617346338422e-05, + "loss": 0.1205, + "num_input_tokens_seen": 38250976, + "step": 65940 + }, + { + "epoch": 9.822013702710754, + "grad_norm": 0.030562207102775574, + "learning_rate": 3.0104436420670644e-05, + "loss": 0.0034, + "num_input_tokens_seen": 38253696, + "step": 65945 + }, + { + "epoch": 9.822758415251712, + "grad_norm": 0.037827733904123306, + "learning_rate": 3.0101255408768603e-05, + "loss": 0.3406, + "num_input_tokens_seen": 38256384, + "step": 65950 + }, + { + "epoch": 9.823503127792671, + "grad_norm": 256.2959899902344, + "learning_rate": 3.0098074310686042e-05, + "loss": 0.1326, + "num_input_tokens_seen": 38259424, + "step": 65955 + }, + { + "epoch": 9.824247840333632, + "grad_norm": 0.002583075547590852, + "learning_rate": 3.0094893126476686e-05, + "loss": 0.0003, + "num_input_tokens_seen": 38262016, + "step": 65960 + }, + { + "epoch": 9.82499255287459, + "grad_norm": 45.647010803222656, + "learning_rate": 3.0091711856194295e-05, + "loss": 0.4374, + "num_input_tokens_seen": 38264928, + "step": 65965 + }, + { + "epoch": 9.82573726541555, + "grad_norm": 19.883647918701172, + "learning_rate": 3.0088530499892605e-05, + "loss": 0.5667, + "num_input_tokens_seen": 38267872, + "step": 65970 + }, + { + "epoch": 9.826481977956508, + "grad_norm": 0.014071966521441936, + "learning_rate": 3.008534905762536e-05, + "loss": 0.0645, + "num_input_tokens_seen": 38270688, + "step": 65975 + }, + { + "epoch": 9.827226690497469, + "grad_norm": 0.011908230371773243, + "learning_rate": 3.0082167529446314e-05, + "loss": 0.045, + "num_input_tokens_seen": 38273472, + "step": 65980 + }, + { + "epoch": 9.827971403038427, + "grad_norm": 0.05239265784621239, + "learning_rate": 3.0078985915409214e-05, + "loss": 0.0008, + "num_input_tokens_seen": 38276256, + "step": 65985 + }, + { + "epoch": 9.828716115579386, + "grad_norm": 0.01457543857395649, + "learning_rate": 3.0075804215567817e-05, + "loss": 0.2858, + "num_input_tokens_seen": 38279520, + "step": 65990 + }, + { + "epoch": 9.829460828120345, + "grad_norm": 0.11893080919981003, + "learning_rate": 3.0072622429975856e-05, + "loss": 0.0523, + "num_input_tokens_seen": 38282304, + "step": 65995 + }, + { + "epoch": 9.830205540661305, + "grad_norm": 0.006738932803273201, + "learning_rate": 3.006944055868709e-05, + "loss": 0.0003, + "num_input_tokens_seen": 38285248, + "step": 66000 + }, + { + "epoch": 9.830950253202264, + "grad_norm": 0.010440049692988396, + "learning_rate": 3.0066258601755288e-05, + "loss": 0.1138, + "num_input_tokens_seen": 38288192, + "step": 66005 + }, + { + "epoch": 9.831694965743223, + "grad_norm": 7.863283634185791, + "learning_rate": 3.0063076559234192e-05, + "loss": 0.0867, + "num_input_tokens_seen": 38291104, + "step": 66010 + }, + { + "epoch": 9.832439678284182, + "grad_norm": 0.10868798196315765, + "learning_rate": 3.0059894431177565e-05, + "loss": 0.1088, + "num_input_tokens_seen": 38293856, + "step": 66015 + }, + { + "epoch": 9.833184390825142, + "grad_norm": 38.03788757324219, + "learning_rate": 3.0056712217639165e-05, + "loss": 0.0747, + "num_input_tokens_seen": 38296736, + "step": 66020 + }, + { + "epoch": 9.833929103366101, + "grad_norm": 0.024600770324468613, + "learning_rate": 3.005352991867275e-05, + "loss": 0.0005, + "num_input_tokens_seen": 38300128, + "step": 66025 + }, + { + "epoch": 9.83467381590706, + "grad_norm": 2.848536491394043, + "learning_rate": 3.0050347534332084e-05, + "loss": 0.0019, + "num_input_tokens_seen": 38302816, + "step": 66030 + }, + { + "epoch": 9.835418528448018, + "grad_norm": 5.408174991607666, + "learning_rate": 3.004716506467093e-05, + "loss": 0.0473, + "num_input_tokens_seen": 38305600, + "step": 66035 + }, + { + "epoch": 9.836163240988979, + "grad_norm": 0.11295246332883835, + "learning_rate": 3.0043982509743052e-05, + "loss": 0.1538, + "num_input_tokens_seen": 38308800, + "step": 66040 + }, + { + "epoch": 9.836907953529938, + "grad_norm": 3.2330634593963623, + "learning_rate": 3.004079986960221e-05, + "loss": 0.0033, + "num_input_tokens_seen": 38311968, + "step": 66045 + }, + { + "epoch": 9.837652666070897, + "grad_norm": 0.021895499899983406, + "learning_rate": 3.0037617144302188e-05, + "loss": 0.0004, + "num_input_tokens_seen": 38314592, + "step": 66050 + }, + { + "epoch": 9.838397378611855, + "grad_norm": 16.04832649230957, + "learning_rate": 3.0034434333896737e-05, + "loss": 0.2381, + "num_input_tokens_seen": 38317504, + "step": 66055 + }, + { + "epoch": 9.839142091152816, + "grad_norm": 77.79646301269531, + "learning_rate": 3.003125143843964e-05, + "loss": 0.0526, + "num_input_tokens_seen": 38320256, + "step": 66060 + }, + { + "epoch": 9.839886803693775, + "grad_norm": 0.007873980328440666, + "learning_rate": 3.002806845798466e-05, + "loss": 0.0004, + "num_input_tokens_seen": 38322848, + "step": 66065 + }, + { + "epoch": 9.840631516234733, + "grad_norm": 0.18431980907917023, + "learning_rate": 3.002488539258557e-05, + "loss": 0.063, + "num_input_tokens_seen": 38325856, + "step": 66070 + }, + { + "epoch": 9.841376228775692, + "grad_norm": 29.459850311279297, + "learning_rate": 3.0021702242296153e-05, + "loss": 0.1524, + "num_input_tokens_seen": 38328800, + "step": 66075 + }, + { + "epoch": 9.842120941316653, + "grad_norm": 0.031366314738988876, + "learning_rate": 3.0018519007170177e-05, + "loss": 0.0088, + "num_input_tokens_seen": 38331904, + "step": 66080 + }, + { + "epoch": 9.842865653857611, + "grad_norm": 19.348241806030273, + "learning_rate": 3.0015335687261425e-05, + "loss": 0.2901, + "num_input_tokens_seen": 38334912, + "step": 66085 + }, + { + "epoch": 9.84361036639857, + "grad_norm": 0.1761581301689148, + "learning_rate": 3.001215228262368e-05, + "loss": 0.129, + "num_input_tokens_seen": 38337696, + "step": 66090 + }, + { + "epoch": 9.844355078939529, + "grad_norm": 0.06965839862823486, + "learning_rate": 3.000896879331071e-05, + "loss": 0.0213, + "num_input_tokens_seen": 38340512, + "step": 66095 + }, + { + "epoch": 9.84509979148049, + "grad_norm": 46.33502197265625, + "learning_rate": 3.0005785219376304e-05, + "loss": 0.3781, + "num_input_tokens_seen": 38343680, + "step": 66100 + }, + { + "epoch": 9.845844504021448, + "grad_norm": 0.01897071674466133, + "learning_rate": 3.000260156087424e-05, + "loss": 0.1115, + "num_input_tokens_seen": 38346368, + "step": 66105 + }, + { + "epoch": 9.846589216562407, + "grad_norm": 0.0012065208284184337, + "learning_rate": 2.999941781785831e-05, + "loss": 0.3783, + "num_input_tokens_seen": 38349216, + "step": 66110 + }, + { + "epoch": 9.847333929103366, + "grad_norm": 0.011947905644774437, + "learning_rate": 2.9996233990382296e-05, + "loss": 0.0033, + "num_input_tokens_seen": 38351840, + "step": 66115 + }, + { + "epoch": 9.848078641644324, + "grad_norm": 0.17720256745815277, + "learning_rate": 2.9993050078499997e-05, + "loss": 0.0015, + "num_input_tokens_seen": 38354816, + "step": 66120 + }, + { + "epoch": 9.848823354185285, + "grad_norm": 6.566088676452637, + "learning_rate": 2.9989866082265177e-05, + "loss": 0.0996, + "num_input_tokens_seen": 38357696, + "step": 66125 + }, + { + "epoch": 9.849568066726244, + "grad_norm": 0.006657259538769722, + "learning_rate": 2.9986682001731647e-05, + "loss": 0.0177, + "num_input_tokens_seen": 38360704, + "step": 66130 + }, + { + "epoch": 9.850312779267203, + "grad_norm": 29.224721908569336, + "learning_rate": 2.99834978369532e-05, + "loss": 0.2215, + "num_input_tokens_seen": 38363776, + "step": 66135 + }, + { + "epoch": 9.851057491808161, + "grad_norm": 38.611915588378906, + "learning_rate": 2.998031358798361e-05, + "loss": 0.0958, + "num_input_tokens_seen": 38366816, + "step": 66140 + }, + { + "epoch": 9.851802204349122, + "grad_norm": 106.71014404296875, + "learning_rate": 2.997712925487669e-05, + "loss": 0.1461, + "num_input_tokens_seen": 38369920, + "step": 66145 + }, + { + "epoch": 9.85254691689008, + "grad_norm": 0.01101562101393938, + "learning_rate": 2.9973944837686228e-05, + "loss": 0.1268, + "num_input_tokens_seen": 38373120, + "step": 66150 + }, + { + "epoch": 9.85329162943104, + "grad_norm": 0.2954087257385254, + "learning_rate": 2.9970760336466032e-05, + "loss": 0.0008, + "num_input_tokens_seen": 38375872, + "step": 66155 + }, + { + "epoch": 9.854036341971998, + "grad_norm": 50.592506408691406, + "learning_rate": 2.9967575751269878e-05, + "loss": 0.3093, + "num_input_tokens_seen": 38378624, + "step": 66160 + }, + { + "epoch": 9.854781054512959, + "grad_norm": 0.055999401956796646, + "learning_rate": 2.9964391082151587e-05, + "loss": 0.0155, + "num_input_tokens_seen": 38381216, + "step": 66165 + }, + { + "epoch": 9.855525767053917, + "grad_norm": 1.4938368797302246, + "learning_rate": 2.9961206329164952e-05, + "loss": 0.0288, + "num_input_tokens_seen": 38384000, + "step": 66170 + }, + { + "epoch": 9.856270479594876, + "grad_norm": 0.11886527389287949, + "learning_rate": 2.9958021492363787e-05, + "loss": 0.0002, + "num_input_tokens_seen": 38386560, + "step": 66175 + }, + { + "epoch": 9.857015192135835, + "grad_norm": 0.05753898620605469, + "learning_rate": 2.9954836571801875e-05, + "loss": 0.258, + "num_input_tokens_seen": 38389312, + "step": 66180 + }, + { + "epoch": 9.857759904676795, + "grad_norm": 0.013650299049913883, + "learning_rate": 2.9951651567533046e-05, + "loss": 0.278, + "num_input_tokens_seen": 38392064, + "step": 66185 + }, + { + "epoch": 9.858504617217754, + "grad_norm": 68.81978607177734, + "learning_rate": 2.994846647961109e-05, + "loss": 0.0891, + "num_input_tokens_seen": 38394976, + "step": 66190 + }, + { + "epoch": 9.859249329758713, + "grad_norm": 0.06199662387371063, + "learning_rate": 2.9945281308089824e-05, + "loss": 0.0193, + "num_input_tokens_seen": 38398208, + "step": 66195 + }, + { + "epoch": 9.859994042299672, + "grad_norm": 0.0877196341753006, + "learning_rate": 2.9942096053023055e-05, + "loss": 0.0986, + "num_input_tokens_seen": 38401344, + "step": 66200 + }, + { + "epoch": 9.860738754840632, + "grad_norm": 0.0313260518014431, + "learning_rate": 2.9938910714464596e-05, + "loss": 0.0344, + "num_input_tokens_seen": 38404320, + "step": 66205 + }, + { + "epoch": 9.861483467381591, + "grad_norm": 0.0004672183422371745, + "learning_rate": 2.9935725292468263e-05, + "loss": 0.0662, + "num_input_tokens_seen": 38407360, + "step": 66210 + }, + { + "epoch": 9.86222817992255, + "grad_norm": 0.006844791583716869, + "learning_rate": 2.9932539787087872e-05, + "loss": 0.2284, + "num_input_tokens_seen": 38410112, + "step": 66215 + }, + { + "epoch": 9.862972892463509, + "grad_norm": 0.013161416165530682, + "learning_rate": 2.9929354198377223e-05, + "loss": 0.0381, + "num_input_tokens_seen": 38413344, + "step": 66220 + }, + { + "epoch": 9.863717605004469, + "grad_norm": 6.147922992706299, + "learning_rate": 2.9926168526390157e-05, + "loss": 0.0754, + "num_input_tokens_seen": 38416448, + "step": 66225 + }, + { + "epoch": 9.864462317545428, + "grad_norm": 0.12137989699840546, + "learning_rate": 2.9922982771180475e-05, + "loss": 0.301, + "num_input_tokens_seen": 38419456, + "step": 66230 + }, + { + "epoch": 9.865207030086387, + "grad_norm": 14.025900840759277, + "learning_rate": 2.9919796932801996e-05, + "loss": 0.4533, + "num_input_tokens_seen": 38422432, + "step": 66235 + }, + { + "epoch": 9.865951742627345, + "grad_norm": 0.0066292197443544865, + "learning_rate": 2.9916611011308555e-05, + "loss": 0.1009, + "num_input_tokens_seen": 38425152, + "step": 66240 + }, + { + "epoch": 9.866696455168306, + "grad_norm": 0.039274200797080994, + "learning_rate": 2.9913425006753965e-05, + "loss": 0.0487, + "num_input_tokens_seen": 38428224, + "step": 66245 + }, + { + "epoch": 9.867441167709265, + "grad_norm": 16.504690170288086, + "learning_rate": 2.9910238919192058e-05, + "loss": 0.0068, + "num_input_tokens_seen": 38431008, + "step": 66250 + }, + { + "epoch": 9.868185880250223, + "grad_norm": 0.6652398109436035, + "learning_rate": 2.9907052748676656e-05, + "loss": 0.0012, + "num_input_tokens_seen": 38433696, + "step": 66255 + }, + { + "epoch": 9.868930592791182, + "grad_norm": 0.17758828401565552, + "learning_rate": 2.9903866495261578e-05, + "loss": 0.0019, + "num_input_tokens_seen": 38436736, + "step": 66260 + }, + { + "epoch": 9.86967530533214, + "grad_norm": 0.013034531846642494, + "learning_rate": 2.9900680159000666e-05, + "loss": 0.0002, + "num_input_tokens_seen": 38439456, + "step": 66265 + }, + { + "epoch": 9.870420017873101, + "grad_norm": 0.023895082995295525, + "learning_rate": 2.9897493739947736e-05, + "loss": 0.0005, + "num_input_tokens_seen": 38442272, + "step": 66270 + }, + { + "epoch": 9.87116473041406, + "grad_norm": 10.103289604187012, + "learning_rate": 2.9894307238156634e-05, + "loss": 0.1698, + "num_input_tokens_seen": 38444960, + "step": 66275 + }, + { + "epoch": 9.871909442955019, + "grad_norm": 0.7464224100112915, + "learning_rate": 2.989112065368118e-05, + "loss": 0.1628, + "num_input_tokens_seen": 38447872, + "step": 66280 + }, + { + "epoch": 9.87265415549598, + "grad_norm": 6.301950454711914, + "learning_rate": 2.9887933986575218e-05, + "loss": 0.1441, + "num_input_tokens_seen": 38450816, + "step": 66285 + }, + { + "epoch": 9.873398868036938, + "grad_norm": 0.01442731637507677, + "learning_rate": 2.9884747236892578e-05, + "loss": 0.2919, + "num_input_tokens_seen": 38453760, + "step": 66290 + }, + { + "epoch": 9.874143580577897, + "grad_norm": 0.009895028546452522, + "learning_rate": 2.9881560404687103e-05, + "loss": 0.0482, + "num_input_tokens_seen": 38456736, + "step": 66295 + }, + { + "epoch": 9.874888293118856, + "grad_norm": 0.008549042977392673, + "learning_rate": 2.9878373490012617e-05, + "loss": 0.0023, + "num_input_tokens_seen": 38459520, + "step": 66300 + }, + { + "epoch": 9.875633005659815, + "grad_norm": 38.80406951904297, + "learning_rate": 2.9875186492922973e-05, + "loss": 0.2665, + "num_input_tokens_seen": 38462560, + "step": 66305 + }, + { + "epoch": 9.876377718200775, + "grad_norm": 132.23052978515625, + "learning_rate": 2.9871999413472006e-05, + "loss": 0.2664, + "num_input_tokens_seen": 38465568, + "step": 66310 + }, + { + "epoch": 9.877122430741734, + "grad_norm": 0.05666951462626457, + "learning_rate": 2.9868812251713564e-05, + "loss": 0.0006, + "num_input_tokens_seen": 38468384, + "step": 66315 + }, + { + "epoch": 9.877867143282693, + "grad_norm": 2.6122055053710938, + "learning_rate": 2.9865625007701487e-05, + "loss": 0.0028, + "num_input_tokens_seen": 38471328, + "step": 66320 + }, + { + "epoch": 9.878611855823651, + "grad_norm": 0.35387465357780457, + "learning_rate": 2.986243768148962e-05, + "loss": 0.0873, + "num_input_tokens_seen": 38474304, + "step": 66325 + }, + { + "epoch": 9.879356568364612, + "grad_norm": 105.23280334472656, + "learning_rate": 2.9859250273131812e-05, + "loss": 0.1574, + "num_input_tokens_seen": 38477120, + "step": 66330 + }, + { + "epoch": 9.88010128090557, + "grad_norm": 0.0016898945905268192, + "learning_rate": 2.985606278268191e-05, + "loss": 0.0445, + "num_input_tokens_seen": 38480096, + "step": 66335 + }, + { + "epoch": 9.88084599344653, + "grad_norm": 0.0023904249537736177, + "learning_rate": 2.985287521019376e-05, + "loss": 0.2512, + "num_input_tokens_seen": 38482784, + "step": 66340 + }, + { + "epoch": 9.881590705987488, + "grad_norm": 0.0033127330243587494, + "learning_rate": 2.984968755572121e-05, + "loss": 0.164, + "num_input_tokens_seen": 38485856, + "step": 66345 + }, + { + "epoch": 9.882335418528449, + "grad_norm": 0.01589694246649742, + "learning_rate": 2.9846499819318124e-05, + "loss": 0.0008, + "num_input_tokens_seen": 38488512, + "step": 66350 + }, + { + "epoch": 9.883080131069407, + "grad_norm": 0.0719524472951889, + "learning_rate": 2.9843312001038353e-05, + "loss": 0.1451, + "num_input_tokens_seen": 38491296, + "step": 66355 + }, + { + "epoch": 9.883824843610366, + "grad_norm": 0.2672133147716522, + "learning_rate": 2.9840124100935744e-05, + "loss": 0.0484, + "num_input_tokens_seen": 38494304, + "step": 66360 + }, + { + "epoch": 9.884569556151325, + "grad_norm": 0.14454221725463867, + "learning_rate": 2.9836936119064156e-05, + "loss": 0.0007, + "num_input_tokens_seen": 38497312, + "step": 66365 + }, + { + "epoch": 9.885314268692285, + "grad_norm": 51.220130920410156, + "learning_rate": 2.983374805547745e-05, + "loss": 0.3507, + "num_input_tokens_seen": 38500352, + "step": 66370 + }, + { + "epoch": 9.886058981233244, + "grad_norm": 77.96726989746094, + "learning_rate": 2.983055991022949e-05, + "loss": 0.24, + "num_input_tokens_seen": 38503232, + "step": 66375 + }, + { + "epoch": 9.886803693774203, + "grad_norm": 0.004057848360389471, + "learning_rate": 2.9827371683374116e-05, + "loss": 0.1314, + "num_input_tokens_seen": 38505856, + "step": 66380 + }, + { + "epoch": 9.887548406315162, + "grad_norm": 0.06536320596933365, + "learning_rate": 2.9824183374965214e-05, + "loss": 0.0005, + "num_input_tokens_seen": 38508800, + "step": 66385 + }, + { + "epoch": 9.888293118856122, + "grad_norm": 39.42267990112305, + "learning_rate": 2.982099498505664e-05, + "loss": 0.1324, + "num_input_tokens_seen": 38511712, + "step": 66390 + }, + { + "epoch": 9.889037831397081, + "grad_norm": 0.07525299489498138, + "learning_rate": 2.9817806513702244e-05, + "loss": 0.3913, + "num_input_tokens_seen": 38514400, + "step": 66395 + }, + { + "epoch": 9.88978254393804, + "grad_norm": 0.03955337405204773, + "learning_rate": 2.9814617960955908e-05, + "loss": 0.1879, + "num_input_tokens_seen": 38517888, + "step": 66400 + }, + { + "epoch": 9.890527256478999, + "grad_norm": 0.11124340444803238, + "learning_rate": 2.9811429326871498e-05, + "loss": 0.0021, + "num_input_tokens_seen": 38520864, + "step": 66405 + }, + { + "epoch": 9.891271969019959, + "grad_norm": 0.0280583668500185, + "learning_rate": 2.9808240611502873e-05, + "loss": 0.2236, + "num_input_tokens_seen": 38524032, + "step": 66410 + }, + { + "epoch": 9.892016681560918, + "grad_norm": 1.616723656654358, + "learning_rate": 2.9805051814903923e-05, + "loss": 0.1548, + "num_input_tokens_seen": 38527008, + "step": 66415 + }, + { + "epoch": 9.892761394101877, + "grad_norm": 0.01972997933626175, + "learning_rate": 2.98018629371285e-05, + "loss": 0.0016, + "num_input_tokens_seen": 38529824, + "step": 66420 + }, + { + "epoch": 9.893506106642835, + "grad_norm": 0.12494397163391113, + "learning_rate": 2.979867397823048e-05, + "loss": 0.2767, + "num_input_tokens_seen": 38532928, + "step": 66425 + }, + { + "epoch": 9.894250819183796, + "grad_norm": 13.234627723693848, + "learning_rate": 2.979548493826374e-05, + "loss": 0.2831, + "num_input_tokens_seen": 38535520, + "step": 66430 + }, + { + "epoch": 9.894995531724755, + "grad_norm": 0.34639161825180054, + "learning_rate": 2.9792295817282157e-05, + "loss": 0.0014, + "num_input_tokens_seen": 38538848, + "step": 66435 + }, + { + "epoch": 9.895740244265713, + "grad_norm": 23.74599838256836, + "learning_rate": 2.9789106615339603e-05, + "loss": 0.0561, + "num_input_tokens_seen": 38541664, + "step": 66440 + }, + { + "epoch": 9.896484956806672, + "grad_norm": 68.83405303955078, + "learning_rate": 2.9785917332489965e-05, + "loss": 0.1876, + "num_input_tokens_seen": 38544416, + "step": 66445 + }, + { + "epoch": 9.897229669347631, + "grad_norm": 0.04590624198317528, + "learning_rate": 2.9782727968787116e-05, + "loss": 0.2996, + "num_input_tokens_seen": 38547360, + "step": 66450 + }, + { + "epoch": 9.897974381888591, + "grad_norm": 0.1720026731491089, + "learning_rate": 2.9779538524284943e-05, + "loss": 0.0026, + "num_input_tokens_seen": 38550464, + "step": 66455 + }, + { + "epoch": 9.89871909442955, + "grad_norm": 54.16681671142578, + "learning_rate": 2.9776348999037322e-05, + "loss": 0.1438, + "num_input_tokens_seen": 38553248, + "step": 66460 + }, + { + "epoch": 9.899463806970509, + "grad_norm": 16.930463790893555, + "learning_rate": 2.9773159393098137e-05, + "loss": 0.1088, + "num_input_tokens_seen": 38555872, + "step": 66465 + }, + { + "epoch": 9.900208519511468, + "grad_norm": 0.03962206840515137, + "learning_rate": 2.9769969706521277e-05, + "loss": 0.2001, + "num_input_tokens_seen": 38558752, + "step": 66470 + }, + { + "epoch": 9.900953232052428, + "grad_norm": 0.017001943662762642, + "learning_rate": 2.9766779939360623e-05, + "loss": 0.0072, + "num_input_tokens_seen": 38561664, + "step": 66475 + }, + { + "epoch": 9.901697944593387, + "grad_norm": 24.149690628051758, + "learning_rate": 2.976359009167007e-05, + "loss": 0.0699, + "num_input_tokens_seen": 38564736, + "step": 66480 + }, + { + "epoch": 9.902442657134346, + "grad_norm": 0.11773265898227692, + "learning_rate": 2.976040016350351e-05, + "loss": 0.1524, + "num_input_tokens_seen": 38567872, + "step": 66485 + }, + { + "epoch": 9.903187369675305, + "grad_norm": 0.11413335800170898, + "learning_rate": 2.9757210154914816e-05, + "loss": 0.1362, + "num_input_tokens_seen": 38570880, + "step": 66490 + }, + { + "epoch": 9.903932082216265, + "grad_norm": 2.2103652954101562, + "learning_rate": 2.9754020065957905e-05, + "loss": 0.136, + "num_input_tokens_seen": 38574016, + "step": 66495 + }, + { + "epoch": 9.904676794757224, + "grad_norm": 0.04701453447341919, + "learning_rate": 2.9750829896686645e-05, + "loss": 0.0019, + "num_input_tokens_seen": 38576896, + "step": 66500 + }, + { + "epoch": 9.905421507298183, + "grad_norm": 0.06801865994930267, + "learning_rate": 2.9747639647154947e-05, + "loss": 0.1084, + "num_input_tokens_seen": 38579616, + "step": 66505 + }, + { + "epoch": 9.906166219839141, + "grad_norm": 0.04787808656692505, + "learning_rate": 2.97444493174167e-05, + "loss": 0.0651, + "num_input_tokens_seen": 38582272, + "step": 66510 + }, + { + "epoch": 9.906910932380102, + "grad_norm": 16.987058639526367, + "learning_rate": 2.9741258907525805e-05, + "loss": 0.0053, + "num_input_tokens_seen": 38584960, + "step": 66515 + }, + { + "epoch": 9.90765564492106, + "grad_norm": 0.01789572276175022, + "learning_rate": 2.9738068417536165e-05, + "loss": 0.2005, + "num_input_tokens_seen": 38587808, + "step": 66520 + }, + { + "epoch": 9.90840035746202, + "grad_norm": 0.008404096588492393, + "learning_rate": 2.9734877847501664e-05, + "loss": 0.0071, + "num_input_tokens_seen": 38590592, + "step": 66525 + }, + { + "epoch": 9.909145070002978, + "grad_norm": 130.3314208984375, + "learning_rate": 2.973168719747622e-05, + "loss": 0.0717, + "num_input_tokens_seen": 38593568, + "step": 66530 + }, + { + "epoch": 9.909889782543939, + "grad_norm": 1.957658052444458, + "learning_rate": 2.9728496467513734e-05, + "loss": 0.3697, + "num_input_tokens_seen": 38596160, + "step": 66535 + }, + { + "epoch": 9.910634495084897, + "grad_norm": 0.0022880134638398886, + "learning_rate": 2.9725305657668102e-05, + "loss": 0.3804, + "num_input_tokens_seen": 38599296, + "step": 66540 + }, + { + "epoch": 9.911379207625856, + "grad_norm": 0.005545489024370909, + "learning_rate": 2.9722114767993226e-05, + "loss": 0.2078, + "num_input_tokens_seen": 38602176, + "step": 66545 + }, + { + "epoch": 9.912123920166815, + "grad_norm": 0.06027836352586746, + "learning_rate": 2.971892379854303e-05, + "loss": 0.0005, + "num_input_tokens_seen": 38604960, + "step": 66550 + }, + { + "epoch": 9.912868632707776, + "grad_norm": 0.004492653533816338, + "learning_rate": 2.9715732749371412e-05, + "loss": 0.0019, + "num_input_tokens_seen": 38608128, + "step": 66555 + }, + { + "epoch": 9.913613345248734, + "grad_norm": 0.009480217471718788, + "learning_rate": 2.971254162053228e-05, + "loss": 0.1819, + "num_input_tokens_seen": 38610976, + "step": 66560 + }, + { + "epoch": 9.914358057789693, + "grad_norm": 0.08811664581298828, + "learning_rate": 2.9709350412079544e-05, + "loss": 0.048, + "num_input_tokens_seen": 38613600, + "step": 66565 + }, + { + "epoch": 9.915102770330652, + "grad_norm": 0.022195952013134956, + "learning_rate": 2.9706159124067123e-05, + "loss": 0.0002, + "num_input_tokens_seen": 38616544, + "step": 66570 + }, + { + "epoch": 9.915847482871612, + "grad_norm": 0.027011748403310776, + "learning_rate": 2.9702967756548927e-05, + "loss": 0.0003, + "num_input_tokens_seen": 38619264, + "step": 66575 + }, + { + "epoch": 9.916592195412571, + "grad_norm": 15.980470657348633, + "learning_rate": 2.969977630957887e-05, + "loss": 0.2048, + "num_input_tokens_seen": 38622208, + "step": 66580 + }, + { + "epoch": 9.91733690795353, + "grad_norm": 0.02998989075422287, + "learning_rate": 2.9696584783210874e-05, + "loss": 0.1244, + "num_input_tokens_seen": 38625056, + "step": 66585 + }, + { + "epoch": 9.918081620494489, + "grad_norm": 35.990203857421875, + "learning_rate": 2.969339317749884e-05, + "loss": 0.2913, + "num_input_tokens_seen": 38627904, + "step": 66590 + }, + { + "epoch": 9.91882633303545, + "grad_norm": 10.09718132019043, + "learning_rate": 2.9690201492496704e-05, + "loss": 0.3534, + "num_input_tokens_seen": 38630880, + "step": 66595 + }, + { + "epoch": 9.919571045576408, + "grad_norm": 0.5827385187149048, + "learning_rate": 2.968700972825838e-05, + "loss": 0.0309, + "num_input_tokens_seen": 38633888, + "step": 66600 + }, + { + "epoch": 9.920315758117367, + "grad_norm": 48.37803649902344, + "learning_rate": 2.9683817884837788e-05, + "loss": 0.0136, + "num_input_tokens_seen": 38636608, + "step": 66605 + }, + { + "epoch": 9.921060470658325, + "grad_norm": 5.47122859954834, + "learning_rate": 2.9680625962288856e-05, + "loss": 0.1174, + "num_input_tokens_seen": 38639200, + "step": 66610 + }, + { + "epoch": 9.921805183199286, + "grad_norm": 0.01191296149045229, + "learning_rate": 2.9677433960665512e-05, + "loss": 0.1534, + "num_input_tokens_seen": 38642240, + "step": 66615 + }, + { + "epoch": 9.922549895740245, + "grad_norm": 4.44636344909668, + "learning_rate": 2.967424188002167e-05, + "loss": 0.1208, + "num_input_tokens_seen": 38645088, + "step": 66620 + }, + { + "epoch": 9.923294608281203, + "grad_norm": 93.4595718383789, + "learning_rate": 2.967104972041126e-05, + "loss": 0.1693, + "num_input_tokens_seen": 38647808, + "step": 66625 + }, + { + "epoch": 9.924039320822162, + "grad_norm": 0.058076854795217514, + "learning_rate": 2.9667857481888218e-05, + "loss": 0.0006, + "num_input_tokens_seen": 38650624, + "step": 66630 + }, + { + "epoch": 9.924784033363121, + "grad_norm": 0.018656278029084206, + "learning_rate": 2.9664665164506455e-05, + "loss": 0.0015, + "num_input_tokens_seen": 38653376, + "step": 66635 + }, + { + "epoch": 9.925528745904082, + "grad_norm": 71.43083953857422, + "learning_rate": 2.9661472768319924e-05, + "loss": 0.1262, + "num_input_tokens_seen": 38656160, + "step": 66640 + }, + { + "epoch": 9.92627345844504, + "grad_norm": 6.69856595993042, + "learning_rate": 2.9658280293382545e-05, + "loss": 0.0038, + "num_input_tokens_seen": 38658944, + "step": 66645 + }, + { + "epoch": 9.927018170985999, + "grad_norm": 89.24234771728516, + "learning_rate": 2.9655087739748267e-05, + "loss": 0.2289, + "num_input_tokens_seen": 38662176, + "step": 66650 + }, + { + "epoch": 9.927762883526958, + "grad_norm": 0.0007189256139099598, + "learning_rate": 2.9651895107471004e-05, + "loss": 0.1693, + "num_input_tokens_seen": 38664992, + "step": 66655 + }, + { + "epoch": 9.928507596067918, + "grad_norm": 0.0019177644280716777, + "learning_rate": 2.96487023966047e-05, + "loss": 0.0002, + "num_input_tokens_seen": 38667744, + "step": 66660 + }, + { + "epoch": 9.929252308608877, + "grad_norm": 0.030222220346331596, + "learning_rate": 2.9645509607203294e-05, + "loss": 0.1236, + "num_input_tokens_seen": 38670432, + "step": 66665 + }, + { + "epoch": 9.929997021149836, + "grad_norm": 122.86050415039062, + "learning_rate": 2.9642316739320724e-05, + "loss": 0.4327, + "num_input_tokens_seen": 38673440, + "step": 66670 + }, + { + "epoch": 9.930741733690795, + "grad_norm": 0.09245087951421738, + "learning_rate": 2.9639123793010933e-05, + "loss": 0.101, + "num_input_tokens_seen": 38676608, + "step": 66675 + }, + { + "epoch": 9.931486446231755, + "grad_norm": 0.015157106332480907, + "learning_rate": 2.9635930768327856e-05, + "loss": 0.0002, + "num_input_tokens_seen": 38679680, + "step": 66680 + }, + { + "epoch": 9.932231158772714, + "grad_norm": 1.2344837188720703, + "learning_rate": 2.963273766532545e-05, + "loss": 0.0324, + "num_input_tokens_seen": 38682624, + "step": 66685 + }, + { + "epoch": 9.932975871313673, + "grad_norm": 0.0006535357097163796, + "learning_rate": 2.962954448405764e-05, + "loss": 0.0571, + "num_input_tokens_seen": 38685248, + "step": 66690 + }, + { + "epoch": 9.933720583854631, + "grad_norm": 0.006023471709340811, + "learning_rate": 2.9626351224578386e-05, + "loss": 0.1778, + "num_input_tokens_seen": 38688256, + "step": 66695 + }, + { + "epoch": 9.934465296395592, + "grad_norm": 0.01884331926703453, + "learning_rate": 2.9623157886941633e-05, + "loss": 0.0986, + "num_input_tokens_seen": 38691072, + "step": 66700 + }, + { + "epoch": 9.93521000893655, + "grad_norm": 0.9108232855796814, + "learning_rate": 2.961996447120132e-05, + "loss": 0.0063, + "num_input_tokens_seen": 38694144, + "step": 66705 + }, + { + "epoch": 9.93595472147751, + "grad_norm": 25.203853607177734, + "learning_rate": 2.9616770977411408e-05, + "loss": 0.4361, + "num_input_tokens_seen": 38696768, + "step": 66710 + }, + { + "epoch": 9.936699434018468, + "grad_norm": 0.16219636797904968, + "learning_rate": 2.9613577405625838e-05, + "loss": 0.206, + "num_input_tokens_seen": 38699456, + "step": 66715 + }, + { + "epoch": 9.937444146559429, + "grad_norm": 0.013168676756322384, + "learning_rate": 2.961038375589857e-05, + "loss": 0.3601, + "num_input_tokens_seen": 38702528, + "step": 66720 + }, + { + "epoch": 9.938188859100388, + "grad_norm": 0.980644941329956, + "learning_rate": 2.9607190028283548e-05, + "loss": 0.147, + "num_input_tokens_seen": 38705856, + "step": 66725 + }, + { + "epoch": 9.938933571641346, + "grad_norm": 24.091819763183594, + "learning_rate": 2.960399622283474e-05, + "loss": 0.2085, + "num_input_tokens_seen": 38708768, + "step": 66730 + }, + { + "epoch": 9.939678284182305, + "grad_norm": 0.5524162650108337, + "learning_rate": 2.960080233960609e-05, + "loss": 0.002, + "num_input_tokens_seen": 38711616, + "step": 66735 + }, + { + "epoch": 9.940422996723266, + "grad_norm": 0.03504699841141701, + "learning_rate": 2.959760837865157e-05, + "loss": 0.0165, + "num_input_tokens_seen": 38714560, + "step": 66740 + }, + { + "epoch": 9.941167709264224, + "grad_norm": 0.12465918809175491, + "learning_rate": 2.9594414340025118e-05, + "loss": 0.3286, + "num_input_tokens_seen": 38717504, + "step": 66745 + }, + { + "epoch": 9.941912421805183, + "grad_norm": 0.06286726891994476, + "learning_rate": 2.9591220223780714e-05, + "loss": 0.1579, + "num_input_tokens_seen": 38720288, + "step": 66750 + }, + { + "epoch": 9.942657134346142, + "grad_norm": 0.2305222451686859, + "learning_rate": 2.9588026029972305e-05, + "loss": 0.0005, + "num_input_tokens_seen": 38723072, + "step": 66755 + }, + { + "epoch": 9.943401846887102, + "grad_norm": 0.0051771411672234535, + "learning_rate": 2.9584831758653865e-05, + "loss": 0.0006, + "num_input_tokens_seen": 38726144, + "step": 66760 + }, + { + "epoch": 9.944146559428061, + "grad_norm": 0.32953354716300964, + "learning_rate": 2.9581637409879344e-05, + "loss": 0.0007, + "num_input_tokens_seen": 38729184, + "step": 66765 + }, + { + "epoch": 9.94489127196902, + "grad_norm": 0.2625392973423004, + "learning_rate": 2.9578442983702716e-05, + "loss": 0.0009, + "num_input_tokens_seen": 38732096, + "step": 66770 + }, + { + "epoch": 9.945635984509979, + "grad_norm": 0.07154237478971481, + "learning_rate": 2.9575248480177952e-05, + "loss": 0.0385, + "num_input_tokens_seen": 38734752, + "step": 66775 + }, + { + "epoch": 9.946380697050937, + "grad_norm": 3.8574140071868896, + "learning_rate": 2.9572053899359013e-05, + "loss": 0.0285, + "num_input_tokens_seen": 38737664, + "step": 66780 + }, + { + "epoch": 9.947125409591898, + "grad_norm": 0.00586987379938364, + "learning_rate": 2.9568859241299878e-05, + "loss": 0.0408, + "num_input_tokens_seen": 38740640, + "step": 66785 + }, + { + "epoch": 9.947870122132857, + "grad_norm": 2.059811592102051, + "learning_rate": 2.9565664506054503e-05, + "loss": 0.0366, + "num_input_tokens_seen": 38743904, + "step": 66790 + }, + { + "epoch": 9.948614834673815, + "grad_norm": 0.09940548241138458, + "learning_rate": 2.9562469693676865e-05, + "loss": 0.1119, + "num_input_tokens_seen": 38747200, + "step": 66795 + }, + { + "epoch": 9.949359547214776, + "grad_norm": 0.5589636564254761, + "learning_rate": 2.9559274804220936e-05, + "loss": 0.0151, + "num_input_tokens_seen": 38749920, + "step": 66800 + }, + { + "epoch": 9.950104259755735, + "grad_norm": 29.84409523010254, + "learning_rate": 2.9556079837740697e-05, + "loss": 0.2298, + "num_input_tokens_seen": 38752736, + "step": 66805 + }, + { + "epoch": 9.950848972296694, + "grad_norm": 0.003112170845270157, + "learning_rate": 2.9552884794290116e-05, + "loss": 0.3419, + "num_input_tokens_seen": 38755584, + "step": 66810 + }, + { + "epoch": 9.951593684837652, + "grad_norm": 108.38252258300781, + "learning_rate": 2.954968967392318e-05, + "loss": 0.1294, + "num_input_tokens_seen": 38758368, + "step": 66815 + }, + { + "epoch": 9.952338397378611, + "grad_norm": 0.0089788269251585, + "learning_rate": 2.9546494476693865e-05, + "loss": 0.0058, + "num_input_tokens_seen": 38761184, + "step": 66820 + }, + { + "epoch": 9.953083109919572, + "grad_norm": 0.2222364842891693, + "learning_rate": 2.954329920265614e-05, + "loss": 0.2123, + "num_input_tokens_seen": 38763936, + "step": 66825 + }, + { + "epoch": 9.95382782246053, + "grad_norm": 0.014815742149949074, + "learning_rate": 2.9540103851863986e-05, + "loss": 0.2189, + "num_input_tokens_seen": 38766912, + "step": 66830 + }, + { + "epoch": 9.954572535001489, + "grad_norm": 16.094629287719727, + "learning_rate": 2.95369084243714e-05, + "loss": 0.1926, + "num_input_tokens_seen": 38769888, + "step": 66835 + }, + { + "epoch": 9.955317247542448, + "grad_norm": 11.573246955871582, + "learning_rate": 2.9533712920232353e-05, + "loss": 0.0617, + "num_input_tokens_seen": 38773152, + "step": 66840 + }, + { + "epoch": 9.956061960083408, + "grad_norm": 55.78761672973633, + "learning_rate": 2.9530517339500835e-05, + "loss": 0.0179, + "num_input_tokens_seen": 38776096, + "step": 66845 + }, + { + "epoch": 9.956806672624367, + "grad_norm": 38.3532600402832, + "learning_rate": 2.952732168223084e-05, + "loss": 0.5229, + "num_input_tokens_seen": 38778912, + "step": 66850 + }, + { + "epoch": 9.957551385165326, + "grad_norm": 0.09134192019701004, + "learning_rate": 2.952412594847634e-05, + "loss": 0.028, + "num_input_tokens_seen": 38781792, + "step": 66855 + }, + { + "epoch": 9.958296097706285, + "grad_norm": 71.34827423095703, + "learning_rate": 2.952093013829133e-05, + "loss": 0.1609, + "num_input_tokens_seen": 38784576, + "step": 66860 + }, + { + "epoch": 9.959040810247245, + "grad_norm": 4.66023588180542, + "learning_rate": 2.95177342517298e-05, + "loss": 0.1261, + "num_input_tokens_seen": 38787648, + "step": 66865 + }, + { + "epoch": 9.959785522788204, + "grad_norm": 0.011505301110446453, + "learning_rate": 2.951453828884574e-05, + "loss": 0.0163, + "num_input_tokens_seen": 38790272, + "step": 66870 + }, + { + "epoch": 9.960530235329163, + "grad_norm": 0.032349273562431335, + "learning_rate": 2.951134224969314e-05, + "loss": 0.0109, + "num_input_tokens_seen": 38793216, + "step": 66875 + }, + { + "epoch": 9.961274947870121, + "grad_norm": 16.019996643066406, + "learning_rate": 2.9508146134326004e-05, + "loss": 0.3806, + "num_input_tokens_seen": 38796096, + "step": 66880 + }, + { + "epoch": 9.962019660411082, + "grad_norm": 23.886035919189453, + "learning_rate": 2.950494994279832e-05, + "loss": 0.0184, + "num_input_tokens_seen": 38798880, + "step": 66885 + }, + { + "epoch": 9.96276437295204, + "grad_norm": 0.014575707726180553, + "learning_rate": 2.950175367516409e-05, + "loss": 0.0897, + "num_input_tokens_seen": 38801632, + "step": 66890 + }, + { + "epoch": 9.963509085493, + "grad_norm": 65.88636779785156, + "learning_rate": 2.94985573314773e-05, + "loss": 0.0767, + "num_input_tokens_seen": 38804512, + "step": 66895 + }, + { + "epoch": 9.964253798033958, + "grad_norm": 0.00317951338365674, + "learning_rate": 2.949536091179196e-05, + "loss": 0.1691, + "num_input_tokens_seen": 38807200, + "step": 66900 + }, + { + "epoch": 9.964998510574919, + "grad_norm": 0.028497738763689995, + "learning_rate": 2.9492164416162066e-05, + "loss": 0.4198, + "num_input_tokens_seen": 38810304, + "step": 66905 + }, + { + "epoch": 9.965743223115878, + "grad_norm": 0.05896450951695442, + "learning_rate": 2.9488967844641612e-05, + "loss": 0.2046, + "num_input_tokens_seen": 38813248, + "step": 66910 + }, + { + "epoch": 9.966487935656836, + "grad_norm": 71.05242919921875, + "learning_rate": 2.9485771197284625e-05, + "loss": 0.3829, + "num_input_tokens_seen": 38816288, + "step": 66915 + }, + { + "epoch": 9.967232648197795, + "grad_norm": 0.005343075376003981, + "learning_rate": 2.948257447414508e-05, + "loss": 0.0003, + "num_input_tokens_seen": 38819040, + "step": 66920 + }, + { + "epoch": 9.967977360738756, + "grad_norm": 0.029220253229141235, + "learning_rate": 2.9479377675276998e-05, + "loss": 0.0003, + "num_input_tokens_seen": 38822016, + "step": 66925 + }, + { + "epoch": 9.968722073279714, + "grad_norm": 33.80643081665039, + "learning_rate": 2.9476180800734376e-05, + "loss": 0.3089, + "num_input_tokens_seen": 38825056, + "step": 66930 + }, + { + "epoch": 9.969466785820673, + "grad_norm": 50.103309631347656, + "learning_rate": 2.9472983850571235e-05, + "loss": 0.0329, + "num_input_tokens_seen": 38828032, + "step": 66935 + }, + { + "epoch": 9.970211498361632, + "grad_norm": 0.12901385128498077, + "learning_rate": 2.9469786824841584e-05, + "loss": 0.0012, + "num_input_tokens_seen": 38831136, + "step": 66940 + }, + { + "epoch": 9.970956210902592, + "grad_norm": 14.216940879821777, + "learning_rate": 2.946658972359942e-05, + "loss": 0.0845, + "num_input_tokens_seen": 38833728, + "step": 66945 + }, + { + "epoch": 9.971700923443551, + "grad_norm": 0.004046539776027203, + "learning_rate": 2.946339254689877e-05, + "loss": 0.0181, + "num_input_tokens_seen": 38836960, + "step": 66950 + }, + { + "epoch": 9.97244563598451, + "grad_norm": 0.1923869401216507, + "learning_rate": 2.946019529479363e-05, + "loss": 0.0037, + "num_input_tokens_seen": 38840192, + "step": 66955 + }, + { + "epoch": 9.973190348525469, + "grad_norm": 0.004191662650555372, + "learning_rate": 2.945699796733803e-05, + "loss": 0.0006, + "num_input_tokens_seen": 38843296, + "step": 66960 + }, + { + "epoch": 9.973935061066427, + "grad_norm": 12.072685241699219, + "learning_rate": 2.945380056458597e-05, + "loss": 0.2963, + "num_input_tokens_seen": 38845952, + "step": 66965 + }, + { + "epoch": 9.974679773607388, + "grad_norm": 0.023443955928087234, + "learning_rate": 2.9450603086591484e-05, + "loss": 0.1621, + "num_input_tokens_seen": 38848896, + "step": 66970 + }, + { + "epoch": 9.975424486148347, + "grad_norm": 0.06634984165430069, + "learning_rate": 2.944740553340858e-05, + "loss": 0.0009, + "num_input_tokens_seen": 38851776, + "step": 66975 + }, + { + "epoch": 9.976169198689306, + "grad_norm": 0.025452135130763054, + "learning_rate": 2.944420790509128e-05, + "loss": 0.0589, + "num_input_tokens_seen": 38854560, + "step": 66980 + }, + { + "epoch": 9.976913911230266, + "grad_norm": 0.013020699843764305, + "learning_rate": 2.9441010201693614e-05, + "loss": 0.0405, + "num_input_tokens_seen": 38857312, + "step": 66985 + }, + { + "epoch": 9.977658623771225, + "grad_norm": 137.72569274902344, + "learning_rate": 2.9437812423269585e-05, + "loss": 0.4342, + "num_input_tokens_seen": 38860064, + "step": 66990 + }, + { + "epoch": 9.978403336312184, + "grad_norm": 0.04077444598078728, + "learning_rate": 2.943461456987322e-05, + "loss": 0.2795, + "num_input_tokens_seen": 38862848, + "step": 66995 + }, + { + "epoch": 9.979148048853142, + "grad_norm": 23.71466636657715, + "learning_rate": 2.9431416641558558e-05, + "loss": 0.2882, + "num_input_tokens_seen": 38865824, + "step": 67000 + }, + { + "epoch": 9.979892761394101, + "grad_norm": 0.04411038011312485, + "learning_rate": 2.9428218638379608e-05, + "loss": 0.0018, + "num_input_tokens_seen": 38868832, + "step": 67005 + }, + { + "epoch": 9.980637473935062, + "grad_norm": 125.62255859375, + "learning_rate": 2.942502056039041e-05, + "loss": 0.2096, + "num_input_tokens_seen": 38871488, + "step": 67010 + }, + { + "epoch": 9.98138218647602, + "grad_norm": 0.29357850551605225, + "learning_rate": 2.9421822407644987e-05, + "loss": 0.1629, + "num_input_tokens_seen": 38874624, + "step": 67015 + }, + { + "epoch": 9.98212689901698, + "grad_norm": 0.12992776930332184, + "learning_rate": 2.9418624180197375e-05, + "loss": 0.0196, + "num_input_tokens_seen": 38877536, + "step": 67020 + }, + { + "epoch": 9.982871611557938, + "grad_norm": 0.028467334806919098, + "learning_rate": 2.941542587810159e-05, + "loss": 0.0004, + "num_input_tokens_seen": 38880384, + "step": 67025 + }, + { + "epoch": 9.983616324098898, + "grad_norm": 3.2055351734161377, + "learning_rate": 2.941222750141167e-05, + "loss": 0.0147, + "num_input_tokens_seen": 38883168, + "step": 67030 + }, + { + "epoch": 9.984361036639857, + "grad_norm": 0.006386511493474245, + "learning_rate": 2.9409029050181652e-05, + "loss": 0.0735, + "num_input_tokens_seen": 38885920, + "step": 67035 + }, + { + "epoch": 9.985105749180816, + "grad_norm": 0.4914112389087677, + "learning_rate": 2.9405830524465573e-05, + "loss": 0.1574, + "num_input_tokens_seen": 38888640, + "step": 67040 + }, + { + "epoch": 9.985850461721775, + "grad_norm": 165.48098754882812, + "learning_rate": 2.940263192431746e-05, + "loss": 0.1975, + "num_input_tokens_seen": 38891424, + "step": 67045 + }, + { + "epoch": 9.986595174262735, + "grad_norm": 0.08594619482755661, + "learning_rate": 2.9399433249791363e-05, + "loss": 0.0396, + "num_input_tokens_seen": 38894336, + "step": 67050 + }, + { + "epoch": 9.987339886803694, + "grad_norm": 0.03245466947555542, + "learning_rate": 2.9396234500941307e-05, + "loss": 0.0005, + "num_input_tokens_seen": 38897280, + "step": 67055 + }, + { + "epoch": 9.988084599344653, + "grad_norm": 0.025977322831749916, + "learning_rate": 2.939303567782134e-05, + "loss": 0.1942, + "num_input_tokens_seen": 38900320, + "step": 67060 + }, + { + "epoch": 9.988829311885612, + "grad_norm": 14.439376831054688, + "learning_rate": 2.9389836780485502e-05, + "loss": 0.2245, + "num_input_tokens_seen": 38903296, + "step": 67065 + }, + { + "epoch": 9.989574024426572, + "grad_norm": 0.1996297687292099, + "learning_rate": 2.9386637808987828e-05, + "loss": 0.0006, + "num_input_tokens_seen": 38906368, + "step": 67070 + }, + { + "epoch": 9.99031873696753, + "grad_norm": 0.08226090669631958, + "learning_rate": 2.9383438763382363e-05, + "loss": 0.0843, + "num_input_tokens_seen": 38909536, + "step": 67075 + }, + { + "epoch": 9.99106344950849, + "grad_norm": 0.1409207284450531, + "learning_rate": 2.9380239643723167e-05, + "loss": 0.1539, + "num_input_tokens_seen": 38912512, + "step": 67080 + }, + { + "epoch": 9.991808162049448, + "grad_norm": 5.9319748878479, + "learning_rate": 2.9377040450064268e-05, + "loss": 0.1499, + "num_input_tokens_seen": 38915520, + "step": 67085 + }, + { + "epoch": 9.992552874590409, + "grad_norm": 0.04411681368947029, + "learning_rate": 2.9373841182459715e-05, + "loss": 0.2121, + "num_input_tokens_seen": 38918400, + "step": 67090 + }, + { + "epoch": 9.993297587131368, + "grad_norm": 0.4670282006263733, + "learning_rate": 2.9370641840963565e-05, + "loss": 0.2233, + "num_input_tokens_seen": 38921472, + "step": 67095 + }, + { + "epoch": 9.994042299672326, + "grad_norm": 0.0686601996421814, + "learning_rate": 2.9367442425629866e-05, + "loss": 0.0012, + "num_input_tokens_seen": 38924352, + "step": 67100 + }, + { + "epoch": 9.994787012213285, + "grad_norm": 0.39254826307296753, + "learning_rate": 2.9364242936512665e-05, + "loss": 0.0804, + "num_input_tokens_seen": 38926976, + "step": 67105 + }, + { + "epoch": 9.995531724754246, + "grad_norm": 0.0149106215685606, + "learning_rate": 2.936104337366601e-05, + "loss": 0.2768, + "num_input_tokens_seen": 38929952, + "step": 67110 + }, + { + "epoch": 9.996276437295204, + "grad_norm": 0.04893730580806732, + "learning_rate": 2.935784373714397e-05, + "loss": 0.0011, + "num_input_tokens_seen": 38932544, + "step": 67115 + }, + { + "epoch": 9.997021149836163, + "grad_norm": 0.13021057844161987, + "learning_rate": 2.9354644027000577e-05, + "loss": 0.1704, + "num_input_tokens_seen": 38935840, + "step": 67120 + }, + { + "epoch": 9.997765862377122, + "grad_norm": 1.193980097770691, + "learning_rate": 2.9351444243289904e-05, + "loss": 0.0027, + "num_input_tokens_seen": 38938944, + "step": 67125 + }, + { + "epoch": 9.998510574918082, + "grad_norm": 0.015949483960866928, + "learning_rate": 2.9348244386066005e-05, + "loss": 0.105, + "num_input_tokens_seen": 38941696, + "step": 67130 + }, + { + "epoch": 9.999255287459041, + "grad_norm": 0.039156828075647354, + "learning_rate": 2.9345044455382932e-05, + "loss": 0.0806, + "num_input_tokens_seen": 38944672, + "step": 67135 + }, + { + "epoch": 10.0, + "grad_norm": 50.62285232543945, + "learning_rate": 2.9341844451294754e-05, + "loss": 0.4251, + "num_input_tokens_seen": 38947216, + "step": 67140 + }, + { + "epoch": 10.0, + "eval_loss": 1.6454750299453735, + "eval_runtime": 51.3337, + "eval_samples_per_second": 58.129, + "eval_steps_per_second": 14.532, + "num_input_tokens_seen": 38947216, + "step": 67140 + }, + { + "epoch": 10.000744712540959, + "grad_norm": 0.34038639068603516, + "learning_rate": 2.9338644373855522e-05, + "loss": 0.045, + "num_input_tokens_seen": 38950352, + "step": 67145 + }, + { + "epoch": 10.001489425081918, + "grad_norm": 1.5006643533706665, + "learning_rate": 2.9335444223119314e-05, + "loss": 0.0929, + "num_input_tokens_seen": 38953328, + "step": 67150 + }, + { + "epoch": 10.002234137622878, + "grad_norm": 0.03509865701198578, + "learning_rate": 2.9332243999140167e-05, + "loss": 0.1756, + "num_input_tokens_seen": 38956208, + "step": 67155 + }, + { + "epoch": 10.002978850163837, + "grad_norm": 0.05002956837415695, + "learning_rate": 2.932904370197217e-05, + "loss": 0.0005, + "num_input_tokens_seen": 38959120, + "step": 67160 + }, + { + "epoch": 10.003723562704796, + "grad_norm": 0.0286780446767807, + "learning_rate": 2.932584333166937e-05, + "loss": 0.0004, + "num_input_tokens_seen": 38961872, + "step": 67165 + }, + { + "epoch": 10.004468275245754, + "grad_norm": 0.008020345121622086, + "learning_rate": 2.9322642888285855e-05, + "loss": 0.0019, + "num_input_tokens_seen": 38964816, + "step": 67170 + }, + { + "epoch": 10.005212987786715, + "grad_norm": 0.06171717122197151, + "learning_rate": 2.931944237187567e-05, + "loss": 0.0158, + "num_input_tokens_seen": 38967632, + "step": 67175 + }, + { + "epoch": 10.005957700327674, + "grad_norm": 0.028178181499242783, + "learning_rate": 2.931624178249291e-05, + "loss": 0.0005, + "num_input_tokens_seen": 38970640, + "step": 67180 + }, + { + "epoch": 10.006702412868632, + "grad_norm": 0.016203759238123894, + "learning_rate": 2.931304112019163e-05, + "loss": 0.192, + "num_input_tokens_seen": 38973584, + "step": 67185 + }, + { + "epoch": 10.007447125409591, + "grad_norm": 0.0017575457459315658, + "learning_rate": 2.93098403850259e-05, + "loss": 0.0014, + "num_input_tokens_seen": 38976400, + "step": 67190 + }, + { + "epoch": 10.008191837950552, + "grad_norm": 0.0024083079770207405, + "learning_rate": 2.9306639577049793e-05, + "loss": 0.056, + "num_input_tokens_seen": 38979312, + "step": 67195 + }, + { + "epoch": 10.00893655049151, + "grad_norm": 0.11237676441669464, + "learning_rate": 2.9303438696317385e-05, + "loss": 0.2534, + "num_input_tokens_seen": 38982160, + "step": 67200 + }, + { + "epoch": 10.00968126303247, + "grad_norm": 0.16868741810321808, + "learning_rate": 2.9300237742882764e-05, + "loss": 0.001, + "num_input_tokens_seen": 38985040, + "step": 67205 + }, + { + "epoch": 10.010425975573428, + "grad_norm": 0.9530951976776123, + "learning_rate": 2.929703671679999e-05, + "loss": 0.0046, + "num_input_tokens_seen": 38987664, + "step": 67210 + }, + { + "epoch": 10.011170688114388, + "grad_norm": 0.006858530454337597, + "learning_rate": 2.9293835618123157e-05, + "loss": 0.0004, + "num_input_tokens_seen": 38990544, + "step": 67215 + }, + { + "epoch": 10.011915400655347, + "grad_norm": 0.04938291385769844, + "learning_rate": 2.929063444690633e-05, + "loss": 0.0003, + "num_input_tokens_seen": 38993520, + "step": 67220 + }, + { + "epoch": 10.012660113196306, + "grad_norm": 0.0057866936549544334, + "learning_rate": 2.9287433203203598e-05, + "loss": 0.0005, + "num_input_tokens_seen": 38996432, + "step": 67225 + }, + { + "epoch": 10.013404825737265, + "grad_norm": 0.02248293161392212, + "learning_rate": 2.928423188706903e-05, + "loss": 0.0003, + "num_input_tokens_seen": 38999248, + "step": 67230 + }, + { + "epoch": 10.014149538278225, + "grad_norm": 0.0010067522525787354, + "learning_rate": 2.9281030498556723e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39002128, + "step": 67235 + }, + { + "epoch": 10.014894250819184, + "grad_norm": 0.016338879242539406, + "learning_rate": 2.9277829037720754e-05, + "loss": 0.2707, + "num_input_tokens_seen": 39005072, + "step": 67240 + }, + { + "epoch": 10.015638963360143, + "grad_norm": 0.016340874135494232, + "learning_rate": 2.927462750461522e-05, + "loss": 0.1577, + "num_input_tokens_seen": 39008112, + "step": 67245 + }, + { + "epoch": 10.016383675901102, + "grad_norm": 0.011390544474124908, + "learning_rate": 2.9271425899294193e-05, + "loss": 0.0747, + "num_input_tokens_seen": 39010832, + "step": 67250 + }, + { + "epoch": 10.017128388442062, + "grad_norm": 0.001003052107989788, + "learning_rate": 2.9268224221811763e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39013424, + "step": 67255 + }, + { + "epoch": 10.01787310098302, + "grad_norm": 0.00046090170508250594, + "learning_rate": 2.9265022472222032e-05, + "loss": 0.0175, + "num_input_tokens_seen": 39016304, + "step": 67260 + }, + { + "epoch": 10.01861781352398, + "grad_norm": 0.002578821498900652, + "learning_rate": 2.9261820650579074e-05, + "loss": 0.0277, + "num_input_tokens_seen": 39019280, + "step": 67265 + }, + { + "epoch": 10.019362526064938, + "grad_norm": 46.53000259399414, + "learning_rate": 2.925861875693699e-05, + "loss": 0.1693, + "num_input_tokens_seen": 39022160, + "step": 67270 + }, + { + "epoch": 10.020107238605899, + "grad_norm": 0.015855375677347183, + "learning_rate": 2.9255416791349867e-05, + "loss": 0.0944, + "num_input_tokens_seen": 39024752, + "step": 67275 + }, + { + "epoch": 10.020851951146858, + "grad_norm": 0.0018714715261012316, + "learning_rate": 2.925221475387181e-05, + "loss": 0.0862, + "num_input_tokens_seen": 39027760, + "step": 67280 + }, + { + "epoch": 10.021596663687816, + "grad_norm": 0.4656202495098114, + "learning_rate": 2.92490126445569e-05, + "loss": 0.0056, + "num_input_tokens_seen": 39030768, + "step": 67285 + }, + { + "epoch": 10.022341376228775, + "grad_norm": 0.021411065012216568, + "learning_rate": 2.9245810463459245e-05, + "loss": 0.0017, + "num_input_tokens_seen": 39033488, + "step": 67290 + }, + { + "epoch": 10.023086088769736, + "grad_norm": 0.29126712679862976, + "learning_rate": 2.9242608210632932e-05, + "loss": 0.0569, + "num_input_tokens_seen": 39036656, + "step": 67295 + }, + { + "epoch": 10.023830801310694, + "grad_norm": 0.022486526519060135, + "learning_rate": 2.9239405886132066e-05, + "loss": 0.0118, + "num_input_tokens_seen": 39039568, + "step": 67300 + }, + { + "epoch": 10.024575513851653, + "grad_norm": 0.1924462914466858, + "learning_rate": 2.923620349001075e-05, + "loss": 0.0005, + "num_input_tokens_seen": 39042640, + "step": 67305 + }, + { + "epoch": 10.025320226392612, + "grad_norm": 0.12377627193927765, + "learning_rate": 2.923300102232308e-05, + "loss": 0.0005, + "num_input_tokens_seen": 39045744, + "step": 67310 + }, + { + "epoch": 10.02606493893357, + "grad_norm": 9.162145614624023, + "learning_rate": 2.9229798483123162e-05, + "loss": 0.0391, + "num_input_tokens_seen": 39048528, + "step": 67315 + }, + { + "epoch": 10.026809651474531, + "grad_norm": 8.202503522625193e-05, + "learning_rate": 2.9226595872465097e-05, + "loss": 0.0113, + "num_input_tokens_seen": 39051280, + "step": 67320 + }, + { + "epoch": 10.02755436401549, + "grad_norm": 0.00777010153979063, + "learning_rate": 2.922339319040298e-05, + "loss": 0.104, + "num_input_tokens_seen": 39054224, + "step": 67325 + }, + { + "epoch": 10.028299076556449, + "grad_norm": 0.0022049162071198225, + "learning_rate": 2.922019043699094e-05, + "loss": 0.1318, + "num_input_tokens_seen": 39057072, + "step": 67330 + }, + { + "epoch": 10.029043789097408, + "grad_norm": 0.032176513224840164, + "learning_rate": 2.9216987612283064e-05, + "loss": 0.0209, + "num_input_tokens_seen": 39060080, + "step": 67335 + }, + { + "epoch": 10.029788501638368, + "grad_norm": 0.002600659616291523, + "learning_rate": 2.921378471633347e-05, + "loss": 0.0, + "num_input_tokens_seen": 39063152, + "step": 67340 + }, + { + "epoch": 10.030533214179327, + "grad_norm": 0.005449626594781876, + "learning_rate": 2.9210581749196274e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39066064, + "step": 67345 + }, + { + "epoch": 10.031277926720286, + "grad_norm": 0.0010671367635950446, + "learning_rate": 2.9207378710925575e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39069008, + "step": 67350 + }, + { + "epoch": 10.032022639261244, + "grad_norm": 0.00014976724924053997, + "learning_rate": 2.920417560157549e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39071824, + "step": 67355 + }, + { + "epoch": 10.032767351802205, + "grad_norm": 0.2861883342266083, + "learning_rate": 2.9200972421200124e-05, + "loss": 0.1506, + "num_input_tokens_seen": 39074672, + "step": 67360 + }, + { + "epoch": 10.033512064343164, + "grad_norm": 19.995988845825195, + "learning_rate": 2.91977691698536e-05, + "loss": 0.1992, + "num_input_tokens_seen": 39077552, + "step": 67365 + }, + { + "epoch": 10.034256776884122, + "grad_norm": 8.821776390075684, + "learning_rate": 2.919456584759003e-05, + "loss": 0.0033, + "num_input_tokens_seen": 39080464, + "step": 67370 + }, + { + "epoch": 10.035001489425081, + "grad_norm": 0.009684406220912933, + "learning_rate": 2.919136245446354e-05, + "loss": 0.116, + "num_input_tokens_seen": 39083472, + "step": 67375 + }, + { + "epoch": 10.035746201966042, + "grad_norm": 0.007095653098076582, + "learning_rate": 2.918815899052824e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39086224, + "step": 67380 + }, + { + "epoch": 10.036490914507, + "grad_norm": 0.29580268263816833, + "learning_rate": 2.9184955455838258e-05, + "loss": 0.1474, + "num_input_tokens_seen": 39089296, + "step": 67385 + }, + { + "epoch": 10.03723562704796, + "grad_norm": 0.003980499692261219, + "learning_rate": 2.9181751850447698e-05, + "loss": 0.0008, + "num_input_tokens_seen": 39092432, + "step": 67390 + }, + { + "epoch": 10.037980339588918, + "grad_norm": 0.005458444356918335, + "learning_rate": 2.9178548174410687e-05, + "loss": 0.0057, + "num_input_tokens_seen": 39095152, + "step": 67395 + }, + { + "epoch": 10.038725052129879, + "grad_norm": 0.00906511303037405, + "learning_rate": 2.9175344427781354e-05, + "loss": 0.0132, + "num_input_tokens_seen": 39098032, + "step": 67400 + }, + { + "epoch": 10.039469764670837, + "grad_norm": 0.012121833860874176, + "learning_rate": 2.9172140610613825e-05, + "loss": 0.009, + "num_input_tokens_seen": 39101104, + "step": 67405 + }, + { + "epoch": 10.040214477211796, + "grad_norm": 0.005754635203629732, + "learning_rate": 2.916893672296222e-05, + "loss": 0.3225, + "num_input_tokens_seen": 39104016, + "step": 67410 + }, + { + "epoch": 10.040959189752755, + "grad_norm": 0.0012341117253527045, + "learning_rate": 2.916573276488066e-05, + "loss": 0.0283, + "num_input_tokens_seen": 39106864, + "step": 67415 + }, + { + "epoch": 10.041703902293715, + "grad_norm": 0.005300653167068958, + "learning_rate": 2.9162528736423283e-05, + "loss": 0.01, + "num_input_tokens_seen": 39109776, + "step": 67420 + }, + { + "epoch": 10.042448614834674, + "grad_norm": 0.007320486940443516, + "learning_rate": 2.915932463764422e-05, + "loss": 0.1346, + "num_input_tokens_seen": 39112912, + "step": 67425 + }, + { + "epoch": 10.043193327375633, + "grad_norm": 0.0027702192310243845, + "learning_rate": 2.9156120468597588e-05, + "loss": 0.0021, + "num_input_tokens_seen": 39115792, + "step": 67430 + }, + { + "epoch": 10.043938039916592, + "grad_norm": 0.0027434085495769978, + "learning_rate": 2.9152916229337525e-05, + "loss": 0.0234, + "num_input_tokens_seen": 39118608, + "step": 67435 + }, + { + "epoch": 10.044682752457552, + "grad_norm": 0.001759056467562914, + "learning_rate": 2.9149711919918154e-05, + "loss": 0.2024, + "num_input_tokens_seen": 39121488, + "step": 67440 + }, + { + "epoch": 10.045427464998511, + "grad_norm": 2.343986988067627, + "learning_rate": 2.9146507540393636e-05, + "loss": 0.0019, + "num_input_tokens_seen": 39124560, + "step": 67445 + }, + { + "epoch": 10.04617217753947, + "grad_norm": 0.003495692741125822, + "learning_rate": 2.9143303090818074e-05, + "loss": 0.0793, + "num_input_tokens_seen": 39127504, + "step": 67450 + }, + { + "epoch": 10.046916890080428, + "grad_norm": 0.004995174240320921, + "learning_rate": 2.9140098571245623e-05, + "loss": 0.0688, + "num_input_tokens_seen": 39130320, + "step": 67455 + }, + { + "epoch": 10.047661602621389, + "grad_norm": 0.023519884794950485, + "learning_rate": 2.9136893981730406e-05, + "loss": 0.1457, + "num_input_tokens_seen": 39133072, + "step": 67460 + }, + { + "epoch": 10.048406315162348, + "grad_norm": 23.707338333129883, + "learning_rate": 2.9133689322326586e-05, + "loss": 0.0656, + "num_input_tokens_seen": 39136016, + "step": 67465 + }, + { + "epoch": 10.049151027703306, + "grad_norm": 0.0013043016660958529, + "learning_rate": 2.9130484593088276e-05, + "loss": 0.1133, + "num_input_tokens_seen": 39138992, + "step": 67470 + }, + { + "epoch": 10.049895740244265, + "grad_norm": 0.056107860058546066, + "learning_rate": 2.9127279794069624e-05, + "loss": 0.0569, + "num_input_tokens_seen": 39141840, + "step": 67475 + }, + { + "epoch": 10.050640452785226, + "grad_norm": 74.31458282470703, + "learning_rate": 2.9124074925324785e-05, + "loss": 0.1257, + "num_input_tokens_seen": 39144624, + "step": 67480 + }, + { + "epoch": 10.051385165326185, + "grad_norm": 0.052156001329422, + "learning_rate": 2.9120869986907885e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39147440, + "step": 67485 + }, + { + "epoch": 10.052129877867143, + "grad_norm": 3.2178595066070557, + "learning_rate": 2.9117664978873072e-05, + "loss": 0.048, + "num_input_tokens_seen": 39150192, + "step": 67490 + }, + { + "epoch": 10.052874590408102, + "grad_norm": 0.0015022722072899342, + "learning_rate": 2.9114459901274493e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39153136, + "step": 67495 + }, + { + "epoch": 10.05361930294906, + "grad_norm": 0.000510222336743027, + "learning_rate": 2.91112547541663e-05, + "loss": 0.0007, + "num_input_tokens_seen": 39156176, + "step": 67500 + }, + { + "epoch": 10.054364015490021, + "grad_norm": 0.009886329993605614, + "learning_rate": 2.9108049537602637e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39159504, + "step": 67505 + }, + { + "epoch": 10.05510872803098, + "grad_norm": 0.001378641347400844, + "learning_rate": 2.9104844251637652e-05, + "loss": 0.1192, + "num_input_tokens_seen": 39162480, + "step": 67510 + }, + { + "epoch": 10.055853440571939, + "grad_norm": 0.05765018239617348, + "learning_rate": 2.91016388963255e-05, + "loss": 0.0005, + "num_input_tokens_seen": 39165232, + "step": 67515 + }, + { + "epoch": 10.056598153112898, + "grad_norm": 2.777271032333374, + "learning_rate": 2.9098433471720322e-05, + "loss": 0.0018, + "num_input_tokens_seen": 39168176, + "step": 67520 + }, + { + "epoch": 10.057342865653858, + "grad_norm": 54.248355865478516, + "learning_rate": 2.909522797787627e-05, + "loss": 0.0576, + "num_input_tokens_seen": 39171024, + "step": 67525 + }, + { + "epoch": 10.058087578194817, + "grad_norm": 18.083776473999023, + "learning_rate": 2.9092022414847514e-05, + "loss": 0.0053, + "num_input_tokens_seen": 39174192, + "step": 67530 + }, + { + "epoch": 10.058832290735776, + "grad_norm": 0.003549067070707679, + "learning_rate": 2.908881678268819e-05, + "loss": 0.0006, + "num_input_tokens_seen": 39177328, + "step": 67535 + }, + { + "epoch": 10.059577003276734, + "grad_norm": 0.025952763855457306, + "learning_rate": 2.908561108145247e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39180016, + "step": 67540 + }, + { + "epoch": 10.060321715817695, + "grad_norm": 0.003358852118253708, + "learning_rate": 2.90824053111945e-05, + "loss": 0.0084, + "num_input_tokens_seen": 39183056, + "step": 67545 + }, + { + "epoch": 10.061066428358654, + "grad_norm": 0.013839403167366982, + "learning_rate": 2.9079199471968444e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39185968, + "step": 67550 + }, + { + "epoch": 10.061811140899612, + "grad_norm": 0.005747631192207336, + "learning_rate": 2.9075993563828452e-05, + "loss": 0.026, + "num_input_tokens_seen": 39189168, + "step": 67555 + }, + { + "epoch": 10.062555853440571, + "grad_norm": 0.004511350300163031, + "learning_rate": 2.9072787586828697e-05, + "loss": 0.3395, + "num_input_tokens_seen": 39191984, + "step": 67560 + }, + { + "epoch": 10.063300565981532, + "grad_norm": 0.0017831375589594245, + "learning_rate": 2.9069581541023333e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39194928, + "step": 67565 + }, + { + "epoch": 10.06404527852249, + "grad_norm": 2.4269449710845947, + "learning_rate": 2.9066375426466518e-05, + "loss": 0.0009, + "num_input_tokens_seen": 39197808, + "step": 67570 + }, + { + "epoch": 10.06478999106345, + "grad_norm": 18.257980346679688, + "learning_rate": 2.906316924321244e-05, + "loss": 0.0595, + "num_input_tokens_seen": 39200752, + "step": 67575 + }, + { + "epoch": 10.065534703604408, + "grad_norm": 24.13970375061035, + "learning_rate": 2.9059962991315237e-05, + "loss": 0.3487, + "num_input_tokens_seen": 39203504, + "step": 67580 + }, + { + "epoch": 10.066279416145369, + "grad_norm": 0.002244776114821434, + "learning_rate": 2.9056756670829087e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39206192, + "step": 67585 + }, + { + "epoch": 10.067024128686327, + "grad_norm": 9.02820873260498, + "learning_rate": 2.9053550281808155e-05, + "loss": 0.0414, + "num_input_tokens_seen": 39209008, + "step": 67590 + }, + { + "epoch": 10.067768841227286, + "grad_norm": 49.592796325683594, + "learning_rate": 2.905034382430661e-05, + "loss": 0.1664, + "num_input_tokens_seen": 39212208, + "step": 67595 + }, + { + "epoch": 10.068513553768245, + "grad_norm": 0.0010483756195753813, + "learning_rate": 2.9047137298378624e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39214992, + "step": 67600 + }, + { + "epoch": 10.069258266309205, + "grad_norm": 0.002281525172293186, + "learning_rate": 2.9043930704078364e-05, + "loss": 0.0819, + "num_input_tokens_seen": 39217808, + "step": 67605 + }, + { + "epoch": 10.070002978850164, + "grad_norm": 0.6334359645843506, + "learning_rate": 2.904072404146001e-05, + "loss": 0.1724, + "num_input_tokens_seen": 39220560, + "step": 67610 + }, + { + "epoch": 10.070747691391123, + "grad_norm": 0.029145220294594765, + "learning_rate": 2.9037517310577726e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39223664, + "step": 67615 + }, + { + "epoch": 10.071492403932082, + "grad_norm": 0.024046640843153, + "learning_rate": 2.9034310511485692e-05, + "loss": 0.1705, + "num_input_tokens_seen": 39226736, + "step": 67620 + }, + { + "epoch": 10.072237116473042, + "grad_norm": 0.20839522778987885, + "learning_rate": 2.903110364423809e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39229776, + "step": 67625 + }, + { + "epoch": 10.072981829014001, + "grad_norm": 69.46554565429688, + "learning_rate": 2.9027896708889073e-05, + "loss": 0.0316, + "num_input_tokens_seen": 39232656, + "step": 67630 + }, + { + "epoch": 10.07372654155496, + "grad_norm": 0.00519876042380929, + "learning_rate": 2.9024689705492847e-05, + "loss": 0.0009, + "num_input_tokens_seen": 39235792, + "step": 67635 + }, + { + "epoch": 10.074471254095918, + "grad_norm": 0.023642368614673615, + "learning_rate": 2.902148263410357e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39238800, + "step": 67640 + }, + { + "epoch": 10.075215966636879, + "grad_norm": 0.008475733920931816, + "learning_rate": 2.9018275494775442e-05, + "loss": 0.0007, + "num_input_tokens_seen": 39241840, + "step": 67645 + }, + { + "epoch": 10.075960679177838, + "grad_norm": 0.0009649714920669794, + "learning_rate": 2.9015068287562626e-05, + "loss": 0.0143, + "num_input_tokens_seen": 39244752, + "step": 67650 + }, + { + "epoch": 10.076705391718797, + "grad_norm": 0.018294373527169228, + "learning_rate": 2.9011861012519316e-05, + "loss": 0.002, + "num_input_tokens_seen": 39247568, + "step": 67655 + }, + { + "epoch": 10.077450104259755, + "grad_norm": 0.017650190740823746, + "learning_rate": 2.900865366969968e-05, + "loss": 0.0182, + "num_input_tokens_seen": 39250576, + "step": 67660 + }, + { + "epoch": 10.078194816800714, + "grad_norm": 0.02586185745894909, + "learning_rate": 2.900544625915793e-05, + "loss": 0.0345, + "num_input_tokens_seen": 39253232, + "step": 67665 + }, + { + "epoch": 10.078939529341675, + "grad_norm": 0.00244390731677413, + "learning_rate": 2.9002238780948232e-05, + "loss": 0.2175, + "num_input_tokens_seen": 39256112, + "step": 67670 + }, + { + "epoch": 10.079684241882633, + "grad_norm": 0.14016132056713104, + "learning_rate": 2.8999031235124775e-05, + "loss": 0.1443, + "num_input_tokens_seen": 39259152, + "step": 67675 + }, + { + "epoch": 10.080428954423592, + "grad_norm": 0.016209036111831665, + "learning_rate": 2.8995823621741754e-05, + "loss": 0.3036, + "num_input_tokens_seen": 39261872, + "step": 67680 + }, + { + "epoch": 10.08117366696455, + "grad_norm": 0.08350064605474472, + "learning_rate": 2.8992615940853347e-05, + "loss": 0.0755, + "num_input_tokens_seen": 39264752, + "step": 67685 + }, + { + "epoch": 10.081918379505511, + "grad_norm": 3.874722480773926, + "learning_rate": 2.8989408192513756e-05, + "loss": 0.1823, + "num_input_tokens_seen": 39267504, + "step": 67690 + }, + { + "epoch": 10.08266309204647, + "grad_norm": 0.10020995140075684, + "learning_rate": 2.898620037677717e-05, + "loss": 0.0058, + "num_input_tokens_seen": 39270608, + "step": 67695 + }, + { + "epoch": 10.083407804587429, + "grad_norm": 0.035599734634160995, + "learning_rate": 2.898299249369777e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39273712, + "step": 67700 + }, + { + "epoch": 10.084152517128388, + "grad_norm": 0.0059423865750432014, + "learning_rate": 2.8979784543329775e-05, + "loss": 0.1599, + "num_input_tokens_seen": 39276784, + "step": 67705 + }, + { + "epoch": 10.084897229669348, + "grad_norm": 0.8782421350479126, + "learning_rate": 2.897657652572735e-05, + "loss": 0.0031, + "num_input_tokens_seen": 39279792, + "step": 67710 + }, + { + "epoch": 10.085641942210307, + "grad_norm": 0.0023063283879309893, + "learning_rate": 2.897336844094472e-05, + "loss": 0.1938, + "num_input_tokens_seen": 39282576, + "step": 67715 + }, + { + "epoch": 10.086386654751266, + "grad_norm": 0.002371256472542882, + "learning_rate": 2.8970160289036064e-05, + "loss": 0.0458, + "num_input_tokens_seen": 39285712, + "step": 67720 + }, + { + "epoch": 10.087131367292224, + "grad_norm": 0.07904089987277985, + "learning_rate": 2.8966952070055582e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39288592, + "step": 67725 + }, + { + "epoch": 10.087876079833185, + "grad_norm": 0.014199189841747284, + "learning_rate": 2.8963743784057474e-05, + "loss": 0.2215, + "num_input_tokens_seen": 39291440, + "step": 67730 + }, + { + "epoch": 10.088620792374144, + "grad_norm": 0.004855596460402012, + "learning_rate": 2.896053543109595e-05, + "loss": 0.1863, + "num_input_tokens_seen": 39294352, + "step": 67735 + }, + { + "epoch": 10.089365504915103, + "grad_norm": 0.006457823794335127, + "learning_rate": 2.8957327011225198e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39297520, + "step": 67740 + }, + { + "epoch": 10.090110217456061, + "grad_norm": 0.010261465795338154, + "learning_rate": 2.8954118524499434e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39300592, + "step": 67745 + }, + { + "epoch": 10.090854929997022, + "grad_norm": 0.17447617650032043, + "learning_rate": 2.895090997097286e-05, + "loss": 0.0742, + "num_input_tokens_seen": 39303696, + "step": 67750 + }, + { + "epoch": 10.09159964253798, + "grad_norm": 0.034057047218084335, + "learning_rate": 2.894770135069967e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39306512, + "step": 67755 + }, + { + "epoch": 10.09234435507894, + "grad_norm": 111.81095123291016, + "learning_rate": 2.894449266373408e-05, + "loss": 0.1018, + "num_input_tokens_seen": 39309136, + "step": 67760 + }, + { + "epoch": 10.093089067619898, + "grad_norm": 0.0020844500977545977, + "learning_rate": 2.8941283910130295e-05, + "loss": 0.132, + "num_input_tokens_seen": 39311920, + "step": 67765 + }, + { + "epoch": 10.093833780160859, + "grad_norm": 0.008415916003286839, + "learning_rate": 2.8938075089942524e-05, + "loss": 0.1534, + "num_input_tokens_seen": 39314544, + "step": 67770 + }, + { + "epoch": 10.094578492701817, + "grad_norm": 101.21928405761719, + "learning_rate": 2.893486620322498e-05, + "loss": 0.2005, + "num_input_tokens_seen": 39317456, + "step": 67775 + }, + { + "epoch": 10.095323205242776, + "grad_norm": 0.06347393244504929, + "learning_rate": 2.893165725003187e-05, + "loss": 0.0147, + "num_input_tokens_seen": 39320368, + "step": 67780 + }, + { + "epoch": 10.096067917783735, + "grad_norm": 0.10800662636756897, + "learning_rate": 2.8928448230417404e-05, + "loss": 0.2066, + "num_input_tokens_seen": 39322992, + "step": 67785 + }, + { + "epoch": 10.096812630324695, + "grad_norm": 0.02174502983689308, + "learning_rate": 2.89252391444358e-05, + "loss": 0.0008, + "num_input_tokens_seen": 39325712, + "step": 67790 + }, + { + "epoch": 10.097557342865654, + "grad_norm": 8.822012901306152, + "learning_rate": 2.892202999214127e-05, + "loss": 0.2405, + "num_input_tokens_seen": 39328528, + "step": 67795 + }, + { + "epoch": 10.098302055406613, + "grad_norm": 0.2574504613876343, + "learning_rate": 2.8918820773588025e-05, + "loss": 0.0009, + "num_input_tokens_seen": 39331312, + "step": 67800 + }, + { + "epoch": 10.099046767947572, + "grad_norm": 0.014513373374938965, + "learning_rate": 2.8915611488830284e-05, + "loss": 0.0008, + "num_input_tokens_seen": 39334128, + "step": 67805 + }, + { + "epoch": 10.099791480488532, + "grad_norm": 0.14955653250217438, + "learning_rate": 2.891240213792228e-05, + "loss": 0.0029, + "num_input_tokens_seen": 39337008, + "step": 67810 + }, + { + "epoch": 10.100536193029491, + "grad_norm": 0.21056416630744934, + "learning_rate": 2.890919272091821e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39339920, + "step": 67815 + }, + { + "epoch": 10.10128090557045, + "grad_norm": 0.06796754896640778, + "learning_rate": 2.8905983237872304e-05, + "loss": 0.0203, + "num_input_tokens_seen": 39342640, + "step": 67820 + }, + { + "epoch": 10.102025618111409, + "grad_norm": 0.05903765559196472, + "learning_rate": 2.890277368883878e-05, + "loss": 0.2086, + "num_input_tokens_seen": 39345584, + "step": 67825 + }, + { + "epoch": 10.102770330652369, + "grad_norm": 0.000523426104336977, + "learning_rate": 2.889956407387186e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39348400, + "step": 67830 + }, + { + "epoch": 10.103515043193328, + "grad_norm": 0.005336216650903225, + "learning_rate": 2.8896354393025765e-05, + "loss": 0.1605, + "num_input_tokens_seen": 39351440, + "step": 67835 + }, + { + "epoch": 10.104259755734287, + "grad_norm": 0.0037913245614618063, + "learning_rate": 2.8893144646354725e-05, + "loss": 0.0083, + "num_input_tokens_seen": 39354480, + "step": 67840 + }, + { + "epoch": 10.105004468275245, + "grad_norm": 0.024941369891166687, + "learning_rate": 2.888993483391297e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39357744, + "step": 67845 + }, + { + "epoch": 10.105749180816204, + "grad_norm": 0.17110411822795868, + "learning_rate": 2.8886724955754713e-05, + "loss": 0.0622, + "num_input_tokens_seen": 39360752, + "step": 67850 + }, + { + "epoch": 10.106493893357165, + "grad_norm": 0.005702323280274868, + "learning_rate": 2.8883515011934186e-05, + "loss": 0.0497, + "num_input_tokens_seen": 39363632, + "step": 67855 + }, + { + "epoch": 10.107238605898123, + "grad_norm": 0.01958557777106762, + "learning_rate": 2.8880305002505624e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39366672, + "step": 67860 + }, + { + "epoch": 10.107983318439082, + "grad_norm": 0.5394982695579529, + "learning_rate": 2.887709492752325e-05, + "loss": 0.0009, + "num_input_tokens_seen": 39369328, + "step": 67865 + }, + { + "epoch": 10.108728030980041, + "grad_norm": 0.025924472138285637, + "learning_rate": 2.8873884787041304e-05, + "loss": 0.2483, + "num_input_tokens_seen": 39372240, + "step": 67870 + }, + { + "epoch": 10.109472743521001, + "grad_norm": 0.01321425847709179, + "learning_rate": 2.8870674581114004e-05, + "loss": 0.0075, + "num_input_tokens_seen": 39375376, + "step": 67875 + }, + { + "epoch": 10.11021745606196, + "grad_norm": 0.002632671035826206, + "learning_rate": 2.88674643097956e-05, + "loss": 0.002, + "num_input_tokens_seen": 39378512, + "step": 67880 + }, + { + "epoch": 10.110962168602919, + "grad_norm": 0.018262643367052078, + "learning_rate": 2.886425397314031e-05, + "loss": 0.04, + "num_input_tokens_seen": 39381328, + "step": 67885 + }, + { + "epoch": 10.111706881143878, + "grad_norm": 113.01956176757812, + "learning_rate": 2.886104357120237e-05, + "loss": 0.0592, + "num_input_tokens_seen": 39384272, + "step": 67890 + }, + { + "epoch": 10.112451593684838, + "grad_norm": 74.32111358642578, + "learning_rate": 2.8857833104036036e-05, + "loss": 0.5715, + "num_input_tokens_seen": 39386992, + "step": 67895 + }, + { + "epoch": 10.113196306225797, + "grad_norm": 0.0023914880584925413, + "learning_rate": 2.8854622571695526e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39389904, + "step": 67900 + }, + { + "epoch": 10.113941018766756, + "grad_norm": 0.030088605359196663, + "learning_rate": 2.8851411974235086e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39392976, + "step": 67905 + }, + { + "epoch": 10.114685731307715, + "grad_norm": 0.03066064603626728, + "learning_rate": 2.884820131170896e-05, + "loss": 0.518, + "num_input_tokens_seen": 39395664, + "step": 67910 + }, + { + "epoch": 10.115430443848675, + "grad_norm": 6.666244029998779, + "learning_rate": 2.884499058417138e-05, + "loss": 0.2372, + "num_input_tokens_seen": 39398352, + "step": 67915 + }, + { + "epoch": 10.116175156389634, + "grad_norm": 0.0008283538627438247, + "learning_rate": 2.8841779791676594e-05, + "loss": 0.0194, + "num_input_tokens_seen": 39401200, + "step": 67920 + }, + { + "epoch": 10.116919868930593, + "grad_norm": 0.017252130433917046, + "learning_rate": 2.8838568934278843e-05, + "loss": 0.2283, + "num_input_tokens_seen": 39403856, + "step": 67925 + }, + { + "epoch": 10.117664581471551, + "grad_norm": 0.044757165014743805, + "learning_rate": 2.8835358012032364e-05, + "loss": 0.1284, + "num_input_tokens_seen": 39406672, + "step": 67930 + }, + { + "epoch": 10.118409294012512, + "grad_norm": 0.036058492958545685, + "learning_rate": 2.8832147024991412e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39409680, + "step": 67935 + }, + { + "epoch": 10.11915400655347, + "grad_norm": 0.005097055807709694, + "learning_rate": 2.882893597321024e-05, + "loss": 0.0008, + "num_input_tokens_seen": 39412336, + "step": 67940 + }, + { + "epoch": 10.11989871909443, + "grad_norm": 0.40294766426086426, + "learning_rate": 2.8825724856743075e-05, + "loss": 0.0904, + "num_input_tokens_seen": 39415856, + "step": 67945 + }, + { + "epoch": 10.120643431635388, + "grad_norm": 0.006102884188294411, + "learning_rate": 2.8822513675644192e-05, + "loss": 0.0033, + "num_input_tokens_seen": 39418640, + "step": 67950 + }, + { + "epoch": 10.121388144176349, + "grad_norm": 0.005544673185795546, + "learning_rate": 2.8819302429967808e-05, + "loss": 0.1997, + "num_input_tokens_seen": 39421520, + "step": 67955 + }, + { + "epoch": 10.122132856717307, + "grad_norm": 0.0046430411748588085, + "learning_rate": 2.88160911197682e-05, + "loss": 0.0736, + "num_input_tokens_seen": 39424528, + "step": 67960 + }, + { + "epoch": 10.122877569258266, + "grad_norm": 15.698598861694336, + "learning_rate": 2.881287974509961e-05, + "loss": 0.1054, + "num_input_tokens_seen": 39427184, + "step": 67965 + }, + { + "epoch": 10.123622281799225, + "grad_norm": 0.016519060358405113, + "learning_rate": 2.8809668306016286e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39430288, + "step": 67970 + }, + { + "epoch": 10.124366994340185, + "grad_norm": 0.018617970868945122, + "learning_rate": 2.8806456802572502e-05, + "loss": 0.0109, + "num_input_tokens_seen": 39433168, + "step": 67975 + }, + { + "epoch": 10.125111706881144, + "grad_norm": 0.01656881533563137, + "learning_rate": 2.8803245234822485e-05, + "loss": 0.1535, + "num_input_tokens_seen": 39435920, + "step": 67980 + }, + { + "epoch": 10.125856419422103, + "grad_norm": 30.23280906677246, + "learning_rate": 2.880003360282051e-05, + "loss": 0.0781, + "num_input_tokens_seen": 39438768, + "step": 67985 + }, + { + "epoch": 10.126601131963062, + "grad_norm": 0.057810697704553604, + "learning_rate": 2.8796821906620837e-05, + "loss": 0.157, + "num_input_tokens_seen": 39441712, + "step": 67990 + }, + { + "epoch": 10.127345844504022, + "grad_norm": 0.014312039129436016, + "learning_rate": 2.8793610146277707e-05, + "loss": 0.2532, + "num_input_tokens_seen": 39444496, + "step": 67995 + }, + { + "epoch": 10.128090557044981, + "grad_norm": 0.017364243045449257, + "learning_rate": 2.879039832184539e-05, + "loss": 0.001, + "num_input_tokens_seen": 39447216, + "step": 68000 + }, + { + "epoch": 10.12883526958594, + "grad_norm": 0.004996452946215868, + "learning_rate": 2.8787186433378142e-05, + "loss": 0.0916, + "num_input_tokens_seen": 39450256, + "step": 68005 + }, + { + "epoch": 10.129579982126899, + "grad_norm": 0.6150544881820679, + "learning_rate": 2.8783974480930244e-05, + "loss": 0.1548, + "num_input_tokens_seen": 39453104, + "step": 68010 + }, + { + "epoch": 10.130324694667857, + "grad_norm": 2.353059768676758, + "learning_rate": 2.8780762464555928e-05, + "loss": 0.0199, + "num_input_tokens_seen": 39455888, + "step": 68015 + }, + { + "epoch": 10.131069407208818, + "grad_norm": 0.03184642642736435, + "learning_rate": 2.8777550384309477e-05, + "loss": 0.001, + "num_input_tokens_seen": 39458736, + "step": 68020 + }, + { + "epoch": 10.131814119749777, + "grad_norm": 0.016063053160905838, + "learning_rate": 2.877433824024515e-05, + "loss": 0.066, + "num_input_tokens_seen": 39461616, + "step": 68025 + }, + { + "epoch": 10.132558832290735, + "grad_norm": 36.395729064941406, + "learning_rate": 2.8771126032417222e-05, + "loss": 0.073, + "num_input_tokens_seen": 39464656, + "step": 68030 + }, + { + "epoch": 10.133303544831694, + "grad_norm": 2.441103219985962, + "learning_rate": 2.876791376087995e-05, + "loss": 0.073, + "num_input_tokens_seen": 39467696, + "step": 68035 + }, + { + "epoch": 10.134048257372655, + "grad_norm": 0.003609119215980172, + "learning_rate": 2.8764701425687597e-05, + "loss": 0.0018, + "num_input_tokens_seen": 39470672, + "step": 68040 + }, + { + "epoch": 10.134792969913613, + "grad_norm": 0.007623743731528521, + "learning_rate": 2.876148902689445e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39473456, + "step": 68045 + }, + { + "epoch": 10.135537682454572, + "grad_norm": 15.165863037109375, + "learning_rate": 2.875827656455476e-05, + "loss": 0.2309, + "num_input_tokens_seen": 39476112, + "step": 68050 + }, + { + "epoch": 10.136282394995531, + "grad_norm": 0.008449668064713478, + "learning_rate": 2.8755064038722813e-05, + "loss": 0.1562, + "num_input_tokens_seen": 39479120, + "step": 68055 + }, + { + "epoch": 10.137027107536491, + "grad_norm": 0.006115499418228865, + "learning_rate": 2.875185144945287e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39482064, + "step": 68060 + }, + { + "epoch": 10.13777182007745, + "grad_norm": 197.1567840576172, + "learning_rate": 2.874863879679921e-05, + "loss": 0.0863, + "num_input_tokens_seen": 39484880, + "step": 68065 + }, + { + "epoch": 10.138516532618409, + "grad_norm": 0.059698015451431274, + "learning_rate": 2.8745426080816117e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39487952, + "step": 68070 + }, + { + "epoch": 10.139261245159368, + "grad_norm": 0.00082636799197644, + "learning_rate": 2.8742213301557847e-05, + "loss": 0.1506, + "num_input_tokens_seen": 39490928, + "step": 68075 + }, + { + "epoch": 10.140005957700328, + "grad_norm": 0.0252506323158741, + "learning_rate": 2.8739000459078695e-05, + "loss": 0.3104, + "num_input_tokens_seen": 39494000, + "step": 68080 + }, + { + "epoch": 10.140750670241287, + "grad_norm": 0.03469270467758179, + "learning_rate": 2.8735787553432925e-05, + "loss": 0.1815, + "num_input_tokens_seen": 39497104, + "step": 68085 + }, + { + "epoch": 10.141495382782246, + "grad_norm": 0.2889667749404907, + "learning_rate": 2.873257458467482e-05, + "loss": 0.0121, + "num_input_tokens_seen": 39500016, + "step": 68090 + }, + { + "epoch": 10.142240095323205, + "grad_norm": 0.8110653758049011, + "learning_rate": 2.8729361552858662e-05, + "loss": 0.0029, + "num_input_tokens_seen": 39502704, + "step": 68095 + }, + { + "epoch": 10.142984807864165, + "grad_norm": 0.08081755042076111, + "learning_rate": 2.8726148458038732e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39505296, + "step": 68100 + }, + { + "epoch": 10.143729520405124, + "grad_norm": 0.0011442431714385748, + "learning_rate": 2.8722935300269315e-05, + "loss": 0.0005, + "num_input_tokens_seen": 39507984, + "step": 68105 + }, + { + "epoch": 10.144474232946083, + "grad_norm": 0.019983474165201187, + "learning_rate": 2.8719722079604684e-05, + "loss": 0.0506, + "num_input_tokens_seen": 39510640, + "step": 68110 + }, + { + "epoch": 10.145218945487041, + "grad_norm": 6.899999618530273, + "learning_rate": 2.8716508796099135e-05, + "loss": 0.076, + "num_input_tokens_seen": 39513552, + "step": 68115 + }, + { + "epoch": 10.145963658028002, + "grad_norm": 12.80057430267334, + "learning_rate": 2.8713295449806944e-05, + "loss": 0.005, + "num_input_tokens_seen": 39516624, + "step": 68120 + }, + { + "epoch": 10.14670837056896, + "grad_norm": 0.0008149132481776178, + "learning_rate": 2.8710082040782392e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39519504, + "step": 68125 + }, + { + "epoch": 10.14745308310992, + "grad_norm": 18.070701599121094, + "learning_rate": 2.870686856907978e-05, + "loss": 0.1173, + "num_input_tokens_seen": 39522608, + "step": 68130 + }, + { + "epoch": 10.148197795650878, + "grad_norm": 0.0011828058632090688, + "learning_rate": 2.8703655034753397e-05, + "loss": 0.2251, + "num_input_tokens_seen": 39525168, + "step": 68135 + }, + { + "epoch": 10.148942508191839, + "grad_norm": 0.02390090562403202, + "learning_rate": 2.8700441437857527e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39528272, + "step": 68140 + }, + { + "epoch": 10.149687220732797, + "grad_norm": 0.02181168459355831, + "learning_rate": 2.869722777844645e-05, + "loss": 0.0005, + "num_input_tokens_seen": 39531056, + "step": 68145 + }, + { + "epoch": 10.150431933273756, + "grad_norm": 9.328812599182129, + "learning_rate": 2.869401405657448e-05, + "loss": 0.2533, + "num_input_tokens_seen": 39533744, + "step": 68150 + }, + { + "epoch": 10.151176645814715, + "grad_norm": 0.0831802487373352, + "learning_rate": 2.8690800272295888e-05, + "loss": 0.1946, + "num_input_tokens_seen": 39536624, + "step": 68155 + }, + { + "epoch": 10.151921358355676, + "grad_norm": 0.010651582852005959, + "learning_rate": 2.8687586425664974e-05, + "loss": 0.0932, + "num_input_tokens_seen": 39539728, + "step": 68160 + }, + { + "epoch": 10.152666070896634, + "grad_norm": 0.01922447979450226, + "learning_rate": 2.868437251673604e-05, + "loss": 0.1456, + "num_input_tokens_seen": 39542576, + "step": 68165 + }, + { + "epoch": 10.153410783437593, + "grad_norm": 0.0011843915563076735, + "learning_rate": 2.8681158545563375e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39545360, + "step": 68170 + }, + { + "epoch": 10.154155495978552, + "grad_norm": 0.032847750931978226, + "learning_rate": 2.8677944512201283e-05, + "loss": 0.0007, + "num_input_tokens_seen": 39548144, + "step": 68175 + }, + { + "epoch": 10.15490020851951, + "grad_norm": 138.86936950683594, + "learning_rate": 2.8674730416704056e-05, + "loss": 0.0516, + "num_input_tokens_seen": 39550800, + "step": 68180 + }, + { + "epoch": 10.155644921060471, + "grad_norm": 0.12423443049192429, + "learning_rate": 2.8671516259125985e-05, + "loss": 0.0008, + "num_input_tokens_seen": 39553872, + "step": 68185 + }, + { + "epoch": 10.15638963360143, + "grad_norm": 0.01079289335757494, + "learning_rate": 2.866830203952139e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39556688, + "step": 68190 + }, + { + "epoch": 10.157134346142389, + "grad_norm": 17.62286376953125, + "learning_rate": 2.866508775794455e-05, + "loss": 0.0022, + "num_input_tokens_seen": 39559760, + "step": 68195 + }, + { + "epoch": 10.157879058683347, + "grad_norm": 0.3063320517539978, + "learning_rate": 2.866187341444978e-05, + "loss": 0.0024, + "num_input_tokens_seen": 39562448, + "step": 68200 + }, + { + "epoch": 10.158623771224308, + "grad_norm": 15.740387916564941, + "learning_rate": 2.8658659009091383e-05, + "loss": 0.0025, + "num_input_tokens_seen": 39565136, + "step": 68205 + }, + { + "epoch": 10.159368483765267, + "grad_norm": 0.010603416711091995, + "learning_rate": 2.865544454192366e-05, + "loss": 0.0006, + "num_input_tokens_seen": 39568016, + "step": 68210 + }, + { + "epoch": 10.160113196306225, + "grad_norm": 0.16327086091041565, + "learning_rate": 2.8652230013000914e-05, + "loss": 0.0683, + "num_input_tokens_seen": 39570768, + "step": 68215 + }, + { + "epoch": 10.160857908847184, + "grad_norm": 8.851922988891602, + "learning_rate": 2.8649015422377456e-05, + "loss": 0.0279, + "num_input_tokens_seen": 39573552, + "step": 68220 + }, + { + "epoch": 10.161602621388145, + "grad_norm": 0.0002990719804074615, + "learning_rate": 2.864580077010759e-05, + "loss": 0.0394, + "num_input_tokens_seen": 39576464, + "step": 68225 + }, + { + "epoch": 10.162347333929103, + "grad_norm": 0.02704116888344288, + "learning_rate": 2.8642586056245628e-05, + "loss": 0.1848, + "num_input_tokens_seen": 39579664, + "step": 68230 + }, + { + "epoch": 10.163092046470062, + "grad_norm": 0.015363679267466068, + "learning_rate": 2.8639371280845872e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39582800, + "step": 68235 + }, + { + "epoch": 10.163836759011021, + "grad_norm": 12.598794937133789, + "learning_rate": 2.863615644396264e-05, + "loss": 0.0429, + "num_input_tokens_seen": 39585776, + "step": 68240 + }, + { + "epoch": 10.164581471551982, + "grad_norm": 124.34226989746094, + "learning_rate": 2.863294154565025e-05, + "loss": 0.116, + "num_input_tokens_seen": 39588688, + "step": 68245 + }, + { + "epoch": 10.16532618409294, + "grad_norm": 0.013937078416347504, + "learning_rate": 2.862972658596299e-05, + "loss": 0.0046, + "num_input_tokens_seen": 39591472, + "step": 68250 + }, + { + "epoch": 10.166070896633899, + "grad_norm": 0.020046502351760864, + "learning_rate": 2.8626511564955195e-05, + "loss": 0.0065, + "num_input_tokens_seen": 39594448, + "step": 68255 + }, + { + "epoch": 10.166815609174858, + "grad_norm": 0.0035849264822900295, + "learning_rate": 2.8623296482681166e-05, + "loss": 0.0025, + "num_input_tokens_seen": 39597360, + "step": 68260 + }, + { + "epoch": 10.167560321715818, + "grad_norm": 0.01074313372373581, + "learning_rate": 2.862008133919523e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39600176, + "step": 68265 + }, + { + "epoch": 10.168305034256777, + "grad_norm": 0.046630583703517914, + "learning_rate": 2.8616866134551706e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39603184, + "step": 68270 + }, + { + "epoch": 10.169049746797736, + "grad_norm": 0.018035942688584328, + "learning_rate": 2.86136508688049e-05, + "loss": 0.0567, + "num_input_tokens_seen": 39605840, + "step": 68275 + }, + { + "epoch": 10.169794459338695, + "grad_norm": 0.006811181548982859, + "learning_rate": 2.861043554200914e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39608912, + "step": 68280 + }, + { + "epoch": 10.170539171879655, + "grad_norm": 0.12171302735805511, + "learning_rate": 2.8607220154218734e-05, + "loss": 0.2223, + "num_input_tokens_seen": 39611664, + "step": 68285 + }, + { + "epoch": 10.171283884420614, + "grad_norm": 0.0037388557102531195, + "learning_rate": 2.860400470548801e-05, + "loss": 0.3022, + "num_input_tokens_seen": 39614512, + "step": 68290 + }, + { + "epoch": 10.172028596961573, + "grad_norm": 0.0032436323817819357, + "learning_rate": 2.8600789195871286e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39617264, + "step": 68295 + }, + { + "epoch": 10.172773309502531, + "grad_norm": 3.574435234069824, + "learning_rate": 2.8597573625422892e-05, + "loss": 0.0014, + "num_input_tokens_seen": 39619888, + "step": 68300 + }, + { + "epoch": 10.173518022043492, + "grad_norm": 0.2960142195224762, + "learning_rate": 2.859435799419715e-05, + "loss": 0.0045, + "num_input_tokens_seen": 39622928, + "step": 68305 + }, + { + "epoch": 10.17426273458445, + "grad_norm": 0.002695030765607953, + "learning_rate": 2.8591142302248392e-05, + "loss": 0.0008, + "num_input_tokens_seen": 39625904, + "step": 68310 + }, + { + "epoch": 10.17500744712541, + "grad_norm": 0.014737965539097786, + "learning_rate": 2.8587926549630923e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39628656, + "step": 68315 + }, + { + "epoch": 10.175752159666368, + "grad_norm": 0.03353272005915642, + "learning_rate": 2.858471073639908e-05, + "loss": 0.0446, + "num_input_tokens_seen": 39631760, + "step": 68320 + }, + { + "epoch": 10.176496872207329, + "grad_norm": 0.01896665245294571, + "learning_rate": 2.8581494862607194e-05, + "loss": 0.0342, + "num_input_tokens_seen": 39634928, + "step": 68325 + }, + { + "epoch": 10.177241584748288, + "grad_norm": 0.000532559584826231, + "learning_rate": 2.8578278928309594e-05, + "loss": 0.0507, + "num_input_tokens_seen": 39637776, + "step": 68330 + }, + { + "epoch": 10.177986297289246, + "grad_norm": 0.0005089406622573733, + "learning_rate": 2.8575062933560605e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39641008, + "step": 68335 + }, + { + "epoch": 10.178731009830205, + "grad_norm": 0.24986514449119568, + "learning_rate": 2.8571846878414565e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39643952, + "step": 68340 + }, + { + "epoch": 10.179475722371166, + "grad_norm": 0.008037654682993889, + "learning_rate": 2.8568630762925803e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39646448, + "step": 68345 + }, + { + "epoch": 10.180220434912124, + "grad_norm": 0.01017809472978115, + "learning_rate": 2.8565414587148654e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39649200, + "step": 68350 + }, + { + "epoch": 10.180965147453083, + "grad_norm": 0.001101891859434545, + "learning_rate": 2.856219835113744e-05, + "loss": 0.1688, + "num_input_tokens_seen": 39652208, + "step": 68355 + }, + { + "epoch": 10.181709859994042, + "grad_norm": 0.0026264251209795475, + "learning_rate": 2.8558982054946515e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39655344, + "step": 68360 + }, + { + "epoch": 10.182454572535, + "grad_norm": 79.80782318115234, + "learning_rate": 2.85557656986302e-05, + "loss": 0.0355, + "num_input_tokens_seen": 39658384, + "step": 68365 + }, + { + "epoch": 10.183199285075961, + "grad_norm": 0.0006321463151834905, + "learning_rate": 2.8552549282242836e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39661264, + "step": 68370 + }, + { + "epoch": 10.18394399761692, + "grad_norm": 0.0017278658924624324, + "learning_rate": 2.854933280583877e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39664240, + "step": 68375 + }, + { + "epoch": 10.184688710157879, + "grad_norm": 0.009482922032475471, + "learning_rate": 2.8546116269472322e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39666768, + "step": 68380 + }, + { + "epoch": 10.185433422698837, + "grad_norm": 64.43958282470703, + "learning_rate": 2.8542899673197847e-05, + "loss": 0.0803, + "num_input_tokens_seen": 39669616, + "step": 68385 + }, + { + "epoch": 10.186178135239798, + "grad_norm": 0.8501573204994202, + "learning_rate": 2.8539683017069697e-05, + "loss": 0.1919, + "num_input_tokens_seen": 39672784, + "step": 68390 + }, + { + "epoch": 10.186922847780757, + "grad_norm": 0.002679326804354787, + "learning_rate": 2.8536466301142185e-05, + "loss": 0.1604, + "num_input_tokens_seen": 39675664, + "step": 68395 + }, + { + "epoch": 10.187667560321715, + "grad_norm": 70.73193359375, + "learning_rate": 2.853324952546967e-05, + "loss": 0.0166, + "num_input_tokens_seen": 39678512, + "step": 68400 + }, + { + "epoch": 10.188412272862674, + "grad_norm": 0.0020894482731819153, + "learning_rate": 2.8530032690106494e-05, + "loss": 0.0009, + "num_input_tokens_seen": 39681424, + "step": 68405 + }, + { + "epoch": 10.189156985403635, + "grad_norm": 0.0015305771958082914, + "learning_rate": 2.8526815795107016e-05, + "loss": 0.0046, + "num_input_tokens_seen": 39684112, + "step": 68410 + }, + { + "epoch": 10.189901697944594, + "grad_norm": 120.22249603271484, + "learning_rate": 2.8523598840525563e-05, + "loss": 0.2051, + "num_input_tokens_seen": 39686800, + "step": 68415 + }, + { + "epoch": 10.190646410485552, + "grad_norm": 17.346471786499023, + "learning_rate": 2.852038182641648e-05, + "loss": 0.3629, + "num_input_tokens_seen": 39689616, + "step": 68420 + }, + { + "epoch": 10.191391123026511, + "grad_norm": 0.029471132904291153, + "learning_rate": 2.8517164752834136e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39692496, + "step": 68425 + }, + { + "epoch": 10.192135835567472, + "grad_norm": 0.1429152935743332, + "learning_rate": 2.8513947619832866e-05, + "loss": 0.0495, + "num_input_tokens_seen": 39695280, + "step": 68430 + }, + { + "epoch": 10.19288054810843, + "grad_norm": 0.0005249217501841486, + "learning_rate": 2.8510730427467015e-05, + "loss": 0.0283, + "num_input_tokens_seen": 39698448, + "step": 68435 + }, + { + "epoch": 10.19362526064939, + "grad_norm": 0.000338781566824764, + "learning_rate": 2.8507513175790944e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39701456, + "step": 68440 + }, + { + "epoch": 10.194369973190348, + "grad_norm": 0.01724533922970295, + "learning_rate": 2.850429586485901e-05, + "loss": 0.0054, + "num_input_tokens_seen": 39704528, + "step": 68445 + }, + { + "epoch": 10.195114685731308, + "grad_norm": 0.0007665685843676329, + "learning_rate": 2.850107849472555e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39707440, + "step": 68450 + }, + { + "epoch": 10.195859398272267, + "grad_norm": 0.00081637006951496, + "learning_rate": 2.8497861065444937e-05, + "loss": 0.0883, + "num_input_tokens_seen": 39710320, + "step": 68455 + }, + { + "epoch": 10.196604110813226, + "grad_norm": 0.006459720898419619, + "learning_rate": 2.8494643577071506e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39713360, + "step": 68460 + }, + { + "epoch": 10.197348823354185, + "grad_norm": 0.0225088931620121, + "learning_rate": 2.849142602965963e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39716272, + "step": 68465 + }, + { + "epoch": 10.198093535895145, + "grad_norm": 0.07648369669914246, + "learning_rate": 2.8488208423263663e-05, + "loss": 0.0647, + "num_input_tokens_seen": 39718992, + "step": 68470 + }, + { + "epoch": 10.198838248436104, + "grad_norm": 173.6855010986328, + "learning_rate": 2.8484990757937958e-05, + "loss": 0.1508, + "num_input_tokens_seen": 39721936, + "step": 68475 + }, + { + "epoch": 10.199582960977063, + "grad_norm": 0.09353107213973999, + "learning_rate": 2.848177303373687e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39725264, + "step": 68480 + }, + { + "epoch": 10.200327673518021, + "grad_norm": 0.0026440315414220095, + "learning_rate": 2.847855525071477e-05, + "loss": 0.1166, + "num_input_tokens_seen": 39728048, + "step": 68485 + }, + { + "epoch": 10.201072386058982, + "grad_norm": 0.00033317209454253316, + "learning_rate": 2.8475337408926005e-05, + "loss": 0.0051, + "num_input_tokens_seen": 39730800, + "step": 68490 + }, + { + "epoch": 10.20181709859994, + "grad_norm": 0.00021637792815454304, + "learning_rate": 2.8472119508424954e-05, + "loss": 0.1758, + "num_input_tokens_seen": 39733680, + "step": 68495 + }, + { + "epoch": 10.2025618111409, + "grad_norm": 0.24455797672271729, + "learning_rate": 2.8468901549265976e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39736720, + "step": 68500 + }, + { + "epoch": 10.203306523681858, + "grad_norm": 0.0020969416946172714, + "learning_rate": 2.8465683531503435e-05, + "loss": 0.0, + "num_input_tokens_seen": 39739248, + "step": 68505 + }, + { + "epoch": 10.204051236222819, + "grad_norm": 0.0037783682346343994, + "learning_rate": 2.8462465455191682e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39742000, + "step": 68510 + }, + { + "epoch": 10.204795948763778, + "grad_norm": 0.0008540835115127265, + "learning_rate": 2.845924732038511e-05, + "loss": 0.0404, + "num_input_tokens_seen": 39744784, + "step": 68515 + }, + { + "epoch": 10.205540661304736, + "grad_norm": 0.00749362213537097, + "learning_rate": 2.8456029127138056e-05, + "loss": 0.204, + "num_input_tokens_seen": 39747600, + "step": 68520 + }, + { + "epoch": 10.206285373845695, + "grad_norm": 0.012298455461859703, + "learning_rate": 2.8452810875504903e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39750448, + "step": 68525 + }, + { + "epoch": 10.207030086386654, + "grad_norm": 0.006594938226044178, + "learning_rate": 2.8449592565540024e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39753168, + "step": 68530 + }, + { + "epoch": 10.207774798927614, + "grad_norm": 0.006947855465114117, + "learning_rate": 2.844637419729778e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39755984, + "step": 68535 + }, + { + "epoch": 10.208519511468573, + "grad_norm": 0.04933696985244751, + "learning_rate": 2.844315577083255e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39758736, + "step": 68540 + }, + { + "epoch": 10.209264224009532, + "grad_norm": 0.009713765233755112, + "learning_rate": 2.8439937286198704e-05, + "loss": 0.0014, + "num_input_tokens_seen": 39761808, + "step": 68545 + }, + { + "epoch": 10.21000893655049, + "grad_norm": 0.15152540802955627, + "learning_rate": 2.8436718743450614e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39764656, + "step": 68550 + }, + { + "epoch": 10.210753649091451, + "grad_norm": 0.48349693417549133, + "learning_rate": 2.8433500142642654e-05, + "loss": 0.0007, + "num_input_tokens_seen": 39767600, + "step": 68555 + }, + { + "epoch": 10.21149836163241, + "grad_norm": 13.059441566467285, + "learning_rate": 2.8430281483829196e-05, + "loss": 0.1284, + "num_input_tokens_seen": 39770416, + "step": 68560 + }, + { + "epoch": 10.212243074173369, + "grad_norm": 0.01451210305094719, + "learning_rate": 2.842706276706462e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39773200, + "step": 68565 + }, + { + "epoch": 10.212987786714327, + "grad_norm": 0.1285974234342575, + "learning_rate": 2.8423843992403298e-05, + "loss": 0.0005, + "num_input_tokens_seen": 39776272, + "step": 68570 + }, + { + "epoch": 10.213732499255288, + "grad_norm": 0.17561344802379608, + "learning_rate": 2.8420625159899622e-05, + "loss": 0.0035, + "num_input_tokens_seen": 39779280, + "step": 68575 + }, + { + "epoch": 10.214477211796247, + "grad_norm": 0.18812096118927002, + "learning_rate": 2.8417406269607954e-05, + "loss": 0.1321, + "num_input_tokens_seen": 39782384, + "step": 68580 + }, + { + "epoch": 10.215221924337206, + "grad_norm": 0.023762721568346024, + "learning_rate": 2.8414187321582676e-05, + "loss": 0.0034, + "num_input_tokens_seen": 39785616, + "step": 68585 + }, + { + "epoch": 10.215966636878164, + "grad_norm": 0.0246113333851099, + "learning_rate": 2.8410968315878178e-05, + "loss": 0.2409, + "num_input_tokens_seen": 39788816, + "step": 68590 + }, + { + "epoch": 10.216711349419125, + "grad_norm": 0.0016688939649611712, + "learning_rate": 2.8407749252548843e-05, + "loss": 0.0009, + "num_input_tokens_seen": 39791504, + "step": 68595 + }, + { + "epoch": 10.217456061960084, + "grad_norm": 0.02905399538576603, + "learning_rate": 2.8404530131649036e-05, + "loss": 0.072, + "num_input_tokens_seen": 39794320, + "step": 68600 + }, + { + "epoch": 10.218200774501042, + "grad_norm": 54.795169830322266, + "learning_rate": 2.8401310953233158e-05, + "loss": 0.1254, + "num_input_tokens_seen": 39797200, + "step": 68605 + }, + { + "epoch": 10.218945487042001, + "grad_norm": 0.009754246100783348, + "learning_rate": 2.839809171735559e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39800304, + "step": 68610 + }, + { + "epoch": 10.219690199582962, + "grad_norm": 9.33370590209961, + "learning_rate": 2.8394872424070716e-05, + "loss": 0.0778, + "num_input_tokens_seen": 39803088, + "step": 68615 + }, + { + "epoch": 10.22043491212392, + "grad_norm": 0.0005786597612313926, + "learning_rate": 2.8391653073432918e-05, + "loss": 0.0, + "num_input_tokens_seen": 39806160, + "step": 68620 + }, + { + "epoch": 10.22117962466488, + "grad_norm": 0.004338270518928766, + "learning_rate": 2.838843366549659e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39809136, + "step": 68625 + }, + { + "epoch": 10.221924337205838, + "grad_norm": 0.00371676217764616, + "learning_rate": 2.8385214200316118e-05, + "loss": 0.0864, + "num_input_tokens_seen": 39811856, + "step": 68630 + }, + { + "epoch": 10.222669049746798, + "grad_norm": 0.002965745981782675, + "learning_rate": 2.83819946779459e-05, + "loss": 0.0009, + "num_input_tokens_seen": 39814768, + "step": 68635 + }, + { + "epoch": 10.223413762287757, + "grad_norm": 9.4461669921875, + "learning_rate": 2.8378775098440318e-05, + "loss": 0.0016, + "num_input_tokens_seen": 39818032, + "step": 68640 + }, + { + "epoch": 10.224158474828716, + "grad_norm": 0.004561875481158495, + "learning_rate": 2.8375555461853764e-05, + "loss": 0.0, + "num_input_tokens_seen": 39820912, + "step": 68645 + }, + { + "epoch": 10.224903187369675, + "grad_norm": 0.03374649956822395, + "learning_rate": 2.8372335768240626e-05, + "loss": 0.3814, + "num_input_tokens_seen": 39823824, + "step": 68650 + }, + { + "epoch": 10.225647899910635, + "grad_norm": 0.004462340846657753, + "learning_rate": 2.8369116017655307e-05, + "loss": 0.0072, + "num_input_tokens_seen": 39826896, + "step": 68655 + }, + { + "epoch": 10.226392612451594, + "grad_norm": 144.09071350097656, + "learning_rate": 2.836589621015219e-05, + "loss": 0.0885, + "num_input_tokens_seen": 39830128, + "step": 68660 + }, + { + "epoch": 10.227137324992553, + "grad_norm": 0.00917870458215475, + "learning_rate": 2.8362676345785683e-05, + "loss": 0.0532, + "num_input_tokens_seen": 39832784, + "step": 68665 + }, + { + "epoch": 10.227882037533512, + "grad_norm": 0.12469732761383057, + "learning_rate": 2.835945642461018e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39835664, + "step": 68670 + }, + { + "epoch": 10.228626750074472, + "grad_norm": 0.024480774998664856, + "learning_rate": 2.8356236446680073e-05, + "loss": 0.1115, + "num_input_tokens_seen": 39838480, + "step": 68675 + }, + { + "epoch": 10.22937146261543, + "grad_norm": 20.600488662719727, + "learning_rate": 2.835301641204976e-05, + "loss": 0.0742, + "num_input_tokens_seen": 39841360, + "step": 68680 + }, + { + "epoch": 10.23011617515639, + "grad_norm": 0.019251113757491112, + "learning_rate": 2.834979632077364e-05, + "loss": 0.0636, + "num_input_tokens_seen": 39844304, + "step": 68685 + }, + { + "epoch": 10.230860887697348, + "grad_norm": 0.016478801146149635, + "learning_rate": 2.834657617290612e-05, + "loss": 0.1628, + "num_input_tokens_seen": 39847056, + "step": 68690 + }, + { + "epoch": 10.231605600238307, + "grad_norm": 0.006100798025727272, + "learning_rate": 2.8343355968501596e-05, + "loss": 0.212, + "num_input_tokens_seen": 39849840, + "step": 68695 + }, + { + "epoch": 10.232350312779268, + "grad_norm": 0.11313407123088837, + "learning_rate": 2.8340135707614467e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39852720, + "step": 68700 + }, + { + "epoch": 10.233095025320226, + "grad_norm": 0.2612793743610382, + "learning_rate": 2.8336915390299152e-05, + "loss": 0.001, + "num_input_tokens_seen": 39855280, + "step": 68705 + }, + { + "epoch": 10.233839737861185, + "grad_norm": 0.07196421176195145, + "learning_rate": 2.8333695016610034e-05, + "loss": 0.0006, + "num_input_tokens_seen": 39858192, + "step": 68710 + }, + { + "epoch": 10.234584450402144, + "grad_norm": 0.020417431369423866, + "learning_rate": 2.833047458660153e-05, + "loss": 0.2375, + "num_input_tokens_seen": 39860848, + "step": 68715 + }, + { + "epoch": 10.235329162943104, + "grad_norm": 56.378143310546875, + "learning_rate": 2.8327254100328044e-05, + "loss": 0.0276, + "num_input_tokens_seen": 39863952, + "step": 68720 + }, + { + "epoch": 10.236073875484063, + "grad_norm": 0.004392118658870459, + "learning_rate": 2.8324033557843975e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39866896, + "step": 68725 + }, + { + "epoch": 10.236818588025022, + "grad_norm": 0.06126611679792404, + "learning_rate": 2.832081295920374e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39869936, + "step": 68730 + }, + { + "epoch": 10.23756330056598, + "grad_norm": 0.01228520181030035, + "learning_rate": 2.8317592304461744e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39872848, + "step": 68735 + }, + { + "epoch": 10.238308013106941, + "grad_norm": 0.000461885763797909, + "learning_rate": 2.8314371593672408e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39875600, + "step": 68740 + }, + { + "epoch": 10.2390527256479, + "grad_norm": 0.18375350534915924, + "learning_rate": 2.8311150826890122e-05, + "loss": 0.0007, + "num_input_tokens_seen": 39878544, + "step": 68745 + }, + { + "epoch": 10.239797438188859, + "grad_norm": 0.02693570777773857, + "learning_rate": 2.830793000416931e-05, + "loss": 0.0324, + "num_input_tokens_seen": 39881456, + "step": 68750 + }, + { + "epoch": 10.240542150729818, + "grad_norm": 0.0037629876751452684, + "learning_rate": 2.8304709125564382e-05, + "loss": 0.135, + "num_input_tokens_seen": 39884240, + "step": 68755 + }, + { + "epoch": 10.241286863270778, + "grad_norm": 0.014825149439275265, + "learning_rate": 2.8301488191129756e-05, + "loss": 0.0005, + "num_input_tokens_seen": 39886992, + "step": 68760 + }, + { + "epoch": 10.242031575811737, + "grad_norm": 0.0794408991932869, + "learning_rate": 2.8298267200919836e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39889968, + "step": 68765 + }, + { + "epoch": 10.242776288352696, + "grad_norm": 8.671008110046387, + "learning_rate": 2.8295046154989047e-05, + "loss": 0.051, + "num_input_tokens_seen": 39892752, + "step": 68770 + }, + { + "epoch": 10.243521000893654, + "grad_norm": 0.020114919170737267, + "learning_rate": 2.8291825053391808e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39895824, + "step": 68775 + }, + { + "epoch": 10.244265713434615, + "grad_norm": 0.0021724854595959187, + "learning_rate": 2.828860389618252e-05, + "loss": 0.2032, + "num_input_tokens_seen": 39898640, + "step": 68780 + }, + { + "epoch": 10.245010425975574, + "grad_norm": 1.1855766773223877, + "learning_rate": 2.8285382683415617e-05, + "loss": 0.0359, + "num_input_tokens_seen": 39901424, + "step": 68785 + }, + { + "epoch": 10.245755138516532, + "grad_norm": 0.0016820324817672372, + "learning_rate": 2.8282161415145513e-05, + "loss": 0.093, + "num_input_tokens_seen": 39904368, + "step": 68790 + }, + { + "epoch": 10.246499851057491, + "grad_norm": 0.20201998949050903, + "learning_rate": 2.827894009142663e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39907216, + "step": 68795 + }, + { + "epoch": 10.247244563598452, + "grad_norm": 0.008526476100087166, + "learning_rate": 2.827571871231338e-05, + "loss": 0.0008, + "num_input_tokens_seen": 39910160, + "step": 68800 + }, + { + "epoch": 10.24798927613941, + "grad_norm": 0.020381109789013863, + "learning_rate": 2.82724972778602e-05, + "loss": 0.0004, + "num_input_tokens_seen": 39913008, + "step": 68805 + }, + { + "epoch": 10.24873398868037, + "grad_norm": 0.013504724018275738, + "learning_rate": 2.8269275788121503e-05, + "loss": 0.18, + "num_input_tokens_seen": 39915824, + "step": 68810 + }, + { + "epoch": 10.249478701221328, + "grad_norm": 0.015029189176857471, + "learning_rate": 2.8266054243151708e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39918736, + "step": 68815 + }, + { + "epoch": 10.250223413762289, + "grad_norm": 0.005507910158485174, + "learning_rate": 2.8262832643005242e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39922096, + "step": 68820 + }, + { + "epoch": 10.250968126303247, + "grad_norm": 24.607120513916016, + "learning_rate": 2.8259610987736545e-05, + "loss": 0.1816, + "num_input_tokens_seen": 39925104, + "step": 68825 + }, + { + "epoch": 10.251712838844206, + "grad_norm": 0.018732290714979172, + "learning_rate": 2.825638927740003e-05, + "loss": 0.0028, + "num_input_tokens_seen": 39927792, + "step": 68830 + }, + { + "epoch": 10.252457551385165, + "grad_norm": 0.0006149759283289313, + "learning_rate": 2.825316751205013e-05, + "loss": 0.0356, + "num_input_tokens_seen": 39930736, + "step": 68835 + }, + { + "epoch": 10.253202263926125, + "grad_norm": 0.03670412302017212, + "learning_rate": 2.8249945691741276e-05, + "loss": 0.0038, + "num_input_tokens_seen": 39933616, + "step": 68840 + }, + { + "epoch": 10.253946976467084, + "grad_norm": 188.06613159179688, + "learning_rate": 2.824672381652788e-05, + "loss": 0.0811, + "num_input_tokens_seen": 39936688, + "step": 68845 + }, + { + "epoch": 10.254691689008043, + "grad_norm": 34.56734085083008, + "learning_rate": 2.8243501886464392e-05, + "loss": 0.2234, + "num_input_tokens_seen": 39939536, + "step": 68850 + }, + { + "epoch": 10.255436401549002, + "grad_norm": 0.04610779508948326, + "learning_rate": 2.8240279901605238e-05, + "loss": 0.0003, + "num_input_tokens_seen": 39942672, + "step": 68855 + }, + { + "epoch": 10.256181114089962, + "grad_norm": 0.041584718972444534, + "learning_rate": 2.823705786200484e-05, + "loss": 0.0685, + "num_input_tokens_seen": 39945584, + "step": 68860 + }, + { + "epoch": 10.256925826630921, + "grad_norm": 0.6721713542938232, + "learning_rate": 2.8233835767717642e-05, + "loss": 0.0006, + "num_input_tokens_seen": 39948368, + "step": 68865 + }, + { + "epoch": 10.25767053917188, + "grad_norm": 0.04759812727570534, + "learning_rate": 2.8230613618798086e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39951056, + "step": 68870 + }, + { + "epoch": 10.258415251712838, + "grad_norm": 3.1724517345428467, + "learning_rate": 2.822739141530059e-05, + "loss": 0.1514, + "num_input_tokens_seen": 39953872, + "step": 68875 + }, + { + "epoch": 10.259159964253797, + "grad_norm": 0.025141632184386253, + "learning_rate": 2.8224169157279597e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39956976, + "step": 68880 + }, + { + "epoch": 10.259904676794758, + "grad_norm": 102.33554077148438, + "learning_rate": 2.8220946844789535e-05, + "loss": 0.0831, + "num_input_tokens_seen": 39959920, + "step": 68885 + }, + { + "epoch": 10.260649389335716, + "grad_norm": 5.1026082038879395, + "learning_rate": 2.8217724477884854e-05, + "loss": 0.0015, + "num_input_tokens_seen": 39962672, + "step": 68890 + }, + { + "epoch": 10.261394101876675, + "grad_norm": 0.00786516722291708, + "learning_rate": 2.821450205661999e-05, + "loss": 0.2514, + "num_input_tokens_seen": 39965456, + "step": 68895 + }, + { + "epoch": 10.262138814417634, + "grad_norm": 0.042353857308626175, + "learning_rate": 2.8211279581049384e-05, + "loss": 0.0007, + "num_input_tokens_seen": 39968304, + "step": 68900 + }, + { + "epoch": 10.262883526958595, + "grad_norm": 9.751754760742188, + "learning_rate": 2.8208057051227473e-05, + "loss": 0.0045, + "num_input_tokens_seen": 39971088, + "step": 68905 + }, + { + "epoch": 10.263628239499553, + "grad_norm": 0.019497180357575417, + "learning_rate": 2.820483446720869e-05, + "loss": 0.0197, + "num_input_tokens_seen": 39974096, + "step": 68910 + }, + { + "epoch": 10.264372952040512, + "grad_norm": 0.07228966057300568, + "learning_rate": 2.8201611829047498e-05, + "loss": 0.244, + "num_input_tokens_seen": 39976912, + "step": 68915 + }, + { + "epoch": 10.26511766458147, + "grad_norm": 49.5833854675293, + "learning_rate": 2.819838913679832e-05, + "loss": 0.3546, + "num_input_tokens_seen": 39979824, + "step": 68920 + }, + { + "epoch": 10.265862377122431, + "grad_norm": 108.22966003417969, + "learning_rate": 2.819516639051561e-05, + "loss": 0.2126, + "num_input_tokens_seen": 39982512, + "step": 68925 + }, + { + "epoch": 10.26660708966339, + "grad_norm": 0.019964946433901787, + "learning_rate": 2.8191943590253806e-05, + "loss": 0.0002, + "num_input_tokens_seen": 39985616, + "step": 68930 + }, + { + "epoch": 10.267351802204349, + "grad_norm": 0.0004478511691559106, + "learning_rate": 2.8188720736067364e-05, + "loss": 0.0001, + "num_input_tokens_seen": 39988560, + "step": 68935 + }, + { + "epoch": 10.268096514745308, + "grad_norm": 0.0002617106365505606, + "learning_rate": 2.818549782801073e-05, + "loss": 0.3947, + "num_input_tokens_seen": 39991184, + "step": 68940 + }, + { + "epoch": 10.268841227286268, + "grad_norm": 0.0004113568284083158, + "learning_rate": 2.8182274866138343e-05, + "loss": 0.2134, + "num_input_tokens_seen": 39994032, + "step": 68945 + }, + { + "epoch": 10.269585939827227, + "grad_norm": 18.391115188598633, + "learning_rate": 2.8179051850504656e-05, + "loss": 0.0515, + "num_input_tokens_seen": 39996784, + "step": 68950 + }, + { + "epoch": 10.270330652368186, + "grad_norm": 0.22881852090358734, + "learning_rate": 2.8175828781164127e-05, + "loss": 0.1727, + "num_input_tokens_seen": 39999568, + "step": 68955 + }, + { + "epoch": 10.271075364909144, + "grad_norm": 0.01979396492242813, + "learning_rate": 2.8172605658171192e-05, + "loss": 0.0209, + "num_input_tokens_seen": 40002320, + "step": 68960 + }, + { + "epoch": 10.271820077450105, + "grad_norm": 0.0032167350873351097, + "learning_rate": 2.8169382481580303e-05, + "loss": 0.1129, + "num_input_tokens_seen": 40005200, + "step": 68965 + }, + { + "epoch": 10.272564789991064, + "grad_norm": 1.2077325582504272, + "learning_rate": 2.8166159251445928e-05, + "loss": 0.0004, + "num_input_tokens_seen": 40008016, + "step": 68970 + }, + { + "epoch": 10.273309502532022, + "grad_norm": 0.005012722685933113, + "learning_rate": 2.8162935967822505e-05, + "loss": 0.1676, + "num_input_tokens_seen": 40010928, + "step": 68975 + }, + { + "epoch": 10.274054215072981, + "grad_norm": 73.12322998046875, + "learning_rate": 2.8159712630764494e-05, + "loss": 0.2668, + "num_input_tokens_seen": 40013712, + "step": 68980 + }, + { + "epoch": 10.274798927613942, + "grad_norm": 0.00464119017124176, + "learning_rate": 2.815648924032635e-05, + "loss": 0.0031, + "num_input_tokens_seen": 40016400, + "step": 68985 + }, + { + "epoch": 10.2755436401549, + "grad_norm": 0.000818550877738744, + "learning_rate": 2.8153265796562528e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40019216, + "step": 68990 + }, + { + "epoch": 10.27628835269586, + "grad_norm": 0.0010032389545813203, + "learning_rate": 2.815004229952749e-05, + "loss": 0.0525, + "num_input_tokens_seen": 40021968, + "step": 68995 + }, + { + "epoch": 10.277033065236818, + "grad_norm": 0.01598300039768219, + "learning_rate": 2.8146818749275684e-05, + "loss": 0.0408, + "num_input_tokens_seen": 40024880, + "step": 69000 + }, + { + "epoch": 10.277777777777779, + "grad_norm": 0.0037669790908694267, + "learning_rate": 2.814359514586158e-05, + "loss": 0.0, + "num_input_tokens_seen": 40027504, + "step": 69005 + }, + { + "epoch": 10.278522490318737, + "grad_norm": 0.02029714174568653, + "learning_rate": 2.8140371489339624e-05, + "loss": 0.1098, + "num_input_tokens_seen": 40030352, + "step": 69010 + }, + { + "epoch": 10.279267202859696, + "grad_norm": 0.002120146295055747, + "learning_rate": 2.8137147779764285e-05, + "loss": 0.1604, + "num_input_tokens_seen": 40033360, + "step": 69015 + }, + { + "epoch": 10.280011915400655, + "grad_norm": 0.00782864075154066, + "learning_rate": 2.8133924017190023e-05, + "loss": 0.1575, + "num_input_tokens_seen": 40036144, + "step": 69020 + }, + { + "epoch": 10.280756627941615, + "grad_norm": 0.10239457339048386, + "learning_rate": 2.8130700201671296e-05, + "loss": 0.0006, + "num_input_tokens_seen": 40038832, + "step": 69025 + }, + { + "epoch": 10.281501340482574, + "grad_norm": 0.3963054120540619, + "learning_rate": 2.812747633326257e-05, + "loss": 0.0006, + "num_input_tokens_seen": 40041680, + "step": 69030 + }, + { + "epoch": 10.282246053023533, + "grad_norm": 0.007838871330022812, + "learning_rate": 2.812425241201832e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40044304, + "step": 69035 + }, + { + "epoch": 10.282990765564492, + "grad_norm": 0.005291424226015806, + "learning_rate": 2.8121028437993002e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40047312, + "step": 69040 + }, + { + "epoch": 10.283735478105452, + "grad_norm": 0.030692845582962036, + "learning_rate": 2.8117804411241074e-05, + "loss": 0.0007, + "num_input_tokens_seen": 40050128, + "step": 69045 + }, + { + "epoch": 10.284480190646411, + "grad_norm": 0.05574547499418259, + "learning_rate": 2.8114580331817004e-05, + "loss": 0.0027, + "num_input_tokens_seen": 40053072, + "step": 69050 + }, + { + "epoch": 10.28522490318737, + "grad_norm": 0.02985963597893715, + "learning_rate": 2.8111356199775268e-05, + "loss": 0.1358, + "num_input_tokens_seen": 40055792, + "step": 69055 + }, + { + "epoch": 10.285969615728328, + "grad_norm": 0.01245670672506094, + "learning_rate": 2.8108132015170337e-05, + "loss": 0.1713, + "num_input_tokens_seen": 40058640, + "step": 69060 + }, + { + "epoch": 10.286714328269287, + "grad_norm": 89.92089080810547, + "learning_rate": 2.8104907778056667e-05, + "loss": 0.1654, + "num_input_tokens_seen": 40061968, + "step": 69065 + }, + { + "epoch": 10.287459040810248, + "grad_norm": 0.017475899308919907, + "learning_rate": 2.8101683488488745e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40064912, + "step": 69070 + }, + { + "epoch": 10.288203753351207, + "grad_norm": 0.012980150990188122, + "learning_rate": 2.8098459146521026e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40067888, + "step": 69075 + }, + { + "epoch": 10.288948465892165, + "grad_norm": 0.011738192290067673, + "learning_rate": 2.8095234752207993e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40070704, + "step": 69080 + }, + { + "epoch": 10.289693178433124, + "grad_norm": 0.045633379369974136, + "learning_rate": 2.809201030560411e-05, + "loss": 0.0448, + "num_input_tokens_seen": 40073392, + "step": 69085 + }, + { + "epoch": 10.290437890974085, + "grad_norm": 7.6183180809021, + "learning_rate": 2.8088785806763856e-05, + "loss": 0.1358, + "num_input_tokens_seen": 40076304, + "step": 69090 + }, + { + "epoch": 10.291182603515043, + "grad_norm": 0.020129986107349396, + "learning_rate": 2.8085561255741704e-05, + "loss": 0.0253, + "num_input_tokens_seen": 40079120, + "step": 69095 + }, + { + "epoch": 10.291927316056002, + "grad_norm": 0.008448035456240177, + "learning_rate": 2.8082336652592135e-05, + "loss": 0.0512, + "num_input_tokens_seen": 40082256, + "step": 69100 + }, + { + "epoch": 10.29267202859696, + "grad_norm": 0.003531975671648979, + "learning_rate": 2.8079111997369624e-05, + "loss": 0.0004, + "num_input_tokens_seen": 40084848, + "step": 69105 + }, + { + "epoch": 10.293416741137921, + "grad_norm": 0.003037248272448778, + "learning_rate": 2.807588729012864e-05, + "loss": 0.0004, + "num_input_tokens_seen": 40087792, + "step": 69110 + }, + { + "epoch": 10.29416145367888, + "grad_norm": 0.0012387784663587809, + "learning_rate": 2.8072662530923666e-05, + "loss": 0.0013, + "num_input_tokens_seen": 40090832, + "step": 69115 + }, + { + "epoch": 10.294906166219839, + "grad_norm": 48.993553161621094, + "learning_rate": 2.8069437719809182e-05, + "loss": 0.0664, + "num_input_tokens_seen": 40093872, + "step": 69120 + }, + { + "epoch": 10.295650878760798, + "grad_norm": 0.006125838030129671, + "learning_rate": 2.806621285683967e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40096624, + "step": 69125 + }, + { + "epoch": 10.296395591301758, + "grad_norm": 0.22074151039123535, + "learning_rate": 2.8062987942069603e-05, + "loss": 0.1587, + "num_input_tokens_seen": 40099728, + "step": 69130 + }, + { + "epoch": 10.297140303842717, + "grad_norm": 0.00350686046294868, + "learning_rate": 2.8059762975553478e-05, + "loss": 0.0817, + "num_input_tokens_seen": 40102832, + "step": 69135 + }, + { + "epoch": 10.297885016383676, + "grad_norm": 0.006208487320691347, + "learning_rate": 2.8056537957345757e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40105616, + "step": 69140 + }, + { + "epoch": 10.298629728924634, + "grad_norm": 0.009914093650877476, + "learning_rate": 2.8053312887500936e-05, + "loss": 0.2659, + "num_input_tokens_seen": 40108560, + "step": 69145 + }, + { + "epoch": 10.299374441465595, + "grad_norm": 0.06822813302278519, + "learning_rate": 2.8050087766073496e-05, + "loss": 0.2663, + "num_input_tokens_seen": 40111440, + "step": 69150 + }, + { + "epoch": 10.300119154006554, + "grad_norm": 0.006379222497344017, + "learning_rate": 2.804686259311792e-05, + "loss": 0.1969, + "num_input_tokens_seen": 40114480, + "step": 69155 + }, + { + "epoch": 10.300863866547513, + "grad_norm": 0.019976217299699783, + "learning_rate": 2.8043637368688707e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40117040, + "step": 69160 + }, + { + "epoch": 10.301608579088471, + "grad_norm": 0.004866710398346186, + "learning_rate": 2.804041209284033e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40120016, + "step": 69165 + }, + { + "epoch": 10.302353291629432, + "grad_norm": 97.10784912109375, + "learning_rate": 2.803718676562729e-05, + "loss": 0.1672, + "num_input_tokens_seen": 40122864, + "step": 69170 + }, + { + "epoch": 10.30309800417039, + "grad_norm": 0.0022327497135847807, + "learning_rate": 2.803396138710405e-05, + "loss": 0.0357, + "num_input_tokens_seen": 40125744, + "step": 69175 + }, + { + "epoch": 10.30384271671135, + "grad_norm": 0.08816829323768616, + "learning_rate": 2.8030735957325122e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40128720, + "step": 69180 + }, + { + "epoch": 10.304587429252308, + "grad_norm": 3.8856542110443115, + "learning_rate": 2.8027510476344986e-05, + "loss": 0.1214, + "num_input_tokens_seen": 40131760, + "step": 69185 + }, + { + "epoch": 10.305332141793269, + "grad_norm": 0.009480759501457214, + "learning_rate": 2.8024284944218145e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40134480, + "step": 69190 + }, + { + "epoch": 10.306076854334227, + "grad_norm": 0.07367964088916779, + "learning_rate": 2.802105936099908e-05, + "loss": 0.0006, + "num_input_tokens_seen": 40137488, + "step": 69195 + }, + { + "epoch": 10.306821566875186, + "grad_norm": 0.26438143849372864, + "learning_rate": 2.8017833726742293e-05, + "loss": 0.251, + "num_input_tokens_seen": 40140496, + "step": 69200 + }, + { + "epoch": 10.307566279416145, + "grad_norm": 139.94625854492188, + "learning_rate": 2.8014608041502273e-05, + "loss": 0.1648, + "num_input_tokens_seen": 40143600, + "step": 69205 + }, + { + "epoch": 10.308310991957104, + "grad_norm": 8.088919639587402, + "learning_rate": 2.8011382305333505e-05, + "loss": 0.1041, + "num_input_tokens_seen": 40146800, + "step": 69210 + }, + { + "epoch": 10.309055704498064, + "grad_norm": 0.006022125948220491, + "learning_rate": 2.8008156518290496e-05, + "loss": 0.0011, + "num_input_tokens_seen": 40149680, + "step": 69215 + }, + { + "epoch": 10.309800417039023, + "grad_norm": 0.0039965566247701645, + "learning_rate": 2.8004930680427742e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40152240, + "step": 69220 + }, + { + "epoch": 10.310545129579982, + "grad_norm": 90.98851776123047, + "learning_rate": 2.8001704791799732e-05, + "loss": 0.1312, + "num_input_tokens_seen": 40155280, + "step": 69225 + }, + { + "epoch": 10.31128984212094, + "grad_norm": 0.0028050062246620655, + "learning_rate": 2.799847885246098e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40158064, + "step": 69230 + }, + { + "epoch": 10.312034554661901, + "grad_norm": 0.008709446527063847, + "learning_rate": 2.799525286246597e-05, + "loss": 0.3286, + "num_input_tokens_seen": 40161040, + "step": 69235 + }, + { + "epoch": 10.31277926720286, + "grad_norm": 0.006067726761102676, + "learning_rate": 2.7992026821869215e-05, + "loss": 0.0342, + "num_input_tokens_seen": 40163856, + "step": 69240 + }, + { + "epoch": 10.313523979743819, + "grad_norm": 0.01295792032033205, + "learning_rate": 2.7988800730725202e-05, + "loss": 0.005, + "num_input_tokens_seen": 40167216, + "step": 69245 + }, + { + "epoch": 10.314268692284777, + "grad_norm": 0.0020762502681463957, + "learning_rate": 2.7985574589088437e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40170544, + "step": 69250 + }, + { + "epoch": 10.315013404825738, + "grad_norm": 0.0016491629648953676, + "learning_rate": 2.798234839701342e-05, + "loss": 0.0216, + "num_input_tokens_seen": 40173264, + "step": 69255 + }, + { + "epoch": 10.315758117366697, + "grad_norm": 0.7030937075614929, + "learning_rate": 2.797912215455466e-05, + "loss": 0.0006, + "num_input_tokens_seen": 40176368, + "step": 69260 + }, + { + "epoch": 10.316502829907655, + "grad_norm": 0.0003778589889407158, + "learning_rate": 2.797589586176666e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40179376, + "step": 69265 + }, + { + "epoch": 10.317247542448614, + "grad_norm": 0.4331783056259155, + "learning_rate": 2.797266951870393e-05, + "loss": 0.2931, + "num_input_tokens_seen": 40181936, + "step": 69270 + }, + { + "epoch": 10.317992254989575, + "grad_norm": 0.029837004840373993, + "learning_rate": 2.7969443125420963e-05, + "loss": 0.0011, + "num_input_tokens_seen": 40184752, + "step": 69275 + }, + { + "epoch": 10.318736967530533, + "grad_norm": 0.20796477794647217, + "learning_rate": 2.7966216681972278e-05, + "loss": 0.0646, + "num_input_tokens_seen": 40187536, + "step": 69280 + }, + { + "epoch": 10.319481680071492, + "grad_norm": 0.0019325015600770712, + "learning_rate": 2.7962990188412375e-05, + "loss": 0.0075, + "num_input_tokens_seen": 40190384, + "step": 69285 + }, + { + "epoch": 10.320226392612451, + "grad_norm": 0.0032138603273779154, + "learning_rate": 2.7959763644795762e-05, + "loss": 0.2196, + "num_input_tokens_seen": 40193904, + "step": 69290 + }, + { + "epoch": 10.320971105153411, + "grad_norm": 0.8473251461982727, + "learning_rate": 2.7956537051176952e-05, + "loss": 0.0052, + "num_input_tokens_seen": 40196944, + "step": 69295 + }, + { + "epoch": 10.32171581769437, + "grad_norm": 0.003098118584603071, + "learning_rate": 2.7953310407610455e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40199728, + "step": 69300 + }, + { + "epoch": 10.322460530235329, + "grad_norm": 8.680405616760254, + "learning_rate": 2.7950083714150776e-05, + "loss": 0.0313, + "num_input_tokens_seen": 40202352, + "step": 69305 + }, + { + "epoch": 10.323205242776288, + "grad_norm": 0.4667900800704956, + "learning_rate": 2.794685697085243e-05, + "loss": 0.0863, + "num_input_tokens_seen": 40205072, + "step": 69310 + }, + { + "epoch": 10.323949955317248, + "grad_norm": 0.005516365170478821, + "learning_rate": 2.7943630177769932e-05, + "loss": 0.033, + "num_input_tokens_seen": 40208016, + "step": 69315 + }, + { + "epoch": 10.324694667858207, + "grad_norm": 0.631949782371521, + "learning_rate": 2.79404033349578e-05, + "loss": 0.0006, + "num_input_tokens_seen": 40210704, + "step": 69320 + }, + { + "epoch": 10.325439380399166, + "grad_norm": 0.0007506184047088027, + "learning_rate": 2.7937176442470535e-05, + "loss": 0.1911, + "num_input_tokens_seen": 40213232, + "step": 69325 + }, + { + "epoch": 10.326184092940125, + "grad_norm": 0.0006293872720561922, + "learning_rate": 2.793394950036266e-05, + "loss": 0.1892, + "num_input_tokens_seen": 40216144, + "step": 69330 + }, + { + "epoch": 10.326928805481085, + "grad_norm": 0.1329876184463501, + "learning_rate": 2.7930722508688696e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40218672, + "step": 69335 + }, + { + "epoch": 10.327673518022044, + "grad_norm": 0.020353911444544792, + "learning_rate": 2.792749546750315e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40221424, + "step": 69340 + }, + { + "epoch": 10.328418230563003, + "grad_norm": 0.0003611546999309212, + "learning_rate": 2.792426837686054e-05, + "loss": 0.1443, + "num_input_tokens_seen": 40224144, + "step": 69345 + }, + { + "epoch": 10.329162943103961, + "grad_norm": 0.022415468469262123, + "learning_rate": 2.7921041236815387e-05, + "loss": 0.0021, + "num_input_tokens_seen": 40226864, + "step": 69350 + }, + { + "epoch": 10.329907655644922, + "grad_norm": 21.735258102416992, + "learning_rate": 2.7917814047422214e-05, + "loss": 0.0055, + "num_input_tokens_seen": 40229808, + "step": 69355 + }, + { + "epoch": 10.33065236818588, + "grad_norm": 0.008203164674341679, + "learning_rate": 2.7914586808735542e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40232432, + "step": 69360 + }, + { + "epoch": 10.33139708072684, + "grad_norm": 119.94697570800781, + "learning_rate": 2.7911359520809886e-05, + "loss": 0.247, + "num_input_tokens_seen": 40235216, + "step": 69365 + }, + { + "epoch": 10.332141793267798, + "grad_norm": 0.005070372950285673, + "learning_rate": 2.7908132183699775e-05, + "loss": 0.2097, + "num_input_tokens_seen": 40238448, + "step": 69370 + }, + { + "epoch": 10.332886505808759, + "grad_norm": 0.002201403956860304, + "learning_rate": 2.790490479745972e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40241456, + "step": 69375 + }, + { + "epoch": 10.333631218349717, + "grad_norm": 5.278418029774912e-05, + "learning_rate": 2.7901677362144252e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40244272, + "step": 69380 + }, + { + "epoch": 10.334375930890676, + "grad_norm": 58.866878509521484, + "learning_rate": 2.7898449877807885e-05, + "loss": 0.0841, + "num_input_tokens_seen": 40247664, + "step": 69385 + }, + { + "epoch": 10.335120643431635, + "grad_norm": 0.688862144947052, + "learning_rate": 2.7895222344505163e-05, + "loss": 0.0557, + "num_input_tokens_seen": 40250352, + "step": 69390 + }, + { + "epoch": 10.335865355972594, + "grad_norm": 0.1146034225821495, + "learning_rate": 2.78919947622906e-05, + "loss": 0.0165, + "num_input_tokens_seen": 40253456, + "step": 69395 + }, + { + "epoch": 10.336610068513554, + "grad_norm": 0.29745057225227356, + "learning_rate": 2.788876713121873e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40256144, + "step": 69400 + }, + { + "epoch": 10.337354781054513, + "grad_norm": 0.013731016777455807, + "learning_rate": 2.7885539451344077e-05, + "loss": 0.0837, + "num_input_tokens_seen": 40259152, + "step": 69405 + }, + { + "epoch": 10.338099493595472, + "grad_norm": 0.10211361199617386, + "learning_rate": 2.788231172272116e-05, + "loss": 0.0447, + "num_input_tokens_seen": 40262032, + "step": 69410 + }, + { + "epoch": 10.33884420613643, + "grad_norm": 0.02365133911371231, + "learning_rate": 2.7879083945404517e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40264560, + "step": 69415 + }, + { + "epoch": 10.339588918677391, + "grad_norm": 0.06536377966403961, + "learning_rate": 2.7875856119448672e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40267376, + "step": 69420 + }, + { + "epoch": 10.34033363121835, + "grad_norm": 0.02471255511045456, + "learning_rate": 2.7872628244908167e-05, + "loss": 0.0265, + "num_input_tokens_seen": 40270384, + "step": 69425 + }, + { + "epoch": 10.341078343759309, + "grad_norm": 59.47067642211914, + "learning_rate": 2.7869400321837525e-05, + "loss": 0.2038, + "num_input_tokens_seen": 40273072, + "step": 69430 + }, + { + "epoch": 10.341823056300267, + "grad_norm": 0.009752728044986725, + "learning_rate": 2.7866172350291286e-05, + "loss": 0.0031, + "num_input_tokens_seen": 40276112, + "step": 69435 + }, + { + "epoch": 10.342567768841228, + "grad_norm": 0.02627844735980034, + "learning_rate": 2.7862944330323982e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40278832, + "step": 69440 + }, + { + "epoch": 10.343312481382187, + "grad_norm": 0.0020182784646749496, + "learning_rate": 2.785971626199013e-05, + "loss": 0.0927, + "num_input_tokens_seen": 40282032, + "step": 69445 + }, + { + "epoch": 10.344057193923145, + "grad_norm": 0.009409869089722633, + "learning_rate": 2.7856488145344285e-05, + "loss": 0.0103, + "num_input_tokens_seen": 40284816, + "step": 69450 + }, + { + "epoch": 10.344801906464104, + "grad_norm": 176.18150329589844, + "learning_rate": 2.785325998044097e-05, + "loss": 0.1254, + "num_input_tokens_seen": 40287728, + "step": 69455 + }, + { + "epoch": 10.345546619005065, + "grad_norm": 0.0007137917564250529, + "learning_rate": 2.7850031767334734e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40290576, + "step": 69460 + }, + { + "epoch": 10.346291331546023, + "grad_norm": 1.4359726905822754, + "learning_rate": 2.784680350608011e-05, + "loss": 0.0004, + "num_input_tokens_seen": 40293616, + "step": 69465 + }, + { + "epoch": 10.347036044086982, + "grad_norm": 0.002308862516656518, + "learning_rate": 2.784357519673163e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40296304, + "step": 69470 + }, + { + "epoch": 10.347780756627941, + "grad_norm": 0.021235167980194092, + "learning_rate": 2.784034683934384e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40299504, + "step": 69475 + }, + { + "epoch": 10.348525469168901, + "grad_norm": 0.048493314534425735, + "learning_rate": 2.7837118433971277e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40302064, + "step": 69480 + }, + { + "epoch": 10.34927018170986, + "grad_norm": 0.0006535195279866457, + "learning_rate": 2.7833889980668476e-05, + "loss": 0.1738, + "num_input_tokens_seen": 40305200, + "step": 69485 + }, + { + "epoch": 10.350014894250819, + "grad_norm": 0.0009243045351468027, + "learning_rate": 2.7830661479489987e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40308112, + "step": 69490 + }, + { + "epoch": 10.350759606791778, + "grad_norm": 0.008487271144986153, + "learning_rate": 2.782743293049035e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40311312, + "step": 69495 + }, + { + "epoch": 10.351504319332738, + "grad_norm": 0.011107349768280983, + "learning_rate": 2.7824204333724115e-05, + "loss": 0.0312, + "num_input_tokens_seen": 40314320, + "step": 69500 + }, + { + "epoch": 10.352249031873697, + "grad_norm": 0.0022287415340542793, + "learning_rate": 2.7820975689245805e-05, + "loss": 0.547, + "num_input_tokens_seen": 40317392, + "step": 69505 + }, + { + "epoch": 10.352993744414656, + "grad_norm": 0.00042734420276246965, + "learning_rate": 2.7817746997109983e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40320048, + "step": 69510 + }, + { + "epoch": 10.353738456955615, + "grad_norm": 0.9681394100189209, + "learning_rate": 2.7814518257371187e-05, + "loss": 0.2604, + "num_input_tokens_seen": 40323120, + "step": 69515 + }, + { + "epoch": 10.354483169496575, + "grad_norm": 0.003674690145999193, + "learning_rate": 2.7811289470083972e-05, + "loss": 0.2969, + "num_input_tokens_seen": 40325808, + "step": 69520 + }, + { + "epoch": 10.355227882037534, + "grad_norm": 0.018774114549160004, + "learning_rate": 2.7808060635302875e-05, + "loss": 0.3004, + "num_input_tokens_seen": 40328816, + "step": 69525 + }, + { + "epoch": 10.355972594578493, + "grad_norm": 0.0021246348042041063, + "learning_rate": 2.7804831753082445e-05, + "loss": 0.2816, + "num_input_tokens_seen": 40331632, + "step": 69530 + }, + { + "epoch": 10.356717307119451, + "grad_norm": 0.004000342451035976, + "learning_rate": 2.7801602823477236e-05, + "loss": 0.0033, + "num_input_tokens_seen": 40334576, + "step": 69535 + }, + { + "epoch": 10.357462019660412, + "grad_norm": 7.457290172576904, + "learning_rate": 2.779837384654179e-05, + "loss": 0.0012, + "num_input_tokens_seen": 40337456, + "step": 69540 + }, + { + "epoch": 10.35820673220137, + "grad_norm": 0.017477845773100853, + "learning_rate": 2.7795144822330673e-05, + "loss": 0.5069, + "num_input_tokens_seen": 40340080, + "step": 69545 + }, + { + "epoch": 10.35895144474233, + "grad_norm": 0.003096680622547865, + "learning_rate": 2.7791915750898413e-05, + "loss": 0.0054, + "num_input_tokens_seen": 40342928, + "step": 69550 + }, + { + "epoch": 10.359696157283288, + "grad_norm": 0.12348619848489761, + "learning_rate": 2.7788686632299577e-05, + "loss": 0.0004, + "num_input_tokens_seen": 40345648, + "step": 69555 + }, + { + "epoch": 10.360440869824249, + "grad_norm": 0.04400606080889702, + "learning_rate": 2.778545746658872e-05, + "loss": 0.0375, + "num_input_tokens_seen": 40348400, + "step": 69560 + }, + { + "epoch": 10.361185582365207, + "grad_norm": 0.03390780836343765, + "learning_rate": 2.7782228253820385e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40351408, + "step": 69565 + }, + { + "epoch": 10.361930294906166, + "grad_norm": 3.8742804527282715, + "learning_rate": 2.777899899404914e-05, + "loss": 0.0067, + "num_input_tokens_seen": 40354288, + "step": 69570 + }, + { + "epoch": 10.362675007447125, + "grad_norm": 0.004634256474673748, + "learning_rate": 2.777576968732952e-05, + "loss": 0.0092, + "num_input_tokens_seen": 40357200, + "step": 69575 + }, + { + "epoch": 10.363419719988084, + "grad_norm": 1.2450710535049438, + "learning_rate": 2.7772540333716102e-05, + "loss": 0.0006, + "num_input_tokens_seen": 40360176, + "step": 69580 + }, + { + "epoch": 10.364164432529044, + "grad_norm": 100.7286376953125, + "learning_rate": 2.7769310933263425e-05, + "loss": 0.0207, + "num_input_tokens_seen": 40363120, + "step": 69585 + }, + { + "epoch": 10.364909145070003, + "grad_norm": 117.06837463378906, + "learning_rate": 2.776608148602605e-05, + "loss": 0.1637, + "num_input_tokens_seen": 40366128, + "step": 69590 + }, + { + "epoch": 10.365653857610962, + "grad_norm": 2.9423091411590576, + "learning_rate": 2.7762851992058548e-05, + "loss": 0.1839, + "num_input_tokens_seen": 40368816, + "step": 69595 + }, + { + "epoch": 10.36639857015192, + "grad_norm": 138.9230194091797, + "learning_rate": 2.7759622451415473e-05, + "loss": 0.0131, + "num_input_tokens_seen": 40371536, + "step": 69600 + }, + { + "epoch": 10.367143282692881, + "grad_norm": 0.005749912932515144, + "learning_rate": 2.775639286415138e-05, + "loss": 0.3476, + "num_input_tokens_seen": 40374288, + "step": 69605 + }, + { + "epoch": 10.36788799523384, + "grad_norm": 0.0028625966515392065, + "learning_rate": 2.7753163230320828e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40377392, + "step": 69610 + }, + { + "epoch": 10.368632707774799, + "grad_norm": 121.15202331542969, + "learning_rate": 2.774993354997838e-05, + "loss": 0.0121, + "num_input_tokens_seen": 40380304, + "step": 69615 + }, + { + "epoch": 10.369377420315757, + "grad_norm": 0.03601572662591934, + "learning_rate": 2.77467038231786e-05, + "loss": 0.1114, + "num_input_tokens_seen": 40383184, + "step": 69620 + }, + { + "epoch": 10.370122132856718, + "grad_norm": 28.264896392822266, + "learning_rate": 2.7743474049976054e-05, + "loss": 0.1311, + "num_input_tokens_seen": 40386096, + "step": 69625 + }, + { + "epoch": 10.370866845397677, + "grad_norm": 0.006701839622110128, + "learning_rate": 2.77402442304253e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40388720, + "step": 69630 + }, + { + "epoch": 10.371611557938635, + "grad_norm": 48.593448638916016, + "learning_rate": 2.7737014364580904e-05, + "loss": 0.0257, + "num_input_tokens_seen": 40391536, + "step": 69635 + }, + { + "epoch": 10.372356270479594, + "grad_norm": 0.0003026935737580061, + "learning_rate": 2.7733784452497436e-05, + "loss": 0.1283, + "num_input_tokens_seen": 40394128, + "step": 69640 + }, + { + "epoch": 10.373100983020555, + "grad_norm": 0.04345129802823067, + "learning_rate": 2.7730554494229453e-05, + "loss": 0.244, + "num_input_tokens_seen": 40397328, + "step": 69645 + }, + { + "epoch": 10.373845695561513, + "grad_norm": 0.022325651720166206, + "learning_rate": 2.772732448983153e-05, + "loss": 0.1945, + "num_input_tokens_seen": 40400208, + "step": 69650 + }, + { + "epoch": 10.374590408102472, + "grad_norm": 0.0189062412828207, + "learning_rate": 2.7724094439358227e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40402800, + "step": 69655 + }, + { + "epoch": 10.375335120643431, + "grad_norm": 3.7129194736480713, + "learning_rate": 2.7720864342864123e-05, + "loss": 0.1557, + "num_input_tokens_seen": 40405712, + "step": 69660 + }, + { + "epoch": 10.376079833184392, + "grad_norm": 0.024392489343881607, + "learning_rate": 2.771763420040378e-05, + "loss": 0.1285, + "num_input_tokens_seen": 40408720, + "step": 69665 + }, + { + "epoch": 10.37682454572535, + "grad_norm": 0.21076524257659912, + "learning_rate": 2.771440401203177e-05, + "loss": 0.1987, + "num_input_tokens_seen": 40411696, + "step": 69670 + }, + { + "epoch": 10.377569258266309, + "grad_norm": 0.05839092656970024, + "learning_rate": 2.7711173777802657e-05, + "loss": 0.1029, + "num_input_tokens_seen": 40414640, + "step": 69675 + }, + { + "epoch": 10.378313970807268, + "grad_norm": 0.13419541716575623, + "learning_rate": 2.770794349777102e-05, + "loss": 0.1185, + "num_input_tokens_seen": 40417680, + "step": 69680 + }, + { + "epoch": 10.379058683348228, + "grad_norm": 21.26020622253418, + "learning_rate": 2.770471317199144e-05, + "loss": 0.0224, + "num_input_tokens_seen": 40420240, + "step": 69685 + }, + { + "epoch": 10.379803395889187, + "grad_norm": 1.0225975513458252, + "learning_rate": 2.7701482800518475e-05, + "loss": 0.0076, + "num_input_tokens_seen": 40422960, + "step": 69690 + }, + { + "epoch": 10.380548108430146, + "grad_norm": 13.689696311950684, + "learning_rate": 2.7698252383406696e-05, + "loss": 0.1913, + "num_input_tokens_seen": 40425616, + "step": 69695 + }, + { + "epoch": 10.381292820971105, + "grad_norm": 0.13883158564567566, + "learning_rate": 2.7695021920710694e-05, + "loss": 0.0023, + "num_input_tokens_seen": 40428528, + "step": 69700 + }, + { + "epoch": 10.382037533512065, + "grad_norm": 0.09324344992637634, + "learning_rate": 2.7691791412485035e-05, + "loss": 0.1062, + "num_input_tokens_seen": 40431280, + "step": 69705 + }, + { + "epoch": 10.382782246053024, + "grad_norm": 0.005336607340723276, + "learning_rate": 2.768856085878429e-05, + "loss": 0.1048, + "num_input_tokens_seen": 40434480, + "step": 69710 + }, + { + "epoch": 10.383526958593983, + "grad_norm": 0.23052705824375153, + "learning_rate": 2.7685330259663045e-05, + "loss": 0.0006, + "num_input_tokens_seen": 40437328, + "step": 69715 + }, + { + "epoch": 10.384271671134941, + "grad_norm": 0.020094547420740128, + "learning_rate": 2.768209961517587e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40440400, + "step": 69720 + }, + { + "epoch": 10.3850163836759, + "grad_norm": 0.6080392599105835, + "learning_rate": 2.7678868925377362e-05, + "loss": 0.296, + "num_input_tokens_seen": 40443344, + "step": 69725 + }, + { + "epoch": 10.38576109621686, + "grad_norm": 0.08559956401586533, + "learning_rate": 2.7675638190322073e-05, + "loss": 0.3224, + "num_input_tokens_seen": 40446832, + "step": 69730 + }, + { + "epoch": 10.38650580875782, + "grad_norm": 0.023732861503958702, + "learning_rate": 2.7672407410064603e-05, + "loss": 0.0006, + "num_input_tokens_seen": 40449968, + "step": 69735 + }, + { + "epoch": 10.387250521298778, + "grad_norm": 19.67704963684082, + "learning_rate": 2.7669176584659522e-05, + "loss": 0.1201, + "num_input_tokens_seen": 40452752, + "step": 69740 + }, + { + "epoch": 10.387995233839739, + "grad_norm": 1.239530086517334, + "learning_rate": 2.766594571416141e-05, + "loss": 0.0006, + "num_input_tokens_seen": 40455536, + "step": 69745 + }, + { + "epoch": 10.388739946380698, + "grad_norm": 0.020737800747156143, + "learning_rate": 2.7662714798624865e-05, + "loss": 0.029, + "num_input_tokens_seen": 40458160, + "step": 69750 + }, + { + "epoch": 10.389484658921656, + "grad_norm": 0.012577040120959282, + "learning_rate": 2.7659483838104456e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40461328, + "step": 69755 + }, + { + "epoch": 10.390229371462615, + "grad_norm": 0.010533109307289124, + "learning_rate": 2.7656252832654766e-05, + "loss": 0.2443, + "num_input_tokens_seen": 40464080, + "step": 69760 + }, + { + "epoch": 10.390974084003574, + "grad_norm": 0.09709250926971436, + "learning_rate": 2.765302178233039e-05, + "loss": 0.0012, + "num_input_tokens_seen": 40467024, + "step": 69765 + }, + { + "epoch": 10.391718796544534, + "grad_norm": 0.13654330372810364, + "learning_rate": 2.764979068718591e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40469904, + "step": 69770 + }, + { + "epoch": 10.392463509085493, + "grad_norm": 0.0019995635375380516, + "learning_rate": 2.7646559547275907e-05, + "loss": 0.0711, + "num_input_tokens_seen": 40472912, + "step": 69775 + }, + { + "epoch": 10.393208221626452, + "grad_norm": 0.1231350526213646, + "learning_rate": 2.7643328362654968e-05, + "loss": 0.0063, + "num_input_tokens_seen": 40475856, + "step": 69780 + }, + { + "epoch": 10.39395293416741, + "grad_norm": 0.005535657983273268, + "learning_rate": 2.764009713337768e-05, + "loss": 0.1201, + "num_input_tokens_seen": 40478896, + "step": 69785 + }, + { + "epoch": 10.394697646708371, + "grad_norm": 40.12206268310547, + "learning_rate": 2.763686585949864e-05, + "loss": 0.0372, + "num_input_tokens_seen": 40481936, + "step": 69790 + }, + { + "epoch": 10.39544235924933, + "grad_norm": 0.04357251524925232, + "learning_rate": 2.7633634541072428e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40485104, + "step": 69795 + }, + { + "epoch": 10.396187071790289, + "grad_norm": 0.007583441212773323, + "learning_rate": 2.763040317815364e-05, + "loss": 0.0664, + "num_input_tokens_seen": 40487664, + "step": 69800 + }, + { + "epoch": 10.396931784331247, + "grad_norm": 0.0048003289848566055, + "learning_rate": 2.7627171770796868e-05, + "loss": 0.188, + "num_input_tokens_seen": 40490832, + "step": 69805 + }, + { + "epoch": 10.397676496872208, + "grad_norm": 0.049629468470811844, + "learning_rate": 2.762394031905669e-05, + "loss": 0.0855, + "num_input_tokens_seen": 40493552, + "step": 69810 + }, + { + "epoch": 10.398421209413167, + "grad_norm": 2.223078966140747, + "learning_rate": 2.7620708822987708e-05, + "loss": 0.0019, + "num_input_tokens_seen": 40496240, + "step": 69815 + }, + { + "epoch": 10.399165921954125, + "grad_norm": 0.07591739296913147, + "learning_rate": 2.7617477282644515e-05, + "loss": 0.1531, + "num_input_tokens_seen": 40499184, + "step": 69820 + }, + { + "epoch": 10.399910634495084, + "grad_norm": 0.09138958156108856, + "learning_rate": 2.7614245698081702e-05, + "loss": 0.0773, + "num_input_tokens_seen": 40502032, + "step": 69825 + }, + { + "epoch": 10.400655347036045, + "grad_norm": 0.04994117468595505, + "learning_rate": 2.7611014069353874e-05, + "loss": 0.0006, + "num_input_tokens_seen": 40504880, + "step": 69830 + }, + { + "epoch": 10.401400059577004, + "grad_norm": 0.012770598754286766, + "learning_rate": 2.760778239651561e-05, + "loss": 0.0366, + "num_input_tokens_seen": 40507568, + "step": 69835 + }, + { + "epoch": 10.402144772117962, + "grad_norm": 0.027769992128014565, + "learning_rate": 2.760455067962151e-05, + "loss": 0.0434, + "num_input_tokens_seen": 40510448, + "step": 69840 + }, + { + "epoch": 10.402889484658921, + "grad_norm": 0.014196191914379597, + "learning_rate": 2.7601318918726184e-05, + "loss": 0.0009, + "num_input_tokens_seen": 40513488, + "step": 69845 + }, + { + "epoch": 10.403634197199882, + "grad_norm": 10.425786018371582, + "learning_rate": 2.7598087113884207e-05, + "loss": 0.008, + "num_input_tokens_seen": 40516304, + "step": 69850 + }, + { + "epoch": 10.40437890974084, + "grad_norm": 0.0006871598889119923, + "learning_rate": 2.7594855265150192e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40518960, + "step": 69855 + }, + { + "epoch": 10.405123622281799, + "grad_norm": 0.0003296216018497944, + "learning_rate": 2.7591623372578736e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40521872, + "step": 69860 + }, + { + "epoch": 10.405868334822758, + "grad_norm": 0.0017286568181589246, + "learning_rate": 2.758839143622444e-05, + "loss": 0.1538, + "num_input_tokens_seen": 40524912, + "step": 69865 + }, + { + "epoch": 10.406613047363718, + "grad_norm": 0.002433359157294035, + "learning_rate": 2.7585159456141895e-05, + "loss": 0.247, + "num_input_tokens_seen": 40528048, + "step": 69870 + }, + { + "epoch": 10.407357759904677, + "grad_norm": 95.47602081298828, + "learning_rate": 2.7581927432385713e-05, + "loss": 0.161, + "num_input_tokens_seen": 40531440, + "step": 69875 + }, + { + "epoch": 10.408102472445636, + "grad_norm": 0.07817697525024414, + "learning_rate": 2.757869536501049e-05, + "loss": 0.1873, + "num_input_tokens_seen": 40534224, + "step": 69880 + }, + { + "epoch": 10.408847184986595, + "grad_norm": 0.021329443901777267, + "learning_rate": 2.7575463254070832e-05, + "loss": 0.0011, + "num_input_tokens_seen": 40537360, + "step": 69885 + }, + { + "epoch": 10.409591897527555, + "grad_norm": 0.00018718278442975134, + "learning_rate": 2.757223109962134e-05, + "loss": 0.0008, + "num_input_tokens_seen": 40540080, + "step": 69890 + }, + { + "epoch": 10.410336610068514, + "grad_norm": 0.017944063991308212, + "learning_rate": 2.7568998901716613e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40543120, + "step": 69895 + }, + { + "epoch": 10.411081322609473, + "grad_norm": 0.05298147350549698, + "learning_rate": 2.7565766660411263e-05, + "loss": 0.0461, + "num_input_tokens_seen": 40546000, + "step": 69900 + }, + { + "epoch": 10.411826035150431, + "grad_norm": 0.031232785433530807, + "learning_rate": 2.7562534375759898e-05, + "loss": 0.0051, + "num_input_tokens_seen": 40549008, + "step": 69905 + }, + { + "epoch": 10.41257074769139, + "grad_norm": 0.0031752772629261017, + "learning_rate": 2.7559302047817108e-05, + "loss": 0.0552, + "num_input_tokens_seen": 40551856, + "step": 69910 + }, + { + "epoch": 10.41331546023235, + "grad_norm": 0.00014505474246107042, + "learning_rate": 2.755606967663752e-05, + "loss": 0.0017, + "num_input_tokens_seen": 40554736, + "step": 69915 + }, + { + "epoch": 10.41406017277331, + "grad_norm": 0.020771190524101257, + "learning_rate": 2.755283726227573e-05, + "loss": 0.0031, + "num_input_tokens_seen": 40557744, + "step": 69920 + }, + { + "epoch": 10.414804885314268, + "grad_norm": 0.009499991312623024, + "learning_rate": 2.754960480478635e-05, + "loss": 0.075, + "num_input_tokens_seen": 40560560, + "step": 69925 + }, + { + "epoch": 10.415549597855227, + "grad_norm": 0.0127768125385046, + "learning_rate": 2.7546372304223983e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40563408, + "step": 69930 + }, + { + "epoch": 10.416294310396188, + "grad_norm": 0.8696167469024658, + "learning_rate": 2.7543139760643255e-05, + "loss": 0.0481, + "num_input_tokens_seen": 40566320, + "step": 69935 + }, + { + "epoch": 10.417039022937146, + "grad_norm": 0.0017213179962709546, + "learning_rate": 2.7539907174098755e-05, + "loss": 0.0104, + "num_input_tokens_seen": 40569456, + "step": 69940 + }, + { + "epoch": 10.417783735478105, + "grad_norm": 0.09057439118623734, + "learning_rate": 2.7536674544645108e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40572272, + "step": 69945 + }, + { + "epoch": 10.418528448019064, + "grad_norm": 0.28769591450691223, + "learning_rate": 2.7533441872336923e-05, + "loss": 0.036, + "num_input_tokens_seen": 40575280, + "step": 69950 + }, + { + "epoch": 10.419273160560024, + "grad_norm": 0.0043305521830916405, + "learning_rate": 2.7530209157228808e-05, + "loss": 0.0454, + "num_input_tokens_seen": 40578448, + "step": 69955 + }, + { + "epoch": 10.420017873100983, + "grad_norm": 0.008470693603157997, + "learning_rate": 2.752697639937539e-05, + "loss": 0.0832, + "num_input_tokens_seen": 40581520, + "step": 69960 + }, + { + "epoch": 10.420762585641942, + "grad_norm": 0.0039037710521370173, + "learning_rate": 2.752374359883127e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40584496, + "step": 69965 + }, + { + "epoch": 10.4215072981829, + "grad_norm": 0.013544458895921707, + "learning_rate": 2.7520510755651068e-05, + "loss": 0.0018, + "num_input_tokens_seen": 40587312, + "step": 69970 + }, + { + "epoch": 10.422252010723861, + "grad_norm": 0.2508939206600189, + "learning_rate": 2.7517277869889395e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40590352, + "step": 69975 + }, + { + "epoch": 10.42299672326482, + "grad_norm": 0.03155124559998512, + "learning_rate": 2.7514044941600874e-05, + "loss": 0.219, + "num_input_tokens_seen": 40593104, + "step": 69980 + }, + { + "epoch": 10.423741435805779, + "grad_norm": 0.03707507625222206, + "learning_rate": 2.7510811970840115e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40596176, + "step": 69985 + }, + { + "epoch": 10.424486148346737, + "grad_norm": 0.3093346059322357, + "learning_rate": 2.7507578957661746e-05, + "loss": 0.0424, + "num_input_tokens_seen": 40598864, + "step": 69990 + }, + { + "epoch": 10.425230860887698, + "grad_norm": 16.764293670654297, + "learning_rate": 2.7504345902120375e-05, + "loss": 0.2189, + "num_input_tokens_seen": 40602128, + "step": 69995 + }, + { + "epoch": 10.425975573428657, + "grad_norm": 0.034274909645318985, + "learning_rate": 2.7501112804270624e-05, + "loss": 0.0145, + "num_input_tokens_seen": 40605328, + "step": 70000 + }, + { + "epoch": 10.426720285969616, + "grad_norm": 56.99748611450195, + "learning_rate": 2.749787966416712e-05, + "loss": 0.1129, + "num_input_tokens_seen": 40608272, + "step": 70005 + }, + { + "epoch": 10.427464998510574, + "grad_norm": 0.1522458791732788, + "learning_rate": 2.7494646481864472e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40611472, + "step": 70010 + }, + { + "epoch": 10.428209711051535, + "grad_norm": 48.89912796020508, + "learning_rate": 2.749141325741731e-05, + "loss": 0.0795, + "num_input_tokens_seen": 40614160, + "step": 70015 + }, + { + "epoch": 10.428954423592494, + "grad_norm": 19.704402923583984, + "learning_rate": 2.7488179990880248e-05, + "loss": 0.4146, + "num_input_tokens_seen": 40617040, + "step": 70020 + }, + { + "epoch": 10.429699136133452, + "grad_norm": 16.61503028869629, + "learning_rate": 2.7484946682307917e-05, + "loss": 0.1596, + "num_input_tokens_seen": 40619888, + "step": 70025 + }, + { + "epoch": 10.430443848674411, + "grad_norm": 126.52696990966797, + "learning_rate": 2.7481713331754945e-05, + "loss": 0.0472, + "num_input_tokens_seen": 40622768, + "step": 70030 + }, + { + "epoch": 10.431188561215372, + "grad_norm": 0.021133633330464363, + "learning_rate": 2.7478479939275937e-05, + "loss": 0.013, + "num_input_tokens_seen": 40625552, + "step": 70035 + }, + { + "epoch": 10.43193327375633, + "grad_norm": 0.004754537250846624, + "learning_rate": 2.7475246504925535e-05, + "loss": 0.1856, + "num_input_tokens_seen": 40628272, + "step": 70040 + }, + { + "epoch": 10.43267798629729, + "grad_norm": 0.007273274473845959, + "learning_rate": 2.7472013028758364e-05, + "loss": 0.2083, + "num_input_tokens_seen": 40631248, + "step": 70045 + }, + { + "epoch": 10.433422698838248, + "grad_norm": 2.0184552669525146, + "learning_rate": 2.7468779510829036e-05, + "loss": 0.0024, + "num_input_tokens_seen": 40634128, + "step": 70050 + }, + { + "epoch": 10.434167411379208, + "grad_norm": 0.08783865720033646, + "learning_rate": 2.746554595119219e-05, + "loss": 0.1858, + "num_input_tokens_seen": 40637104, + "step": 70055 + }, + { + "epoch": 10.434912123920167, + "grad_norm": 36.91840744018555, + "learning_rate": 2.7462312349902452e-05, + "loss": 0.1577, + "num_input_tokens_seen": 40640112, + "step": 70060 + }, + { + "epoch": 10.435656836461126, + "grad_norm": 0.02458685263991356, + "learning_rate": 2.7459078707014453e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40643088, + "step": 70065 + }, + { + "epoch": 10.436401549002085, + "grad_norm": 0.031222466379404068, + "learning_rate": 2.745584502258281e-05, + "loss": 0.0579, + "num_input_tokens_seen": 40645840, + "step": 70070 + }, + { + "epoch": 10.437146261543045, + "grad_norm": 0.12671448290348053, + "learning_rate": 2.745261129666217e-05, + "loss": 0.0375, + "num_input_tokens_seen": 40648912, + "step": 70075 + }, + { + "epoch": 10.437890974084004, + "grad_norm": 71.4633560180664, + "learning_rate": 2.7449377529307147e-05, + "loss": 0.1285, + "num_input_tokens_seen": 40651792, + "step": 70080 + }, + { + "epoch": 10.438635686624963, + "grad_norm": 0.0030588156078010798, + "learning_rate": 2.7446143720572387e-05, + "loss": 0.0075, + "num_input_tokens_seen": 40654800, + "step": 70085 + }, + { + "epoch": 10.439380399165922, + "grad_norm": 231.08926391601562, + "learning_rate": 2.7442909870512513e-05, + "loss": 0.2681, + "num_input_tokens_seen": 40657776, + "step": 70090 + }, + { + "epoch": 10.44012511170688, + "grad_norm": 0.1096244677901268, + "learning_rate": 2.7439675979182155e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40660688, + "step": 70095 + }, + { + "epoch": 10.44086982424784, + "grad_norm": 30.657073974609375, + "learning_rate": 2.7436442046635962e-05, + "loss": 0.2454, + "num_input_tokens_seen": 40663760, + "step": 70100 + }, + { + "epoch": 10.4416145367888, + "grad_norm": 0.052027422934770584, + "learning_rate": 2.7433208072928546e-05, + "loss": 0.1762, + "num_input_tokens_seen": 40667024, + "step": 70105 + }, + { + "epoch": 10.442359249329758, + "grad_norm": 0.02337770164012909, + "learning_rate": 2.7429974058114553e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40669840, + "step": 70110 + }, + { + "epoch": 10.443103961870717, + "grad_norm": 0.0006371290655806661, + "learning_rate": 2.7426740002248624e-05, + "loss": 0.0128, + "num_input_tokens_seen": 40672720, + "step": 70115 + }, + { + "epoch": 10.443848674411678, + "grad_norm": 0.007524711079895496, + "learning_rate": 2.7423505905385382e-05, + "loss": 0.0004, + "num_input_tokens_seen": 40675600, + "step": 70120 + }, + { + "epoch": 10.444593386952636, + "grad_norm": 0.008304232731461525, + "learning_rate": 2.742027176757948e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40678448, + "step": 70125 + }, + { + "epoch": 10.445338099493595, + "grad_norm": 0.034286268055438995, + "learning_rate": 2.741703758888554e-05, + "loss": 0.3754, + "num_input_tokens_seen": 40681424, + "step": 70130 + }, + { + "epoch": 10.446082812034554, + "grad_norm": 59.8144416809082, + "learning_rate": 2.7413803369358217e-05, + "loss": 0.1507, + "num_input_tokens_seen": 40684304, + "step": 70135 + }, + { + "epoch": 10.446827524575514, + "grad_norm": 0.009559686295688152, + "learning_rate": 2.7410569109052124e-05, + "loss": 0.0952, + "num_input_tokens_seen": 40687216, + "step": 70140 + }, + { + "epoch": 10.447572237116473, + "grad_norm": 0.12964460253715515, + "learning_rate": 2.7407334808021924e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40690032, + "step": 70145 + }, + { + "epoch": 10.448316949657432, + "grad_norm": 0.007347932085394859, + "learning_rate": 2.740410046632224e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40693040, + "step": 70150 + }, + { + "epoch": 10.44906166219839, + "grad_norm": 0.00294916401617229, + "learning_rate": 2.7400866084007732e-05, + "loss": 0.0021, + "num_input_tokens_seen": 40696944, + "step": 70155 + }, + { + "epoch": 10.449806374739351, + "grad_norm": 0.0052564069628715515, + "learning_rate": 2.7397631661133032e-05, + "loss": 0.0434, + "num_input_tokens_seen": 40699728, + "step": 70160 + }, + { + "epoch": 10.45055108728031, + "grad_norm": 0.07083535194396973, + "learning_rate": 2.7394397197752786e-05, + "loss": 0.0061, + "num_input_tokens_seen": 40703088, + "step": 70165 + }, + { + "epoch": 10.451295799821269, + "grad_norm": 433.88763427734375, + "learning_rate": 2.7391162693921624e-05, + "loss": 0.0948, + "num_input_tokens_seen": 40705904, + "step": 70170 + }, + { + "epoch": 10.452040512362228, + "grad_norm": 30.392072677612305, + "learning_rate": 2.7387928149694197e-05, + "loss": 0.023, + "num_input_tokens_seen": 40709008, + "step": 70175 + }, + { + "epoch": 10.452785224903188, + "grad_norm": 0.007155480794608593, + "learning_rate": 2.7384693565125153e-05, + "loss": 0.0012, + "num_input_tokens_seen": 40712080, + "step": 70180 + }, + { + "epoch": 10.453529937444147, + "grad_norm": 0.036526910960674286, + "learning_rate": 2.7381458940269134e-05, + "loss": 0.0007, + "num_input_tokens_seen": 40714800, + "step": 70185 + }, + { + "epoch": 10.454274649985106, + "grad_norm": 0.015888290479779243, + "learning_rate": 2.737822427518079e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40717680, + "step": 70190 + }, + { + "epoch": 10.455019362526064, + "grad_norm": 0.0050633251667022705, + "learning_rate": 2.7374989569914766e-05, + "loss": 0.0, + "num_input_tokens_seen": 40720592, + "step": 70195 + }, + { + "epoch": 10.455764075067025, + "grad_norm": 0.0015863690059632063, + "learning_rate": 2.73717548245257e-05, + "loss": 0.1751, + "num_input_tokens_seen": 40723600, + "step": 70200 + }, + { + "epoch": 10.456508787607984, + "grad_norm": 0.0016010659746825695, + "learning_rate": 2.736852003906826e-05, + "loss": 0.1626, + "num_input_tokens_seen": 40726576, + "step": 70205 + }, + { + "epoch": 10.457253500148942, + "grad_norm": 0.0014529683394357562, + "learning_rate": 2.736528521359707e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40729264, + "step": 70210 + }, + { + "epoch": 10.457998212689901, + "grad_norm": 0.001279207062907517, + "learning_rate": 2.736205034816679e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40732464, + "step": 70215 + }, + { + "epoch": 10.458742925230862, + "grad_norm": 0.0038343474734574556, + "learning_rate": 2.735881544283207e-05, + "loss": 0.0768, + "num_input_tokens_seen": 40735312, + "step": 70220 + }, + { + "epoch": 10.45948763777182, + "grad_norm": 24.788776397705078, + "learning_rate": 2.735558049764756e-05, + "loss": 0.0157, + "num_input_tokens_seen": 40738480, + "step": 70225 + }, + { + "epoch": 10.46023235031278, + "grad_norm": 5.368073463439941, + "learning_rate": 2.735234551266792e-05, + "loss": 0.0013, + "num_input_tokens_seen": 40741424, + "step": 70230 + }, + { + "epoch": 10.460977062853738, + "grad_norm": 0.005930907558649778, + "learning_rate": 2.734911048794779e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40744080, + "step": 70235 + }, + { + "epoch": 10.461721775394698, + "grad_norm": 0.322569876909256, + "learning_rate": 2.7345875423541817e-05, + "loss": 0.0291, + "num_input_tokens_seen": 40747024, + "step": 70240 + }, + { + "epoch": 10.462466487935657, + "grad_norm": 0.02677016519010067, + "learning_rate": 2.7342640319504674e-05, + "loss": 0.2826, + "num_input_tokens_seen": 40750064, + "step": 70245 + }, + { + "epoch": 10.463211200476616, + "grad_norm": 0.0035034487955272198, + "learning_rate": 2.7339405175890998e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40752784, + "step": 70250 + }, + { + "epoch": 10.463955913017575, + "grad_norm": 0.005301403347402811, + "learning_rate": 2.733616999275545e-05, + "loss": 0.0048, + "num_input_tokens_seen": 40755472, + "step": 70255 + }, + { + "epoch": 10.464700625558535, + "grad_norm": 0.006997674237936735, + "learning_rate": 2.7332934770152686e-05, + "loss": 0.0004, + "num_input_tokens_seen": 40758256, + "step": 70260 + }, + { + "epoch": 10.465445338099494, + "grad_norm": 2.4281184673309326, + "learning_rate": 2.7329699508137363e-05, + "loss": 0.0038, + "num_input_tokens_seen": 40761232, + "step": 70265 + }, + { + "epoch": 10.466190050640453, + "grad_norm": 0.015390085987746716, + "learning_rate": 2.7326464206764125e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40764048, + "step": 70270 + }, + { + "epoch": 10.466934763181412, + "grad_norm": 232.47817993164062, + "learning_rate": 2.7323228866087647e-05, + "loss": 0.1103, + "num_input_tokens_seen": 40767024, + "step": 70275 + }, + { + "epoch": 10.46767947572237, + "grad_norm": 4.189451694488525, + "learning_rate": 2.731999348616257e-05, + "loss": 0.0089, + "num_input_tokens_seen": 40769936, + "step": 70280 + }, + { + "epoch": 10.46842418826333, + "grad_norm": 40.03812789916992, + "learning_rate": 2.731675806704357e-05, + "loss": 0.1801, + "num_input_tokens_seen": 40773040, + "step": 70285 + }, + { + "epoch": 10.46916890080429, + "grad_norm": 0.004145576618611813, + "learning_rate": 2.7313522608785295e-05, + "loss": 0.0292, + "num_input_tokens_seen": 40776240, + "step": 70290 + }, + { + "epoch": 10.469913613345248, + "grad_norm": 0.04119434207677841, + "learning_rate": 2.7310287111442407e-05, + "loss": 0.0467, + "num_input_tokens_seen": 40779344, + "step": 70295 + }, + { + "epoch": 10.470658325886207, + "grad_norm": 0.17320501804351807, + "learning_rate": 2.730705157506957e-05, + "loss": 0.2349, + "num_input_tokens_seen": 40782256, + "step": 70300 + }, + { + "epoch": 10.471403038427168, + "grad_norm": 0.06509801745414734, + "learning_rate": 2.7303815999721433e-05, + "loss": 0.0977, + "num_input_tokens_seen": 40785040, + "step": 70305 + }, + { + "epoch": 10.472147750968126, + "grad_norm": 0.02889196015894413, + "learning_rate": 2.730058038545267e-05, + "loss": 0.1439, + "num_input_tokens_seen": 40788016, + "step": 70310 + }, + { + "epoch": 10.472892463509085, + "grad_norm": 13.069742202758789, + "learning_rate": 2.7297344732317938e-05, + "loss": 0.0017, + "num_input_tokens_seen": 40790768, + "step": 70315 + }, + { + "epoch": 10.473637176050044, + "grad_norm": 0.0079894894734025, + "learning_rate": 2.7294109040371902e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40793776, + "step": 70320 + }, + { + "epoch": 10.474381888591004, + "grad_norm": 0.0017860903171822429, + "learning_rate": 2.729087330966923e-05, + "loss": 0.0, + "num_input_tokens_seen": 40796784, + "step": 70325 + }, + { + "epoch": 10.475126601131963, + "grad_norm": 0.003625280922278762, + "learning_rate": 2.7287637540264584e-05, + "loss": 0.1049, + "num_input_tokens_seen": 40799824, + "step": 70330 + }, + { + "epoch": 10.475871313672922, + "grad_norm": 30.121244430541992, + "learning_rate": 2.7284401732212615e-05, + "loss": 0.223, + "num_input_tokens_seen": 40802544, + "step": 70335 + }, + { + "epoch": 10.47661602621388, + "grad_norm": 0.00363173452205956, + "learning_rate": 2.7281165885568006e-05, + "loss": 0.0, + "num_input_tokens_seen": 40805168, + "step": 70340 + }, + { + "epoch": 10.477360738754841, + "grad_norm": 0.008195169270038605, + "learning_rate": 2.7277930000385414e-05, + "loss": 0.1263, + "num_input_tokens_seen": 40808080, + "step": 70345 + }, + { + "epoch": 10.4781054512958, + "grad_norm": 110.96424102783203, + "learning_rate": 2.7274694076719513e-05, + "loss": 0.2433, + "num_input_tokens_seen": 40810800, + "step": 70350 + }, + { + "epoch": 10.478850163836759, + "grad_norm": 0.002263119909912348, + "learning_rate": 2.727145811462497e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40813392, + "step": 70355 + }, + { + "epoch": 10.479594876377718, + "grad_norm": 0.2166406363248825, + "learning_rate": 2.7268222114156454e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40816592, + "step": 70360 + }, + { + "epoch": 10.480339588918678, + "grad_norm": 0.005048351362347603, + "learning_rate": 2.7264986075368625e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40819632, + "step": 70365 + }, + { + "epoch": 10.481084301459637, + "grad_norm": 0.0005896119400858879, + "learning_rate": 2.726174999831616e-05, + "loss": 0.0661, + "num_input_tokens_seen": 40822736, + "step": 70370 + }, + { + "epoch": 10.481829014000596, + "grad_norm": 0.009538992308080196, + "learning_rate": 2.7258513883053727e-05, + "loss": 0.3876, + "num_input_tokens_seen": 40825552, + "step": 70375 + }, + { + "epoch": 10.482573726541554, + "grad_norm": 0.10147932916879654, + "learning_rate": 2.7255277729635997e-05, + "loss": 0.0528, + "num_input_tokens_seen": 40828368, + "step": 70380 + }, + { + "epoch": 10.483318439082515, + "grad_norm": 0.0009877082193270326, + "learning_rate": 2.725204153811764e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40831600, + "step": 70385 + }, + { + "epoch": 10.484063151623474, + "grad_norm": 0.008317772299051285, + "learning_rate": 2.7248805308553333e-05, + "loss": 0.0118, + "num_input_tokens_seen": 40834512, + "step": 70390 + }, + { + "epoch": 10.484807864164432, + "grad_norm": 0.13020138442516327, + "learning_rate": 2.7245569040997747e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40837488, + "step": 70395 + }, + { + "epoch": 10.485552576705391, + "grad_norm": 0.039568834006786346, + "learning_rate": 2.7242332735505555e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40840368, + "step": 70400 + }, + { + "epoch": 10.486297289246352, + "grad_norm": 0.05553530529141426, + "learning_rate": 2.7239096392131423e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40843664, + "step": 70405 + }, + { + "epoch": 10.48704200178731, + "grad_norm": 0.0005905787693336606, + "learning_rate": 2.723586001093004e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40846896, + "step": 70410 + }, + { + "epoch": 10.48778671432827, + "grad_norm": 0.001934971078298986, + "learning_rate": 2.7232623591956074e-05, + "loss": 0.2676, + "num_input_tokens_seen": 40849840, + "step": 70415 + }, + { + "epoch": 10.488531426869228, + "grad_norm": 0.0049825566820800304, + "learning_rate": 2.72293871352642e-05, + "loss": 0.0, + "num_input_tokens_seen": 40852752, + "step": 70420 + }, + { + "epoch": 10.489276139410187, + "grad_norm": 0.7907485961914062, + "learning_rate": 2.7226150640909092e-05, + "loss": 0.0007, + "num_input_tokens_seen": 40855536, + "step": 70425 + }, + { + "epoch": 10.490020851951147, + "grad_norm": 0.0008542565046809614, + "learning_rate": 2.722291410894544e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40858416, + "step": 70430 + }, + { + "epoch": 10.490765564492106, + "grad_norm": 4.361741542816162, + "learning_rate": 2.721967753942791e-05, + "loss": 0.0443, + "num_input_tokens_seen": 40861072, + "step": 70435 + }, + { + "epoch": 10.491510277033065, + "grad_norm": 0.03820021450519562, + "learning_rate": 2.721644093241118e-05, + "loss": 0.0047, + "num_input_tokens_seen": 40863632, + "step": 70440 + }, + { + "epoch": 10.492254989574024, + "grad_norm": 0.049591656774282455, + "learning_rate": 2.7213204287949938e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40866576, + "step": 70445 + }, + { + "epoch": 10.492999702114984, + "grad_norm": 0.006974089425057173, + "learning_rate": 2.7209967606098862e-05, + "loss": 0.1133, + "num_input_tokens_seen": 40869488, + "step": 70450 + }, + { + "epoch": 10.493744414655943, + "grad_norm": 0.08295556157827377, + "learning_rate": 2.7206730886912624e-05, + "loss": 0.0024, + "num_input_tokens_seen": 40872464, + "step": 70455 + }, + { + "epoch": 10.494489127196902, + "grad_norm": 17.192440032958984, + "learning_rate": 2.7203494130445905e-05, + "loss": 0.1518, + "num_input_tokens_seen": 40875696, + "step": 70460 + }, + { + "epoch": 10.49523383973786, + "grad_norm": 0.0033742368686944246, + "learning_rate": 2.7200257336753405e-05, + "loss": 0.0871, + "num_input_tokens_seen": 40878768, + "step": 70465 + }, + { + "epoch": 10.495978552278821, + "grad_norm": 9.758747100830078, + "learning_rate": 2.7197020505889786e-05, + "loss": 0.2845, + "num_input_tokens_seen": 40881904, + "step": 70470 + }, + { + "epoch": 10.49672326481978, + "grad_norm": 1.2141293287277222, + "learning_rate": 2.7193783637909736e-05, + "loss": 0.0007, + "num_input_tokens_seen": 40884368, + "step": 70475 + }, + { + "epoch": 10.497467977360738, + "grad_norm": 0.002475134329870343, + "learning_rate": 2.7190546732867945e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40887088, + "step": 70480 + }, + { + "epoch": 10.498212689901697, + "grad_norm": 0.043256860226392746, + "learning_rate": 2.7187309790819092e-05, + "loss": 0.2035, + "num_input_tokens_seen": 40890000, + "step": 70485 + }, + { + "epoch": 10.498957402442658, + "grad_norm": 0.23031924664974213, + "learning_rate": 2.7184072811817867e-05, + "loss": 0.0108, + "num_input_tokens_seen": 40893168, + "step": 70490 + }, + { + "epoch": 10.499702114983616, + "grad_norm": 0.008061329834163189, + "learning_rate": 2.7180835795918952e-05, + "loss": 0.0823, + "num_input_tokens_seen": 40895696, + "step": 70495 + }, + { + "epoch": 10.500446827524575, + "grad_norm": 0.007922038435935974, + "learning_rate": 2.7177598743177028e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40898640, + "step": 70500 + }, + { + "epoch": 10.501191540065534, + "grad_norm": 0.00029727898072451353, + "learning_rate": 2.717436165364679e-05, + "loss": 0.0001, + "num_input_tokens_seen": 40901552, + "step": 70505 + }, + { + "epoch": 10.501936252606495, + "grad_norm": 0.0023140248376876116, + "learning_rate": 2.7171124527382917e-05, + "loss": 0.0054, + "num_input_tokens_seen": 40904432, + "step": 70510 + }, + { + "epoch": 10.502680965147453, + "grad_norm": 0.01729200780391693, + "learning_rate": 2.7167887364440102e-05, + "loss": 0.0004, + "num_input_tokens_seen": 40907152, + "step": 70515 + }, + { + "epoch": 10.503425677688412, + "grad_norm": 31.618894577026367, + "learning_rate": 2.7164650164873033e-05, + "loss": 0.1346, + "num_input_tokens_seen": 40910288, + "step": 70520 + }, + { + "epoch": 10.50417039022937, + "grad_norm": 185.52459716796875, + "learning_rate": 2.7161412928736407e-05, + "loss": 0.0824, + "num_input_tokens_seen": 40913168, + "step": 70525 + }, + { + "epoch": 10.504915102770331, + "grad_norm": 0.00557643361389637, + "learning_rate": 2.7158175656084906e-05, + "loss": 0.0033, + "num_input_tokens_seen": 40915984, + "step": 70530 + }, + { + "epoch": 10.50565981531129, + "grad_norm": 0.0014821842778474092, + "learning_rate": 2.7154938346973214e-05, + "loss": 0.1352, + "num_input_tokens_seen": 40918832, + "step": 70535 + }, + { + "epoch": 10.506404527852249, + "grad_norm": 0.11185727268457413, + "learning_rate": 2.715170100145603e-05, + "loss": 0.0002, + "num_input_tokens_seen": 40921520, + "step": 70540 + }, + { + "epoch": 10.507149240393208, + "grad_norm": 0.013520550914108753, + "learning_rate": 2.7148463619588045e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40924656, + "step": 70545 + }, + { + "epoch": 10.507893952934168, + "grad_norm": 0.0007864300860092044, + "learning_rate": 2.714522620142395e-05, + "loss": 0.1193, + "num_input_tokens_seen": 40927504, + "step": 70550 + }, + { + "epoch": 10.508638665475127, + "grad_norm": 61.174373626708984, + "learning_rate": 2.7141988747018437e-05, + "loss": 0.1689, + "num_input_tokens_seen": 40930224, + "step": 70555 + }, + { + "epoch": 10.509383378016086, + "grad_norm": 0.18024516105651855, + "learning_rate": 2.713875125642621e-05, + "loss": 0.0027, + "num_input_tokens_seen": 40932912, + "step": 70560 + }, + { + "epoch": 10.510128090557044, + "grad_norm": 0.027902785688638687, + "learning_rate": 2.713551372970195e-05, + "loss": 0.1612, + "num_input_tokens_seen": 40935824, + "step": 70565 + }, + { + "epoch": 10.510872803098005, + "grad_norm": 0.0019630512688308954, + "learning_rate": 2.7132276166900357e-05, + "loss": 0.343, + "num_input_tokens_seen": 40938576, + "step": 70570 + }, + { + "epoch": 10.511617515638964, + "grad_norm": 0.007545177359133959, + "learning_rate": 2.7129038568076122e-05, + "loss": 0.0003, + "num_input_tokens_seen": 40941360, + "step": 70575 + }, + { + "epoch": 10.512362228179922, + "grad_norm": 0.014493613503873348, + "learning_rate": 2.712580093328394e-05, + "loss": 0.0137, + "num_input_tokens_seen": 40944144, + "step": 70580 + }, + { + "epoch": 10.513106940720881, + "grad_norm": 0.010068314149975777, + "learning_rate": 2.7122563262578515e-05, + "loss": 0.0841, + "num_input_tokens_seen": 40947024, + "step": 70585 + }, + { + "epoch": 10.513851653261842, + "grad_norm": 0.0619623102247715, + "learning_rate": 2.7119325556014546e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40949968, + "step": 70590 + }, + { + "epoch": 10.5145963658028, + "grad_norm": 0.015846598893404007, + "learning_rate": 2.7116087813646724e-05, + "loss": 0.182, + "num_input_tokens_seen": 40953104, + "step": 70595 + }, + { + "epoch": 10.51534107834376, + "grad_norm": 63.90105056762695, + "learning_rate": 2.7112850035529748e-05, + "loss": 0.0647, + "num_input_tokens_seen": 40955824, + "step": 70600 + }, + { + "epoch": 10.516085790884718, + "grad_norm": 0.00634475564584136, + "learning_rate": 2.7109612221718316e-05, + "loss": 0.0142, + "num_input_tokens_seen": 40958544, + "step": 70605 + }, + { + "epoch": 10.516830503425677, + "grad_norm": 0.20515327155590057, + "learning_rate": 2.7106374372267136e-05, + "loss": 0.0056, + "num_input_tokens_seen": 40961328, + "step": 70610 + }, + { + "epoch": 10.517575215966637, + "grad_norm": 0.004085515160113573, + "learning_rate": 2.7103136487230895e-05, + "loss": 0.1862, + "num_input_tokens_seen": 40963952, + "step": 70615 + }, + { + "epoch": 10.518319928507596, + "grad_norm": 24.886219024658203, + "learning_rate": 2.70998985666643e-05, + "loss": 0.3143, + "num_input_tokens_seen": 40967088, + "step": 70620 + }, + { + "epoch": 10.519064641048555, + "grad_norm": 0.06210869923233986, + "learning_rate": 2.7096660610622055e-05, + "loss": 0.0007, + "num_input_tokens_seen": 40969936, + "step": 70625 + }, + { + "epoch": 10.519809353589514, + "grad_norm": 0.03488519787788391, + "learning_rate": 2.7093422619158866e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40972752, + "step": 70630 + }, + { + "epoch": 10.520554066130474, + "grad_norm": 0.30278852581977844, + "learning_rate": 2.709018459232942e-05, + "loss": 0.0163, + "num_input_tokens_seen": 40975792, + "step": 70635 + }, + { + "epoch": 10.521298778671433, + "grad_norm": 0.4711835980415344, + "learning_rate": 2.7086946530188434e-05, + "loss": 0.0681, + "num_input_tokens_seen": 40978800, + "step": 70640 + }, + { + "epoch": 10.522043491212392, + "grad_norm": 0.011641189455986023, + "learning_rate": 2.7083708432790605e-05, + "loss": 0.1665, + "num_input_tokens_seen": 40981936, + "step": 70645 + }, + { + "epoch": 10.52278820375335, + "grad_norm": 0.011415163055062294, + "learning_rate": 2.7080470300190646e-05, + "loss": 0.0589, + "num_input_tokens_seen": 40984976, + "step": 70650 + }, + { + "epoch": 10.523532916294311, + "grad_norm": 62.04402542114258, + "learning_rate": 2.7077232132443247e-05, + "loss": 0.2484, + "num_input_tokens_seen": 40988112, + "step": 70655 + }, + { + "epoch": 10.52427762883527, + "grad_norm": 0.061358675360679626, + "learning_rate": 2.7073993929603138e-05, + "loss": 0.0305, + "num_input_tokens_seen": 40991248, + "step": 70660 + }, + { + "epoch": 10.525022341376228, + "grad_norm": 0.044711582362651825, + "learning_rate": 2.7070755691724993e-05, + "loss": 0.0009, + "num_input_tokens_seen": 40994096, + "step": 70665 + }, + { + "epoch": 10.525767053917187, + "grad_norm": 0.00788439717143774, + "learning_rate": 2.7067517418863543e-05, + "loss": 0.0005, + "num_input_tokens_seen": 40997040, + "step": 70670 + }, + { + "epoch": 10.526511766458148, + "grad_norm": 0.016636043787002563, + "learning_rate": 2.706427911107348e-05, + "loss": 0.1231, + "num_input_tokens_seen": 40999760, + "step": 70675 + }, + { + "epoch": 10.527256478999107, + "grad_norm": 0.0032676602713763714, + "learning_rate": 2.7061040768409523e-05, + "loss": 0.0558, + "num_input_tokens_seen": 41002448, + "step": 70680 + }, + { + "epoch": 10.528001191540065, + "grad_norm": 2.870669364929199, + "learning_rate": 2.705780239092638e-05, + "loss": 0.004, + "num_input_tokens_seen": 41005232, + "step": 70685 + }, + { + "epoch": 10.528745904081024, + "grad_norm": 0.015643218532204628, + "learning_rate": 2.705456397867876e-05, + "loss": 0.0664, + "num_input_tokens_seen": 41007888, + "step": 70690 + }, + { + "epoch": 10.529490616621985, + "grad_norm": 39.76130676269531, + "learning_rate": 2.7051325531721366e-05, + "loss": 0.2001, + "num_input_tokens_seen": 41010672, + "step": 70695 + }, + { + "epoch": 10.530235329162943, + "grad_norm": 0.02986820600926876, + "learning_rate": 2.704808705010891e-05, + "loss": 0.0786, + "num_input_tokens_seen": 41013520, + "step": 70700 + }, + { + "epoch": 10.530980041703902, + "grad_norm": 1.016593337059021, + "learning_rate": 2.7044848533896105e-05, + "loss": 0.0861, + "num_input_tokens_seen": 41016432, + "step": 70705 + }, + { + "epoch": 10.53172475424486, + "grad_norm": 0.0018403996946290135, + "learning_rate": 2.704160998313766e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41019120, + "step": 70710 + }, + { + "epoch": 10.532469466785821, + "grad_norm": 0.001553493901155889, + "learning_rate": 2.7038371397888295e-05, + "loss": 0.113, + "num_input_tokens_seen": 41021840, + "step": 70715 + }, + { + "epoch": 10.53321417932678, + "grad_norm": 25.80925941467285, + "learning_rate": 2.7035132778202717e-05, + "loss": 0.1658, + "num_input_tokens_seen": 41024720, + "step": 70720 + }, + { + "epoch": 10.533958891867739, + "grad_norm": 0.006665575318038464, + "learning_rate": 2.7031894124135638e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41027792, + "step": 70725 + }, + { + "epoch": 10.534703604408698, + "grad_norm": 0.00406304094940424, + "learning_rate": 2.7028655435741772e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41030576, + "step": 70730 + }, + { + "epoch": 10.535448316949658, + "grad_norm": 0.0036341510713100433, + "learning_rate": 2.7025416713075836e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41033392, + "step": 70735 + }, + { + "epoch": 10.536193029490617, + "grad_norm": 0.0013687530299648643, + "learning_rate": 2.702217795619254e-05, + "loss": 0.0853, + "num_input_tokens_seen": 41036144, + "step": 70740 + }, + { + "epoch": 10.536937742031576, + "grad_norm": 0.0055048828944563866, + "learning_rate": 2.7018939165146606e-05, + "loss": 0.0949, + "num_input_tokens_seen": 41039056, + "step": 70745 + }, + { + "epoch": 10.537682454572534, + "grad_norm": 7.867244720458984, + "learning_rate": 2.701570033999274e-05, + "loss": 0.0352, + "num_input_tokens_seen": 41042032, + "step": 70750 + }, + { + "epoch": 10.538427167113493, + "grad_norm": 0.0008228392107412219, + "learning_rate": 2.7012461480785668e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41045040, + "step": 70755 + }, + { + "epoch": 10.539171879654454, + "grad_norm": 0.003188088070601225, + "learning_rate": 2.7009222587580114e-05, + "loss": 0.2189, + "num_input_tokens_seen": 41048080, + "step": 70760 + }, + { + "epoch": 10.539916592195413, + "grad_norm": 7.0853352546691895, + "learning_rate": 2.7005983660430778e-05, + "loss": 0.0468, + "num_input_tokens_seen": 41051024, + "step": 70765 + }, + { + "epoch": 10.540661304736371, + "grad_norm": 0.019339896738529205, + "learning_rate": 2.700274469939239e-05, + "loss": 0.5846, + "num_input_tokens_seen": 41054032, + "step": 70770 + }, + { + "epoch": 10.541406017277332, + "grad_norm": 44.31002426147461, + "learning_rate": 2.6999505704519662e-05, + "loss": 0.1846, + "num_input_tokens_seen": 41056720, + "step": 70775 + }, + { + "epoch": 10.54215072981829, + "grad_norm": 0.40185901522636414, + "learning_rate": 2.6996266675867322e-05, + "loss": 0.0035, + "num_input_tokens_seen": 41059920, + "step": 70780 + }, + { + "epoch": 10.54289544235925, + "grad_norm": 0.007771010976284742, + "learning_rate": 2.6993027613490078e-05, + "loss": 0.2087, + "num_input_tokens_seen": 41062896, + "step": 70785 + }, + { + "epoch": 10.543640154900208, + "grad_norm": 0.07834413647651672, + "learning_rate": 2.698978851744266e-05, + "loss": 0.1353, + "num_input_tokens_seen": 41065904, + "step": 70790 + }, + { + "epoch": 10.544384867441167, + "grad_norm": 0.1590726226568222, + "learning_rate": 2.69865493877798e-05, + "loss": 0.1733, + "num_input_tokens_seen": 41068784, + "step": 70795 + }, + { + "epoch": 10.545129579982127, + "grad_norm": 13.843846321105957, + "learning_rate": 2.698331022455619e-05, + "loss": 0.1059, + "num_input_tokens_seen": 41071728, + "step": 70800 + }, + { + "epoch": 10.545874292523086, + "grad_norm": 0.011011063121259212, + "learning_rate": 2.6980071027826574e-05, + "loss": 0.0633, + "num_input_tokens_seen": 41074800, + "step": 70805 + }, + { + "epoch": 10.546619005064045, + "grad_norm": 0.1068844199180603, + "learning_rate": 2.697683179764568e-05, + "loss": 0.0858, + "num_input_tokens_seen": 41077488, + "step": 70810 + }, + { + "epoch": 10.547363717605004, + "grad_norm": 0.17043869197368622, + "learning_rate": 2.6973592534068216e-05, + "loss": 0.0011, + "num_input_tokens_seen": 41080368, + "step": 70815 + }, + { + "epoch": 10.548108430145964, + "grad_norm": 20.257295608520508, + "learning_rate": 2.697035323714891e-05, + "loss": 0.1576, + "num_input_tokens_seen": 41083152, + "step": 70820 + }, + { + "epoch": 10.548853142686923, + "grad_norm": 0.03759790584445, + "learning_rate": 2.6967113906942494e-05, + "loss": 0.3502, + "num_input_tokens_seen": 41086000, + "step": 70825 + }, + { + "epoch": 10.549597855227882, + "grad_norm": 2.7480978965759277, + "learning_rate": 2.696387454350368e-05, + "loss": 0.1986, + "num_input_tokens_seen": 41088912, + "step": 70830 + }, + { + "epoch": 10.55034256776884, + "grad_norm": 2.829967975616455, + "learning_rate": 2.6960635146887202e-05, + "loss": 0.0014, + "num_input_tokens_seen": 41091952, + "step": 70835 + }, + { + "epoch": 10.551087280309801, + "grad_norm": 4.153541088104248, + "learning_rate": 2.6957395717147794e-05, + "loss": 0.1748, + "num_input_tokens_seen": 41094832, + "step": 70840 + }, + { + "epoch": 10.55183199285076, + "grad_norm": 23.798076629638672, + "learning_rate": 2.695415625434017e-05, + "loss": 0.1608, + "num_input_tokens_seen": 41097680, + "step": 70845 + }, + { + "epoch": 10.552576705391719, + "grad_norm": 0.10956432670354843, + "learning_rate": 2.695091675851906e-05, + "loss": 0.1835, + "num_input_tokens_seen": 41100496, + "step": 70850 + }, + { + "epoch": 10.553321417932677, + "grad_norm": 0.38666144013404846, + "learning_rate": 2.6947677229739198e-05, + "loss": 0.0019, + "num_input_tokens_seen": 41103152, + "step": 70855 + }, + { + "epoch": 10.554066130473638, + "grad_norm": 0.07060039788484573, + "learning_rate": 2.6944437668055313e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41105936, + "step": 70860 + }, + { + "epoch": 10.554810843014597, + "grad_norm": 33.13420867919922, + "learning_rate": 2.6941198073522118e-05, + "loss": 0.1415, + "num_input_tokens_seen": 41108912, + "step": 70865 + }, + { + "epoch": 10.555555555555555, + "grad_norm": 0.01744937151670456, + "learning_rate": 2.693795844619436e-05, + "loss": 0.3166, + "num_input_tokens_seen": 41111824, + "step": 70870 + }, + { + "epoch": 10.556300268096514, + "grad_norm": 5.966365814208984, + "learning_rate": 2.6934718786126763e-05, + "loss": 0.0055, + "num_input_tokens_seen": 41114928, + "step": 70875 + }, + { + "epoch": 10.557044980637475, + "grad_norm": 0.0076737781055271626, + "learning_rate": 2.6931479093374056e-05, + "loss": 0.2597, + "num_input_tokens_seen": 41117936, + "step": 70880 + }, + { + "epoch": 10.557789693178433, + "grad_norm": 39.136383056640625, + "learning_rate": 2.6928239367990974e-05, + "loss": 0.1924, + "num_input_tokens_seen": 41120880, + "step": 70885 + }, + { + "epoch": 10.558534405719392, + "grad_norm": 0.008686182089149952, + "learning_rate": 2.692499961003226e-05, + "loss": 0.0402, + "num_input_tokens_seen": 41123920, + "step": 70890 + }, + { + "epoch": 10.559279118260351, + "grad_norm": 0.003650377271696925, + "learning_rate": 2.692175981955263e-05, + "loss": 0.1157, + "num_input_tokens_seen": 41126736, + "step": 70895 + }, + { + "epoch": 10.560023830801311, + "grad_norm": 0.004227509256452322, + "learning_rate": 2.691851999660681e-05, + "loss": 0.0004, + "num_input_tokens_seen": 41129392, + "step": 70900 + }, + { + "epoch": 10.56076854334227, + "grad_norm": 0.434526652097702, + "learning_rate": 2.691528014124955e-05, + "loss": 0.0004, + "num_input_tokens_seen": 41132016, + "step": 70905 + }, + { + "epoch": 10.561513255883229, + "grad_norm": 0.004243763629347086, + "learning_rate": 2.6912040253535574e-05, + "loss": 0.0755, + "num_input_tokens_seen": 41134928, + "step": 70910 + }, + { + "epoch": 10.562257968424188, + "grad_norm": 0.030079057440161705, + "learning_rate": 2.6908800333519625e-05, + "loss": 0.0668, + "num_input_tokens_seen": 41138064, + "step": 70915 + }, + { + "epoch": 10.563002680965148, + "grad_norm": 0.3321489989757538, + "learning_rate": 2.6905560381256434e-05, + "loss": 0.0632, + "num_input_tokens_seen": 41140752, + "step": 70920 + }, + { + "epoch": 10.563747393506107, + "grad_norm": 57.755069732666016, + "learning_rate": 2.690232039680074e-05, + "loss": 0.0544, + "num_input_tokens_seen": 41143568, + "step": 70925 + }, + { + "epoch": 10.564492106047066, + "grad_norm": 0.0005249588284641504, + "learning_rate": 2.6899080380207276e-05, + "loss": 0.2535, + "num_input_tokens_seen": 41146384, + "step": 70930 + }, + { + "epoch": 10.565236818588025, + "grad_norm": 0.0037727137096226215, + "learning_rate": 2.689584033153078e-05, + "loss": 0.1537, + "num_input_tokens_seen": 41149744, + "step": 70935 + }, + { + "epoch": 10.565981531128983, + "grad_norm": 0.24681080877780914, + "learning_rate": 2.6892600250825982e-05, + "loss": 0.0114, + "num_input_tokens_seen": 41152464, + "step": 70940 + }, + { + "epoch": 10.566726243669944, + "grad_norm": 24.78066062927246, + "learning_rate": 2.688936013814763e-05, + "loss": 0.0037, + "num_input_tokens_seen": 41155248, + "step": 70945 + }, + { + "epoch": 10.567470956210903, + "grad_norm": 0.0016927436226978898, + "learning_rate": 2.688611999355046e-05, + "loss": 0.2065, + "num_input_tokens_seen": 41158032, + "step": 70950 + }, + { + "epoch": 10.568215668751861, + "grad_norm": 0.004265312571078539, + "learning_rate": 2.6882879817089207e-05, + "loss": 0.1131, + "num_input_tokens_seen": 41160816, + "step": 70955 + }, + { + "epoch": 10.568960381292822, + "grad_norm": 0.07564736157655716, + "learning_rate": 2.6879639608818618e-05, + "loss": 0.1994, + "num_input_tokens_seen": 41163920, + "step": 70960 + }, + { + "epoch": 10.56970509383378, + "grad_norm": 0.01779872179031372, + "learning_rate": 2.6876399368793425e-05, + "loss": 0.0008, + "num_input_tokens_seen": 41166576, + "step": 70965 + }, + { + "epoch": 10.57044980637474, + "grad_norm": 0.01613595150411129, + "learning_rate": 2.6873159097068366e-05, + "loss": 0.0024, + "num_input_tokens_seen": 41169168, + "step": 70970 + }, + { + "epoch": 10.571194518915698, + "grad_norm": 0.013383558951318264, + "learning_rate": 2.68699187936982e-05, + "loss": 0.2047, + "num_input_tokens_seen": 41171920, + "step": 70975 + }, + { + "epoch": 10.571939231456657, + "grad_norm": 0.0016350727528333664, + "learning_rate": 2.686667845873765e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41174544, + "step": 70980 + }, + { + "epoch": 10.572683943997617, + "grad_norm": 0.00405712379142642, + "learning_rate": 2.686343809224146e-05, + "loss": 0.0924, + "num_input_tokens_seen": 41177712, + "step": 70985 + }, + { + "epoch": 10.573428656538576, + "grad_norm": 0.019260363653302193, + "learning_rate": 2.6860197694264388e-05, + "loss": 0.0841, + "num_input_tokens_seen": 41180560, + "step": 70990 + }, + { + "epoch": 10.574173369079535, + "grad_norm": 0.26840710639953613, + "learning_rate": 2.685695726486116e-05, + "loss": 0.1116, + "num_input_tokens_seen": 41183280, + "step": 70995 + }, + { + "epoch": 10.574918081620494, + "grad_norm": 53.89603805541992, + "learning_rate": 2.6853716804086527e-05, + "loss": 0.2454, + "num_input_tokens_seen": 41185968, + "step": 71000 + }, + { + "epoch": 10.575662794161454, + "grad_norm": 0.0010951071744784713, + "learning_rate": 2.6850476311995226e-05, + "loss": 0.3512, + "num_input_tokens_seen": 41188752, + "step": 71005 + }, + { + "epoch": 10.576407506702413, + "grad_norm": 0.010430445894598961, + "learning_rate": 2.6847235788642018e-05, + "loss": 0.0015, + "num_input_tokens_seen": 41191568, + "step": 71010 + }, + { + "epoch": 10.577152219243372, + "grad_norm": 0.008730470202863216, + "learning_rate": 2.6843995234081636e-05, + "loss": 0.2522, + "num_input_tokens_seen": 41194576, + "step": 71015 + }, + { + "epoch": 10.57789693178433, + "grad_norm": 0.008107253350317478, + "learning_rate": 2.6840754648368826e-05, + "loss": 0.0735, + "num_input_tokens_seen": 41197520, + "step": 71020 + }, + { + "epoch": 10.578641644325291, + "grad_norm": 0.004469345323741436, + "learning_rate": 2.6837514031558347e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41200528, + "step": 71025 + }, + { + "epoch": 10.57938635686625, + "grad_norm": 0.018351465463638306, + "learning_rate": 2.6834273383704927e-05, + "loss": 0.064, + "num_input_tokens_seen": 41203760, + "step": 71030 + }, + { + "epoch": 10.580131069407209, + "grad_norm": 0.004165989812463522, + "learning_rate": 2.6831032704863324e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41206672, + "step": 71035 + }, + { + "epoch": 10.580875781948167, + "grad_norm": 0.01143832877278328, + "learning_rate": 2.6827791995088282e-05, + "loss": 0.1412, + "num_input_tokens_seen": 41209488, + "step": 71040 + }, + { + "epoch": 10.581620494489128, + "grad_norm": 0.0025579440407454967, + "learning_rate": 2.6824551254434555e-05, + "loss": 0.0005, + "num_input_tokens_seen": 41212784, + "step": 71045 + }, + { + "epoch": 10.582365207030087, + "grad_norm": 72.63348388671875, + "learning_rate": 2.6821310482956886e-05, + "loss": 0.0362, + "num_input_tokens_seen": 41215728, + "step": 71050 + }, + { + "epoch": 10.583109919571045, + "grad_norm": 0.6699756383895874, + "learning_rate": 2.6818069680710034e-05, + "loss": 0.0011, + "num_input_tokens_seen": 41218768, + "step": 71055 + }, + { + "epoch": 10.583854632112004, + "grad_norm": 9.513893127441406, + "learning_rate": 2.681482884774874e-05, + "loss": 0.0025, + "num_input_tokens_seen": 41221392, + "step": 71060 + }, + { + "epoch": 10.584599344652965, + "grad_norm": 0.004091512877494097, + "learning_rate": 2.6811587984127758e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41224144, + "step": 71065 + }, + { + "epoch": 10.585344057193923, + "grad_norm": 57.53245162963867, + "learning_rate": 2.680834708990183e-05, + "loss": 0.1626, + "num_input_tokens_seen": 41227152, + "step": 71070 + }, + { + "epoch": 10.586088769734882, + "grad_norm": 0.37858355045318604, + "learning_rate": 2.680510616512572e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41229904, + "step": 71075 + }, + { + "epoch": 10.586833482275841, + "grad_norm": 0.08722787350416183, + "learning_rate": 2.6801865209854177e-05, + "loss": 0.292, + "num_input_tokens_seen": 41233040, + "step": 71080 + }, + { + "epoch": 10.587578194816802, + "grad_norm": 0.0016010876279324293, + "learning_rate": 2.6798624224141954e-05, + "loss": 0.0006, + "num_input_tokens_seen": 41235600, + "step": 71085 + }, + { + "epoch": 10.58832290735776, + "grad_norm": 1.8478988409042358, + "learning_rate": 2.6795383208043805e-05, + "loss": 0.0008, + "num_input_tokens_seen": 41238672, + "step": 71090 + }, + { + "epoch": 10.589067619898719, + "grad_norm": 0.007755507715046406, + "learning_rate": 2.679214216161448e-05, + "loss": 0.0078, + "num_input_tokens_seen": 41241552, + "step": 71095 + }, + { + "epoch": 10.589812332439678, + "grad_norm": 99.0380859375, + "learning_rate": 2.6788901084908734e-05, + "loss": 0.0177, + "num_input_tokens_seen": 41244304, + "step": 71100 + }, + { + "epoch": 10.590557044980638, + "grad_norm": 0.07372143119573593, + "learning_rate": 2.678565997798132e-05, + "loss": 0.1583, + "num_input_tokens_seen": 41247408, + "step": 71105 + }, + { + "epoch": 10.591301757521597, + "grad_norm": 24.981115341186523, + "learning_rate": 2.6782418840886997e-05, + "loss": 0.2324, + "num_input_tokens_seen": 41250288, + "step": 71110 + }, + { + "epoch": 10.592046470062556, + "grad_norm": 0.04065077006816864, + "learning_rate": 2.6779177673680516e-05, + "loss": 0.2601, + "num_input_tokens_seen": 41253008, + "step": 71115 + }, + { + "epoch": 10.592791182603515, + "grad_norm": 0.0024341184180229902, + "learning_rate": 2.6775936476416636e-05, + "loss": 0.1169, + "num_input_tokens_seen": 41255824, + "step": 71120 + }, + { + "epoch": 10.593535895144473, + "grad_norm": 0.02909545786678791, + "learning_rate": 2.6772695249150125e-05, + "loss": 0.0053, + "num_input_tokens_seen": 41258800, + "step": 71125 + }, + { + "epoch": 10.594280607685434, + "grad_norm": 0.09193477779626846, + "learning_rate": 2.6769453991935717e-05, + "loss": 0.0015, + "num_input_tokens_seen": 41261776, + "step": 71130 + }, + { + "epoch": 10.595025320226393, + "grad_norm": 0.006310959346592426, + "learning_rate": 2.676621270482819e-05, + "loss": 0.2116, + "num_input_tokens_seen": 41264496, + "step": 71135 + }, + { + "epoch": 10.595770032767351, + "grad_norm": 0.032820235937833786, + "learning_rate": 2.6762971387882297e-05, + "loss": 0.135, + "num_input_tokens_seen": 41267472, + "step": 71140 + }, + { + "epoch": 10.59651474530831, + "grad_norm": 49.60670852661133, + "learning_rate": 2.6759730041152787e-05, + "loss": 0.376, + "num_input_tokens_seen": 41270480, + "step": 71145 + }, + { + "epoch": 10.59725945784927, + "grad_norm": 0.03518237918615341, + "learning_rate": 2.6756488664694422e-05, + "loss": 0.0011, + "num_input_tokens_seen": 41273104, + "step": 71150 + }, + { + "epoch": 10.59800417039023, + "grad_norm": 0.013945955783128738, + "learning_rate": 2.675324725856198e-05, + "loss": 0.1599, + "num_input_tokens_seen": 41276080, + "step": 71155 + }, + { + "epoch": 10.598748882931188, + "grad_norm": 100.75947570800781, + "learning_rate": 2.6750005822810197e-05, + "loss": 0.0511, + "num_input_tokens_seen": 41278672, + "step": 71160 + }, + { + "epoch": 10.599493595472147, + "grad_norm": 3.1248347759246826, + "learning_rate": 2.6746764357493848e-05, + "loss": 0.0268, + "num_input_tokens_seen": 41281328, + "step": 71165 + }, + { + "epoch": 10.600238308013108, + "grad_norm": 5.9680328369140625, + "learning_rate": 2.6743522862667687e-05, + "loss": 0.0033, + "num_input_tokens_seen": 41284048, + "step": 71170 + }, + { + "epoch": 10.600983020554066, + "grad_norm": 0.0018249114509671926, + "learning_rate": 2.6740281338386484e-05, + "loss": 0.0067, + "num_input_tokens_seen": 41286928, + "step": 71175 + }, + { + "epoch": 10.601727733095025, + "grad_norm": 0.021829878911376, + "learning_rate": 2.6737039784705e-05, + "loss": 0.0642, + "num_input_tokens_seen": 41289968, + "step": 71180 + }, + { + "epoch": 10.602472445635984, + "grad_norm": 0.00016963643429335207, + "learning_rate": 2.6733798201677985e-05, + "loss": 0.0381, + "num_input_tokens_seen": 41292656, + "step": 71185 + }, + { + "epoch": 10.603217158176944, + "grad_norm": 115.627197265625, + "learning_rate": 2.6730556589360216e-05, + "loss": 0.0583, + "num_input_tokens_seen": 41295568, + "step": 71190 + }, + { + "epoch": 10.603961870717903, + "grad_norm": 90.46863555908203, + "learning_rate": 2.672731494780645e-05, + "loss": 0.0658, + "num_input_tokens_seen": 41298352, + "step": 71195 + }, + { + "epoch": 10.604706583258862, + "grad_norm": 0.004102998413145542, + "learning_rate": 2.672407327707146e-05, + "loss": 0.0007, + "num_input_tokens_seen": 41301040, + "step": 71200 + }, + { + "epoch": 10.60545129579982, + "grad_norm": 0.001457112724892795, + "learning_rate": 2.6720831577209997e-05, + "loss": 0.1876, + "num_input_tokens_seen": 41303856, + "step": 71205 + }, + { + "epoch": 10.606196008340781, + "grad_norm": 0.00019695521041285247, + "learning_rate": 2.6717589848276835e-05, + "loss": 0.0558, + "num_input_tokens_seen": 41306672, + "step": 71210 + }, + { + "epoch": 10.60694072088174, + "grad_norm": 2.2346041202545166, + "learning_rate": 2.671434809032674e-05, + "loss": 0.1762, + "num_input_tokens_seen": 41309552, + "step": 71215 + }, + { + "epoch": 10.607685433422699, + "grad_norm": 0.06026512756943703, + "learning_rate": 2.6711106303414478e-05, + "loss": 0.1048, + "num_input_tokens_seen": 41312976, + "step": 71220 + }, + { + "epoch": 10.608430145963657, + "grad_norm": 18.621807098388672, + "learning_rate": 2.6707864487594815e-05, + "loss": 0.2408, + "num_input_tokens_seen": 41316304, + "step": 71225 + }, + { + "epoch": 10.609174858504618, + "grad_norm": 0.12802091240882874, + "learning_rate": 2.6704622642922512e-05, + "loss": 0.0007, + "num_input_tokens_seen": 41318992, + "step": 71230 + }, + { + "epoch": 10.609919571045577, + "grad_norm": 0.27497273683547974, + "learning_rate": 2.6701380769452346e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41322000, + "step": 71235 + }, + { + "epoch": 10.610664283586535, + "grad_norm": 1.536953091621399, + "learning_rate": 2.6698138867239076e-05, + "loss": 0.314, + "num_input_tokens_seen": 41325104, + "step": 71240 + }, + { + "epoch": 10.611408996127494, + "grad_norm": 0.001784415915608406, + "learning_rate": 2.6694896936337477e-05, + "loss": 0.3658, + "num_input_tokens_seen": 41327888, + "step": 71245 + }, + { + "epoch": 10.612153708668455, + "grad_norm": 0.06845390051603317, + "learning_rate": 2.6691654976802316e-05, + "loss": 0.0059, + "num_input_tokens_seen": 41330672, + "step": 71250 + }, + { + "epoch": 10.612898421209414, + "grad_norm": 0.1685042679309845, + "learning_rate": 2.6688412988688372e-05, + "loss": 0.1444, + "num_input_tokens_seen": 41333360, + "step": 71255 + }, + { + "epoch": 10.613643133750372, + "grad_norm": 0.02285400591790676, + "learning_rate": 2.6685170972050404e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41336400, + "step": 71260 + }, + { + "epoch": 10.614387846291331, + "grad_norm": 0.07172837853431702, + "learning_rate": 2.6681928926943183e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41339120, + "step": 71265 + }, + { + "epoch": 10.615132558832292, + "grad_norm": 0.013538309372961521, + "learning_rate": 2.667868685342148e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41342096, + "step": 71270 + }, + { + "epoch": 10.61587727137325, + "grad_norm": 14.74854564666748, + "learning_rate": 2.6675444751540068e-05, + "loss": 0.1195, + "num_input_tokens_seen": 41344944, + "step": 71275 + }, + { + "epoch": 10.616621983914209, + "grad_norm": 0.0262588020414114, + "learning_rate": 2.667220262135372e-05, + "loss": 0.0485, + "num_input_tokens_seen": 41347600, + "step": 71280 + }, + { + "epoch": 10.617366696455168, + "grad_norm": 0.08073362708091736, + "learning_rate": 2.666896046291721e-05, + "loss": 0.0056, + "num_input_tokens_seen": 41350640, + "step": 71285 + }, + { + "epoch": 10.618111408996128, + "grad_norm": 11.332273483276367, + "learning_rate": 2.6665718276285312e-05, + "loss": 0.1269, + "num_input_tokens_seen": 41353392, + "step": 71290 + }, + { + "epoch": 10.618856121537087, + "grad_norm": 0.0009196019964292645, + "learning_rate": 2.66624760615128e-05, + "loss": 0.0026, + "num_input_tokens_seen": 41356560, + "step": 71295 + }, + { + "epoch": 10.619600834078046, + "grad_norm": 28.046483993530273, + "learning_rate": 2.6659233818654434e-05, + "loss": 0.4171, + "num_input_tokens_seen": 41359440, + "step": 71300 + }, + { + "epoch": 10.620345546619005, + "grad_norm": 0.00278886198066175, + "learning_rate": 2.6655991547765e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41362288, + "step": 71305 + }, + { + "epoch": 10.621090259159963, + "grad_norm": 0.03389273211359978, + "learning_rate": 2.6652749248899277e-05, + "loss": 0.036, + "num_input_tokens_seen": 41365040, + "step": 71310 + }, + { + "epoch": 10.621834971700924, + "grad_norm": 16.28943634033203, + "learning_rate": 2.6649506922112033e-05, + "loss": 0.1886, + "num_input_tokens_seen": 41367824, + "step": 71315 + }, + { + "epoch": 10.622579684241883, + "grad_norm": 0.0010434784926474094, + "learning_rate": 2.6646264567458052e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41370640, + "step": 71320 + }, + { + "epoch": 10.623324396782841, + "grad_norm": 0.009114288724958897, + "learning_rate": 2.6643022184992096e-05, + "loss": 0.0567, + "num_input_tokens_seen": 41373392, + "step": 71325 + }, + { + "epoch": 10.6240691093238, + "grad_norm": 1.9963990449905396, + "learning_rate": 2.6639779774768953e-05, + "loss": 0.0008, + "num_input_tokens_seen": 41376208, + "step": 71330 + }, + { + "epoch": 10.62481382186476, + "grad_norm": 0.02522377297282219, + "learning_rate": 2.6636537336843396e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41379216, + "step": 71335 + }, + { + "epoch": 10.62555853440572, + "grad_norm": 4.858310699462891, + "learning_rate": 2.663329487127021e-05, + "loss": 0.1701, + "num_input_tokens_seen": 41381872, + "step": 71340 + }, + { + "epoch": 10.626303246946678, + "grad_norm": 0.20873227715492249, + "learning_rate": 2.663005237810416e-05, + "loss": 0.0023, + "num_input_tokens_seen": 41384688, + "step": 71345 + }, + { + "epoch": 10.627047959487637, + "grad_norm": 0.015232146717607975, + "learning_rate": 2.6626809857400033e-05, + "loss": 0.1103, + "num_input_tokens_seen": 41387824, + "step": 71350 + }, + { + "epoch": 10.627792672028598, + "grad_norm": 0.8965855240821838, + "learning_rate": 2.662356730921261e-05, + "loss": 0.2647, + "num_input_tokens_seen": 41390640, + "step": 71355 + }, + { + "epoch": 10.628537384569556, + "grad_norm": 45.39133834838867, + "learning_rate": 2.6620324733596664e-05, + "loss": 0.1164, + "num_input_tokens_seen": 41393584, + "step": 71360 + }, + { + "epoch": 10.629282097110515, + "grad_norm": 0.005783216096460819, + "learning_rate": 2.661708213060698e-05, + "loss": 0.0009, + "num_input_tokens_seen": 41396432, + "step": 71365 + }, + { + "epoch": 10.630026809651474, + "grad_norm": 0.003157905302941799, + "learning_rate": 2.661383950029834e-05, + "loss": 0.002, + "num_input_tokens_seen": 41399184, + "step": 71370 + }, + { + "epoch": 10.630771522192434, + "grad_norm": 0.00950952060520649, + "learning_rate": 2.6610596842725522e-05, + "loss": 0.2941, + "num_input_tokens_seen": 41401904, + "step": 71375 + }, + { + "epoch": 10.631516234733393, + "grad_norm": 0.07080670446157455, + "learning_rate": 2.66073541579433e-05, + "loss": 0.0069, + "num_input_tokens_seen": 41404752, + "step": 71380 + }, + { + "epoch": 10.632260947274352, + "grad_norm": 0.011503832414746284, + "learning_rate": 2.6604111446006464e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41407984, + "step": 71385 + }, + { + "epoch": 10.63300565981531, + "grad_norm": 0.9023592472076416, + "learning_rate": 2.6600868706969806e-05, + "loss": 0.0835, + "num_input_tokens_seen": 41410768, + "step": 71390 + }, + { + "epoch": 10.633750372356271, + "grad_norm": 0.020282847806811333, + "learning_rate": 2.6597625940888087e-05, + "loss": 0.0117, + "num_input_tokens_seen": 41413936, + "step": 71395 + }, + { + "epoch": 10.63449508489723, + "grad_norm": 0.006680551916360855, + "learning_rate": 2.6594383147816103e-05, + "loss": 0.0123, + "num_input_tokens_seen": 41417072, + "step": 71400 + }, + { + "epoch": 10.635239797438189, + "grad_norm": 0.004169408231973648, + "learning_rate": 2.659114032780864e-05, + "loss": 0.0208, + "num_input_tokens_seen": 41420048, + "step": 71405 + }, + { + "epoch": 10.635984509979147, + "grad_norm": 0.016003098338842392, + "learning_rate": 2.6587897480920478e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41423152, + "step": 71410 + }, + { + "epoch": 10.636729222520108, + "grad_norm": 2.7459473609924316, + "learning_rate": 2.6584654607206404e-05, + "loss": 0.1358, + "num_input_tokens_seen": 41425872, + "step": 71415 + }, + { + "epoch": 10.637473935061067, + "grad_norm": 0.020556041970849037, + "learning_rate": 2.6581411706721194e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41428688, + "step": 71420 + }, + { + "epoch": 10.638218647602026, + "grad_norm": 0.00866967998445034, + "learning_rate": 2.6578168779519652e-05, + "loss": 0.1432, + "num_input_tokens_seen": 41431472, + "step": 71425 + }, + { + "epoch": 10.638963360142984, + "grad_norm": 47.775299072265625, + "learning_rate": 2.657492582565654e-05, + "loss": 0.0888, + "num_input_tokens_seen": 41434352, + "step": 71430 + }, + { + "epoch": 10.639708072683945, + "grad_norm": 0.020422732457518578, + "learning_rate": 2.6571682845186662e-05, + "loss": 0.1135, + "num_input_tokens_seen": 41437328, + "step": 71435 + }, + { + "epoch": 10.640452785224904, + "grad_norm": 0.07181219011545181, + "learning_rate": 2.6568439838164798e-05, + "loss": 0.0646, + "num_input_tokens_seen": 41440368, + "step": 71440 + }, + { + "epoch": 10.641197497765862, + "grad_norm": 0.0011160237481817603, + "learning_rate": 2.6565196804645738e-05, + "loss": 0.3329, + "num_input_tokens_seen": 41443312, + "step": 71445 + }, + { + "epoch": 10.641942210306821, + "grad_norm": 0.45661965012550354, + "learning_rate": 2.6561953744684264e-05, + "loss": 0.0004, + "num_input_tokens_seen": 41446032, + "step": 71450 + }, + { + "epoch": 10.64268692284778, + "grad_norm": 30.415508270263672, + "learning_rate": 2.655871065833518e-05, + "loss": 0.2871, + "num_input_tokens_seen": 41448944, + "step": 71455 + }, + { + "epoch": 10.64343163538874, + "grad_norm": 0.024206750094890594, + "learning_rate": 2.655546754565326e-05, + "loss": 0.1326, + "num_input_tokens_seen": 41451824, + "step": 71460 + }, + { + "epoch": 10.6441763479297, + "grad_norm": 0.02152038924396038, + "learning_rate": 2.6552224406693293e-05, + "loss": 0.0008, + "num_input_tokens_seen": 41454768, + "step": 71465 + }, + { + "epoch": 10.644921060470658, + "grad_norm": 12.054374694824219, + "learning_rate": 2.6548981241510073e-05, + "loss": 0.3113, + "num_input_tokens_seen": 41457552, + "step": 71470 + }, + { + "epoch": 10.645665773011618, + "grad_norm": 36.4455451965332, + "learning_rate": 2.654573805015839e-05, + "loss": 0.1624, + "num_input_tokens_seen": 41460368, + "step": 71475 + }, + { + "epoch": 10.646410485552577, + "grad_norm": 6.2793660163879395, + "learning_rate": 2.654249483269303e-05, + "loss": 0.0294, + "num_input_tokens_seen": 41463248, + "step": 71480 + }, + { + "epoch": 10.647155198093536, + "grad_norm": 0.005758436396718025, + "learning_rate": 2.65392515891688e-05, + "loss": 0.0004, + "num_input_tokens_seen": 41466096, + "step": 71485 + }, + { + "epoch": 10.647899910634495, + "grad_norm": 3.4750983715057373, + "learning_rate": 2.6536008319640466e-05, + "loss": 0.3869, + "num_input_tokens_seen": 41469456, + "step": 71490 + }, + { + "epoch": 10.648644623175453, + "grad_norm": 0.08812308311462402, + "learning_rate": 2.6532765024162837e-05, + "loss": 0.0627, + "num_input_tokens_seen": 41472176, + "step": 71495 + }, + { + "epoch": 10.649389335716414, + "grad_norm": 0.012157969176769257, + "learning_rate": 2.6529521702790705e-05, + "loss": 0.09, + "num_input_tokens_seen": 41475152, + "step": 71500 + }, + { + "epoch": 10.650134048257373, + "grad_norm": 0.03232705965638161, + "learning_rate": 2.6526278355578848e-05, + "loss": 0.0791, + "num_input_tokens_seen": 41478032, + "step": 71505 + }, + { + "epoch": 10.650878760798332, + "grad_norm": 90.78668212890625, + "learning_rate": 2.6523034982582078e-05, + "loss": 0.0204, + "num_input_tokens_seen": 41481392, + "step": 71510 + }, + { + "epoch": 10.65162347333929, + "grad_norm": 0.11325258016586304, + "learning_rate": 2.6519791583855174e-05, + "loss": 0.1633, + "num_input_tokens_seen": 41484304, + "step": 71515 + }, + { + "epoch": 10.65236818588025, + "grad_norm": 0.06837808340787888, + "learning_rate": 2.6516548159452943e-05, + "loss": 0.0006, + "num_input_tokens_seen": 41487440, + "step": 71520 + }, + { + "epoch": 10.65311289842121, + "grad_norm": 0.01911776512861252, + "learning_rate": 2.651330470943017e-05, + "loss": 0.151, + "num_input_tokens_seen": 41490544, + "step": 71525 + }, + { + "epoch": 10.653857610962168, + "grad_norm": 0.16598466038703918, + "learning_rate": 2.651006123384165e-05, + "loss": 0.0056, + "num_input_tokens_seen": 41493584, + "step": 71530 + }, + { + "epoch": 10.654602323503127, + "grad_norm": 0.013737834990024567, + "learning_rate": 2.6506817732742173e-05, + "loss": 0.0051, + "num_input_tokens_seen": 41496432, + "step": 71535 + }, + { + "epoch": 10.655347036044088, + "grad_norm": 0.046485282480716705, + "learning_rate": 2.6503574206186553e-05, + "loss": 0.0287, + "num_input_tokens_seen": 41499184, + "step": 71540 + }, + { + "epoch": 10.656091748585046, + "grad_norm": 0.12807343900203705, + "learning_rate": 2.6500330654229573e-05, + "loss": 0.0398, + "num_input_tokens_seen": 41502000, + "step": 71545 + }, + { + "epoch": 10.656836461126005, + "grad_norm": 0.011882208287715912, + "learning_rate": 2.649708707692603e-05, + "loss": 0.0028, + "num_input_tokens_seen": 41504752, + "step": 71550 + }, + { + "epoch": 10.657581173666964, + "grad_norm": 0.02268213778734207, + "learning_rate": 2.6493843474330727e-05, + "loss": 0.0009, + "num_input_tokens_seen": 41507664, + "step": 71555 + }, + { + "epoch": 10.658325886207924, + "grad_norm": 40.13235855102539, + "learning_rate": 2.649059984649845e-05, + "loss": 0.0104, + "num_input_tokens_seen": 41510288, + "step": 71560 + }, + { + "epoch": 10.659070598748883, + "grad_norm": 0.057389456778764725, + "learning_rate": 2.6487356193484002e-05, + "loss": 0.0275, + "num_input_tokens_seen": 41513456, + "step": 71565 + }, + { + "epoch": 10.659815311289842, + "grad_norm": 0.001778869889676571, + "learning_rate": 2.6484112515342186e-05, + "loss": 0.2042, + "num_input_tokens_seen": 41516464, + "step": 71570 + }, + { + "epoch": 10.6605600238308, + "grad_norm": 0.018735891208052635, + "learning_rate": 2.6480868812127795e-05, + "loss": 0.1527, + "num_input_tokens_seen": 41519568, + "step": 71575 + }, + { + "epoch": 10.661304736371761, + "grad_norm": 155.58241271972656, + "learning_rate": 2.6477625083895636e-05, + "loss": 0.0286, + "num_input_tokens_seen": 41522192, + "step": 71580 + }, + { + "epoch": 10.66204944891272, + "grad_norm": 0.007377150934189558, + "learning_rate": 2.6474381330700497e-05, + "loss": 0.1722, + "num_input_tokens_seen": 41524880, + "step": 71585 + }, + { + "epoch": 10.662794161453679, + "grad_norm": 0.13645653426647186, + "learning_rate": 2.6471137552597193e-05, + "loss": 0.0011, + "num_input_tokens_seen": 41527664, + "step": 71590 + }, + { + "epoch": 10.663538873994638, + "grad_norm": 0.010057404637336731, + "learning_rate": 2.646789374964051e-05, + "loss": 0.0779, + "num_input_tokens_seen": 41530704, + "step": 71595 + }, + { + "epoch": 10.664283586535598, + "grad_norm": 0.0011996027315035462, + "learning_rate": 2.6464649921885247e-05, + "loss": 0.2213, + "num_input_tokens_seen": 41533520, + "step": 71600 + }, + { + "epoch": 10.665028299076557, + "grad_norm": 0.0022300579585134983, + "learning_rate": 2.646140606938622e-05, + "loss": 0.2704, + "num_input_tokens_seen": 41536880, + "step": 71605 + }, + { + "epoch": 10.665773011617516, + "grad_norm": 0.059444013983011246, + "learning_rate": 2.6458162192198223e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41539856, + "step": 71610 + }, + { + "epoch": 10.666517724158474, + "grad_norm": 0.16554567217826843, + "learning_rate": 2.6454918290376053e-05, + "loss": 0.0006, + "num_input_tokens_seen": 41542928, + "step": 71615 + }, + { + "epoch": 10.667262436699435, + "grad_norm": 0.02611895091831684, + "learning_rate": 2.645167436397452e-05, + "loss": 0.0034, + "num_input_tokens_seen": 41546224, + "step": 71620 + }, + { + "epoch": 10.668007149240394, + "grad_norm": 0.00421491451561451, + "learning_rate": 2.644843041304843e-05, + "loss": 0.1658, + "num_input_tokens_seen": 41548880, + "step": 71625 + }, + { + "epoch": 10.668751861781352, + "grad_norm": 0.0029760911129415035, + "learning_rate": 2.6445186437652577e-05, + "loss": 0.0407, + "num_input_tokens_seen": 41551856, + "step": 71630 + }, + { + "epoch": 10.669496574322311, + "grad_norm": 0.0025859863962978125, + "learning_rate": 2.644194243784176e-05, + "loss": 0.0006, + "num_input_tokens_seen": 41555216, + "step": 71635 + }, + { + "epoch": 10.67024128686327, + "grad_norm": 0.0015665809623897076, + "learning_rate": 2.64386984136708e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41558384, + "step": 71640 + }, + { + "epoch": 10.67098599940423, + "grad_norm": 9.58238697052002, + "learning_rate": 2.6435454365194483e-05, + "loss": 0.0129, + "num_input_tokens_seen": 41561232, + "step": 71645 + }, + { + "epoch": 10.67173071194519, + "grad_norm": 0.6347225904464722, + "learning_rate": 2.6432210292467634e-05, + "loss": 0.0004, + "num_input_tokens_seen": 41564016, + "step": 71650 + }, + { + "epoch": 10.672475424486148, + "grad_norm": 1.656065583229065, + "learning_rate": 2.642896619554504e-05, + "loss": 0.0008, + "num_input_tokens_seen": 41566832, + "step": 71655 + }, + { + "epoch": 10.673220137027108, + "grad_norm": 0.014572618529200554, + "learning_rate": 2.6425722074481525e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41569616, + "step": 71660 + }, + { + "epoch": 10.673964849568067, + "grad_norm": 0.003883359720930457, + "learning_rate": 2.6422477929331875e-05, + "loss": 0.1191, + "num_input_tokens_seen": 41572240, + "step": 71665 + }, + { + "epoch": 10.674709562109026, + "grad_norm": 0.004919151775538921, + "learning_rate": 2.6419233760150907e-05, + "loss": 0.0796, + "num_input_tokens_seen": 41575216, + "step": 71670 + }, + { + "epoch": 10.675454274649985, + "grad_norm": 0.011117439717054367, + "learning_rate": 2.6415989566993425e-05, + "loss": 0.1115, + "num_input_tokens_seen": 41578032, + "step": 71675 + }, + { + "epoch": 10.676198987190944, + "grad_norm": 0.0131113575771451, + "learning_rate": 2.6412745349914242e-05, + "loss": 0.0632, + "num_input_tokens_seen": 41580944, + "step": 71680 + }, + { + "epoch": 10.676943699731904, + "grad_norm": 191.86131286621094, + "learning_rate": 2.6409501108968164e-05, + "loss": 0.1122, + "num_input_tokens_seen": 41583952, + "step": 71685 + }, + { + "epoch": 10.677688412272863, + "grad_norm": 0.0029378507751971483, + "learning_rate": 2.6406256844209998e-05, + "loss": 0.1259, + "num_input_tokens_seen": 41586608, + "step": 71690 + }, + { + "epoch": 10.678433124813822, + "grad_norm": 0.004915943369269371, + "learning_rate": 2.6403012555694546e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41589520, + "step": 71695 + }, + { + "epoch": 10.67917783735478, + "grad_norm": 0.004934869706630707, + "learning_rate": 2.639976824347663e-05, + "loss": 0.0008, + "num_input_tokens_seen": 41592464, + "step": 71700 + }, + { + "epoch": 10.67992254989574, + "grad_norm": 0.03716779872775078, + "learning_rate": 2.639652390761105e-05, + "loss": 0.0186, + "num_input_tokens_seen": 41595376, + "step": 71705 + }, + { + "epoch": 10.6806672624367, + "grad_norm": 0.0015014654491096735, + "learning_rate": 2.639327954815261e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41598256, + "step": 71710 + }, + { + "epoch": 10.681411974977658, + "grad_norm": 0.1885976344347, + "learning_rate": 2.6390035165156136e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41601072, + "step": 71715 + }, + { + "epoch": 10.682156687518617, + "grad_norm": 0.05726464092731476, + "learning_rate": 2.6386790758676432e-05, + "loss": 0.2483, + "num_input_tokens_seen": 41603920, + "step": 71720 + }, + { + "epoch": 10.682901400059578, + "grad_norm": 0.001588217681273818, + "learning_rate": 2.6383546328768305e-05, + "loss": 0.0785, + "num_input_tokens_seen": 41607120, + "step": 71725 + }, + { + "epoch": 10.683646112600536, + "grad_norm": 0.14278627932071686, + "learning_rate": 2.6380301875486568e-05, + "loss": 0.0261, + "num_input_tokens_seen": 41609968, + "step": 71730 + }, + { + "epoch": 10.684390825141495, + "grad_norm": 0.007022663485258818, + "learning_rate": 2.637705739888603e-05, + "loss": 0.0044, + "num_input_tokens_seen": 41612944, + "step": 71735 + }, + { + "epoch": 10.685135537682454, + "grad_norm": 0.003913493361324072, + "learning_rate": 2.6373812899021516e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41615728, + "step": 71740 + }, + { + "epoch": 10.685880250223414, + "grad_norm": 1.5063711404800415, + "learning_rate": 2.6370568375947825e-05, + "loss": 0.0375, + "num_input_tokens_seen": 41618896, + "step": 71745 + }, + { + "epoch": 10.686624962764373, + "grad_norm": 0.2666791081428528, + "learning_rate": 2.636732382971977e-05, + "loss": 0.0005, + "num_input_tokens_seen": 41621872, + "step": 71750 + }, + { + "epoch": 10.687369675305332, + "grad_norm": 0.004372549708932638, + "learning_rate": 2.6364079260392178e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41624880, + "step": 71755 + }, + { + "epoch": 10.68811438784629, + "grad_norm": 0.003976015839725733, + "learning_rate": 2.6360834668019845e-05, + "loss": 0.2096, + "num_input_tokens_seen": 41627632, + "step": 71760 + }, + { + "epoch": 10.688859100387251, + "grad_norm": 0.13187725841999054, + "learning_rate": 2.6357590052657595e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41630672, + "step": 71765 + }, + { + "epoch": 10.68960381292821, + "grad_norm": 0.16167189180850983, + "learning_rate": 2.6354345414360236e-05, + "loss": 0.0005, + "num_input_tokens_seen": 41633584, + "step": 71770 + }, + { + "epoch": 10.690348525469169, + "grad_norm": 0.016817154362797737, + "learning_rate": 2.6351100753182594e-05, + "loss": 0.0053, + "num_input_tokens_seen": 41636272, + "step": 71775 + }, + { + "epoch": 10.691093238010128, + "grad_norm": 0.001007315469905734, + "learning_rate": 2.6347856069179483e-05, + "loss": 0.2698, + "num_input_tokens_seen": 41638992, + "step": 71780 + }, + { + "epoch": 10.691837950551088, + "grad_norm": 0.022698359563946724, + "learning_rate": 2.6344611362405708e-05, + "loss": 0.0602, + "num_input_tokens_seen": 41642000, + "step": 71785 + }, + { + "epoch": 10.692582663092047, + "grad_norm": 0.05529305711388588, + "learning_rate": 2.634136663291609e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41645040, + "step": 71790 + }, + { + "epoch": 10.693327375633006, + "grad_norm": 66.7984390258789, + "learning_rate": 2.6338121880765447e-05, + "loss": 0.1349, + "num_input_tokens_seen": 41647856, + "step": 71795 + }, + { + "epoch": 10.694072088173964, + "grad_norm": 0.00352780451066792, + "learning_rate": 2.6334877106008594e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41650736, + "step": 71800 + }, + { + "epoch": 10.694816800714925, + "grad_norm": 20.384368896484375, + "learning_rate": 2.633163230870035e-05, + "loss": 0.051, + "num_input_tokens_seen": 41653584, + "step": 71805 + }, + { + "epoch": 10.695561513255884, + "grad_norm": 0.006420549005270004, + "learning_rate": 2.632838748889553e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41656368, + "step": 71810 + }, + { + "epoch": 10.696306225796842, + "grad_norm": 0.01762533001601696, + "learning_rate": 2.6325142646648958e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41659216, + "step": 71815 + }, + { + "epoch": 10.697050938337801, + "grad_norm": 0.04517560452222824, + "learning_rate": 2.632189778201544e-05, + "loss": 0.2095, + "num_input_tokens_seen": 41662288, + "step": 71820 + }, + { + "epoch": 10.69779565087876, + "grad_norm": 1.6171045303344727, + "learning_rate": 2.631865289504981e-05, + "loss": 0.1924, + "num_input_tokens_seen": 41665104, + "step": 71825 + }, + { + "epoch": 10.69854036341972, + "grad_norm": 0.0004553699982352555, + "learning_rate": 2.631540798580688e-05, + "loss": 0.0515, + "num_input_tokens_seen": 41667856, + "step": 71830 + }, + { + "epoch": 10.69928507596068, + "grad_norm": 88.13021087646484, + "learning_rate": 2.6312163054341464e-05, + "loss": 0.1122, + "num_input_tokens_seen": 41670704, + "step": 71835 + }, + { + "epoch": 10.700029788501638, + "grad_norm": 0.00017447520804125816, + "learning_rate": 2.6308918100708386e-05, + "loss": 0.1532, + "num_input_tokens_seen": 41673424, + "step": 71840 + }, + { + "epoch": 10.700774501042597, + "grad_norm": 0.0021653841249644756, + "learning_rate": 2.6305673124962466e-05, + "loss": 0.0345, + "num_input_tokens_seen": 41676400, + "step": 71845 + }, + { + "epoch": 10.701519213583557, + "grad_norm": 0.006949133239686489, + "learning_rate": 2.6302428127158535e-05, + "loss": 0.0, + "num_input_tokens_seen": 41679088, + "step": 71850 + }, + { + "epoch": 10.702263926124516, + "grad_norm": 12.98704719543457, + "learning_rate": 2.6299183107351395e-05, + "loss": 0.2229, + "num_input_tokens_seen": 41682096, + "step": 71855 + }, + { + "epoch": 10.703008638665475, + "grad_norm": 0.0018913747044280171, + "learning_rate": 2.629593806559589e-05, + "loss": 0.0023, + "num_input_tokens_seen": 41684976, + "step": 71860 + }, + { + "epoch": 10.703753351206434, + "grad_norm": 0.11396406590938568, + "learning_rate": 2.629269300194681e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41687856, + "step": 71865 + }, + { + "epoch": 10.704498063747394, + "grad_norm": 0.0005993142258375883, + "learning_rate": 2.6289447916459005e-05, + "loss": 0.0004, + "num_input_tokens_seen": 41690864, + "step": 71870 + }, + { + "epoch": 10.705242776288353, + "grad_norm": 0.0034221108071506023, + "learning_rate": 2.628620280918729e-05, + "loss": 0.2349, + "num_input_tokens_seen": 41693616, + "step": 71875 + }, + { + "epoch": 10.705987488829312, + "grad_norm": 0.039836421608924866, + "learning_rate": 2.6282957680186476e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41696624, + "step": 71880 + }, + { + "epoch": 10.70673220137027, + "grad_norm": 23.734941482543945, + "learning_rate": 2.6279712529511406e-05, + "loss": 0.1813, + "num_input_tokens_seen": 41699696, + "step": 71885 + }, + { + "epoch": 10.707476913911231, + "grad_norm": 0.005938305053859949, + "learning_rate": 2.6276467357216895e-05, + "loss": 0.0009, + "num_input_tokens_seen": 41702800, + "step": 71890 + }, + { + "epoch": 10.70822162645219, + "grad_norm": 0.10097389668226242, + "learning_rate": 2.627322216335776e-05, + "loss": 0.2065, + "num_input_tokens_seen": 41705392, + "step": 71895 + }, + { + "epoch": 10.708966338993148, + "grad_norm": 4.317886829376221, + "learning_rate": 2.6269976947988834e-05, + "loss": 0.0161, + "num_input_tokens_seen": 41708464, + "step": 71900 + }, + { + "epoch": 10.709711051534107, + "grad_norm": 0.0056167468428611755, + "learning_rate": 2.626673171116493e-05, + "loss": 0.0871, + "num_input_tokens_seen": 41711664, + "step": 71905 + }, + { + "epoch": 10.710455764075068, + "grad_norm": 1.9400005340576172, + "learning_rate": 2.626348645294089e-05, + "loss": 0.0014, + "num_input_tokens_seen": 41714448, + "step": 71910 + }, + { + "epoch": 10.711200476616026, + "grad_norm": 0.0002221674076281488, + "learning_rate": 2.6260241173371525e-05, + "loss": 0.1718, + "num_input_tokens_seen": 41717328, + "step": 71915 + }, + { + "epoch": 10.711945189156985, + "grad_norm": 0.005585573147982359, + "learning_rate": 2.625699587251167e-05, + "loss": 0.018, + "num_input_tokens_seen": 41720144, + "step": 71920 + }, + { + "epoch": 10.712689901697944, + "grad_norm": 104.63296508789062, + "learning_rate": 2.6253750550416144e-05, + "loss": 0.1675, + "num_input_tokens_seen": 41723120, + "step": 71925 + }, + { + "epoch": 10.713434614238905, + "grad_norm": 0.00762786902487278, + "learning_rate": 2.6250505207139782e-05, + "loss": 0.0049, + "num_input_tokens_seen": 41725680, + "step": 71930 + }, + { + "epoch": 10.714179326779863, + "grad_norm": 0.005151340272277594, + "learning_rate": 2.62472598427374e-05, + "loss": 0.3173, + "num_input_tokens_seen": 41728464, + "step": 71935 + }, + { + "epoch": 10.714924039320822, + "grad_norm": 0.045832615345716476, + "learning_rate": 2.624401445726383e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41731280, + "step": 71940 + }, + { + "epoch": 10.71566875186178, + "grad_norm": 0.0021406179293990135, + "learning_rate": 2.6240769050773906e-05, + "loss": 0.113, + "num_input_tokens_seen": 41734320, + "step": 71945 + }, + { + "epoch": 10.716413464402741, + "grad_norm": 61.391571044921875, + "learning_rate": 2.6237523623322446e-05, + "loss": 0.1433, + "num_input_tokens_seen": 41737424, + "step": 71950 + }, + { + "epoch": 10.7171581769437, + "grad_norm": 69.27832794189453, + "learning_rate": 2.6234278174964288e-05, + "loss": 0.0191, + "num_input_tokens_seen": 41740368, + "step": 71955 + }, + { + "epoch": 10.717902889484659, + "grad_norm": 0.23120926320552826, + "learning_rate": 2.6231032705754243e-05, + "loss": 0.0374, + "num_input_tokens_seen": 41743120, + "step": 71960 + }, + { + "epoch": 10.718647602025618, + "grad_norm": 0.00736359553411603, + "learning_rate": 2.6227787215747156e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41745776, + "step": 71965 + }, + { + "epoch": 10.719392314566576, + "grad_norm": 106.14434814453125, + "learning_rate": 2.6224541704997856e-05, + "loss": 0.2688, + "num_input_tokens_seen": 41748944, + "step": 71970 + }, + { + "epoch": 10.720137027107537, + "grad_norm": 0.0011236629216000438, + "learning_rate": 2.6221296173561166e-05, + "loss": 0.0011, + "num_input_tokens_seen": 41751984, + "step": 71975 + }, + { + "epoch": 10.720881739648496, + "grad_norm": 0.07749531418085098, + "learning_rate": 2.6218050621491925e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41754704, + "step": 71980 + }, + { + "epoch": 10.721626452189454, + "grad_norm": 0.6300771832466125, + "learning_rate": 2.6214805048844947e-05, + "loss": 0.3002, + "num_input_tokens_seen": 41757808, + "step": 71985 + }, + { + "epoch": 10.722371164730415, + "grad_norm": 0.053577978163957596, + "learning_rate": 2.621155945567508e-05, + "loss": 0.0061, + "num_input_tokens_seen": 41760816, + "step": 71990 + }, + { + "epoch": 10.723115877271374, + "grad_norm": 0.002101041143760085, + "learning_rate": 2.620831384203714e-05, + "loss": 0.2363, + "num_input_tokens_seen": 41763472, + "step": 71995 + }, + { + "epoch": 10.723860589812332, + "grad_norm": 0.0008337919134646654, + "learning_rate": 2.6205068207985965e-05, + "loss": 0.1438, + "num_input_tokens_seen": 41766416, + "step": 72000 + }, + { + "epoch": 10.724605302353291, + "grad_norm": 0.004310390446335077, + "learning_rate": 2.6201822553576394e-05, + "loss": 0.2158, + "num_input_tokens_seen": 41769424, + "step": 72005 + }, + { + "epoch": 10.72535001489425, + "grad_norm": 0.529360830783844, + "learning_rate": 2.619857687886325e-05, + "loss": 0.0168, + "num_input_tokens_seen": 41772272, + "step": 72010 + }, + { + "epoch": 10.72609472743521, + "grad_norm": 0.0010352276731282473, + "learning_rate": 2.6195331183901374e-05, + "loss": 0.3289, + "num_input_tokens_seen": 41776240, + "step": 72015 + }, + { + "epoch": 10.72683943997617, + "grad_norm": 51.176361083984375, + "learning_rate": 2.6192085468745585e-05, + "loss": 0.2922, + "num_input_tokens_seen": 41779504, + "step": 72020 + }, + { + "epoch": 10.727584152517128, + "grad_norm": 0.11004702746868134, + "learning_rate": 2.6188839733450727e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41782512, + "step": 72025 + }, + { + "epoch": 10.728328865058087, + "grad_norm": 51.521331787109375, + "learning_rate": 2.6185593978071627e-05, + "loss": 0.1223, + "num_input_tokens_seen": 41785488, + "step": 72030 + }, + { + "epoch": 10.729073577599047, + "grad_norm": 0.005392061546444893, + "learning_rate": 2.6182348202663122e-05, + "loss": 0.1851, + "num_input_tokens_seen": 41788272, + "step": 72035 + }, + { + "epoch": 10.729818290140006, + "grad_norm": 0.0035731082316488028, + "learning_rate": 2.617910240728004e-05, + "loss": 0.0685, + "num_input_tokens_seen": 41791184, + "step": 72040 + }, + { + "epoch": 10.730563002680965, + "grad_norm": 135.67294311523438, + "learning_rate": 2.6175856591977226e-05, + "loss": 0.153, + "num_input_tokens_seen": 41793904, + "step": 72045 + }, + { + "epoch": 10.731307715221924, + "grad_norm": 0.0480228066444397, + "learning_rate": 2.6172610756809517e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41796592, + "step": 72050 + }, + { + "epoch": 10.732052427762884, + "grad_norm": 0.16745951771736145, + "learning_rate": 2.6169364901831732e-05, + "loss": 0.0256, + "num_input_tokens_seen": 41799536, + "step": 72055 + }, + { + "epoch": 10.732797140303843, + "grad_norm": 0.026995087042450905, + "learning_rate": 2.6166119027098724e-05, + "loss": 0.2539, + "num_input_tokens_seen": 41802288, + "step": 72060 + }, + { + "epoch": 10.733541852844802, + "grad_norm": 0.01090320385992527, + "learning_rate": 2.6162873132665315e-05, + "loss": 0.3194, + "num_input_tokens_seen": 41804976, + "step": 72065 + }, + { + "epoch": 10.73428656538576, + "grad_norm": 0.004339896142482758, + "learning_rate": 2.6159627218586345e-05, + "loss": 0.0008, + "num_input_tokens_seen": 41808144, + "step": 72070 + }, + { + "epoch": 10.735031277926721, + "grad_norm": 0.015202418901026249, + "learning_rate": 2.6156381284916653e-05, + "loss": 0.0249, + "num_input_tokens_seen": 41811088, + "step": 72075 + }, + { + "epoch": 10.73577599046768, + "grad_norm": 0.13415732979774475, + "learning_rate": 2.615313533171107e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41813776, + "step": 72080 + }, + { + "epoch": 10.736520703008638, + "grad_norm": 0.1690273880958557, + "learning_rate": 2.6149889359024447e-05, + "loss": 0.2441, + "num_input_tokens_seen": 41816592, + "step": 72085 + }, + { + "epoch": 10.737265415549597, + "grad_norm": 0.004788754507899284, + "learning_rate": 2.6146643366911612e-05, + "loss": 0.1134, + "num_input_tokens_seen": 41819824, + "step": 72090 + }, + { + "epoch": 10.738010128090558, + "grad_norm": 4.49160099029541, + "learning_rate": 2.614339735542739e-05, + "loss": 0.0487, + "num_input_tokens_seen": 41822704, + "step": 72095 + }, + { + "epoch": 10.738754840631517, + "grad_norm": 0.005957779474556446, + "learning_rate": 2.6140151324626644e-05, + "loss": 0.0425, + "num_input_tokens_seen": 41825328, + "step": 72100 + }, + { + "epoch": 10.739499553172475, + "grad_norm": 0.009102130308747292, + "learning_rate": 2.61369052745642e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41828240, + "step": 72105 + }, + { + "epoch": 10.740244265713434, + "grad_norm": 0.03406320512294769, + "learning_rate": 2.6133659205294892e-05, + "loss": 0.1925, + "num_input_tokens_seen": 41830928, + "step": 72110 + }, + { + "epoch": 10.740988978254395, + "grad_norm": 0.08990755677223206, + "learning_rate": 2.6130413116873557e-05, + "loss": 0.2438, + "num_input_tokens_seen": 41834256, + "step": 72115 + }, + { + "epoch": 10.741733690795353, + "grad_norm": 4.236364841461182, + "learning_rate": 2.6127167009355058e-05, + "loss": 0.0411, + "num_input_tokens_seen": 41837040, + "step": 72120 + }, + { + "epoch": 10.742478403336312, + "grad_norm": 24.816131591796875, + "learning_rate": 2.6123920882794208e-05, + "loss": 0.0936, + "num_input_tokens_seen": 41839984, + "step": 72125 + }, + { + "epoch": 10.74322311587727, + "grad_norm": 46.601741790771484, + "learning_rate": 2.6120674737245854e-05, + "loss": 0.0491, + "num_input_tokens_seen": 41843024, + "step": 72130 + }, + { + "epoch": 10.743967828418231, + "grad_norm": 0.033524416387081146, + "learning_rate": 2.611742857276484e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41846064, + "step": 72135 + }, + { + "epoch": 10.74471254095919, + "grad_norm": 0.03650490194559097, + "learning_rate": 2.6114182389406012e-05, + "loss": 0.1046, + "num_input_tokens_seen": 41848848, + "step": 72140 + }, + { + "epoch": 10.745457253500149, + "grad_norm": 12.708672523498535, + "learning_rate": 2.6110936187224205e-05, + "loss": 0.1282, + "num_input_tokens_seen": 41851952, + "step": 72145 + }, + { + "epoch": 10.746201966041108, + "grad_norm": 5.490777969360352, + "learning_rate": 2.610768996627426e-05, + "loss": 0.1208, + "num_input_tokens_seen": 41854800, + "step": 72150 + }, + { + "epoch": 10.746946678582066, + "grad_norm": 0.16524304449558258, + "learning_rate": 2.6104443726611016e-05, + "loss": 0.0004, + "num_input_tokens_seen": 41857680, + "step": 72155 + }, + { + "epoch": 10.747691391123027, + "grad_norm": 0.0368703156709671, + "learning_rate": 2.610119746828932e-05, + "loss": 0.0005, + "num_input_tokens_seen": 41860720, + "step": 72160 + }, + { + "epoch": 10.748436103663986, + "grad_norm": 0.021230801939964294, + "learning_rate": 2.6097951191364007e-05, + "loss": 0.0004, + "num_input_tokens_seen": 41863696, + "step": 72165 + }, + { + "epoch": 10.749180816204944, + "grad_norm": 0.010860278271138668, + "learning_rate": 2.6094704895889927e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41866480, + "step": 72170 + }, + { + "epoch": 10.749925528745905, + "grad_norm": 0.021545741707086563, + "learning_rate": 2.609145858192192e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41869232, + "step": 72175 + }, + { + "epoch": 10.750670241286864, + "grad_norm": 2.5826199054718018, + "learning_rate": 2.608821224951483e-05, + "loss": 0.0583, + "num_input_tokens_seen": 41872208, + "step": 72180 + }, + { + "epoch": 10.751414953827823, + "grad_norm": 0.02626684494316578, + "learning_rate": 2.608496589872351e-05, + "loss": 0.0042, + "num_input_tokens_seen": 41875184, + "step": 72185 + }, + { + "epoch": 10.752159666368781, + "grad_norm": 0.01942257210612297, + "learning_rate": 2.6081719529602776e-05, + "loss": 0.2973, + "num_input_tokens_seen": 41878000, + "step": 72190 + }, + { + "epoch": 10.75290437890974, + "grad_norm": 0.006546149030327797, + "learning_rate": 2.6078473142207498e-05, + "loss": 0.1062, + "num_input_tokens_seen": 41881008, + "step": 72195 + }, + { + "epoch": 10.7536490914507, + "grad_norm": 0.05269601568579674, + "learning_rate": 2.607522673659251e-05, + "loss": 0.1724, + "num_input_tokens_seen": 41883696, + "step": 72200 + }, + { + "epoch": 10.75439380399166, + "grad_norm": 0.05480393022298813, + "learning_rate": 2.6071980312812665e-05, + "loss": 0.0163, + "num_input_tokens_seen": 41886928, + "step": 72205 + }, + { + "epoch": 10.755138516532618, + "grad_norm": 0.020440328866243362, + "learning_rate": 2.6068733870922797e-05, + "loss": 0.0023, + "num_input_tokens_seen": 41889776, + "step": 72210 + }, + { + "epoch": 10.755883229073577, + "grad_norm": 247.8358917236328, + "learning_rate": 2.606548741097776e-05, + "loss": 0.3018, + "num_input_tokens_seen": 41892528, + "step": 72215 + }, + { + "epoch": 10.756627941614537, + "grad_norm": 0.010171618312597275, + "learning_rate": 2.6062240933032394e-05, + "loss": 0.0002, + "num_input_tokens_seen": 41895504, + "step": 72220 + }, + { + "epoch": 10.757372654155496, + "grad_norm": 0.02769051305949688, + "learning_rate": 2.6058994437141554e-05, + "loss": 0.1289, + "num_input_tokens_seen": 41898256, + "step": 72225 + }, + { + "epoch": 10.758117366696455, + "grad_norm": 0.005235868971794844, + "learning_rate": 2.605574792336007e-05, + "loss": 0.0167, + "num_input_tokens_seen": 41901328, + "step": 72230 + }, + { + "epoch": 10.758862079237414, + "grad_norm": 0.005634348373860121, + "learning_rate": 2.6052501391742802e-05, + "loss": 0.1408, + "num_input_tokens_seen": 41904112, + "step": 72235 + }, + { + "epoch": 10.759606791778374, + "grad_norm": 0.0025737888645380735, + "learning_rate": 2.604925484234459e-05, + "loss": 0.0439, + "num_input_tokens_seen": 41907120, + "step": 72240 + }, + { + "epoch": 10.760351504319333, + "grad_norm": 0.009261345490813255, + "learning_rate": 2.6046008275220286e-05, + "loss": 0.2111, + "num_input_tokens_seen": 41909872, + "step": 72245 + }, + { + "epoch": 10.761096216860292, + "grad_norm": 11.109874725341797, + "learning_rate": 2.604276169042473e-05, + "loss": 0.1811, + "num_input_tokens_seen": 41912976, + "step": 72250 + }, + { + "epoch": 10.76184092940125, + "grad_norm": 0.03318016231060028, + "learning_rate": 2.6039515088012783e-05, + "loss": 0.0918, + "num_input_tokens_seen": 41915632, + "step": 72255 + }, + { + "epoch": 10.762585641942211, + "grad_norm": 23.32244110107422, + "learning_rate": 2.6036268468039282e-05, + "loss": 0.0126, + "num_input_tokens_seen": 41918352, + "step": 72260 + }, + { + "epoch": 10.76333035448317, + "grad_norm": 0.08317255228757858, + "learning_rate": 2.603302183055908e-05, + "loss": 0.1065, + "num_input_tokens_seen": 41921072, + "step": 72265 + }, + { + "epoch": 10.764075067024129, + "grad_norm": 0.005939268507063389, + "learning_rate": 2.6029775175627024e-05, + "loss": 0.0136, + "num_input_tokens_seen": 41923728, + "step": 72270 + }, + { + "epoch": 10.764819779565087, + "grad_norm": 0.13421829044818878, + "learning_rate": 2.602652850329796e-05, + "loss": 0.0011, + "num_input_tokens_seen": 41926768, + "step": 72275 + }, + { + "epoch": 10.765564492106048, + "grad_norm": 13.457826614379883, + "learning_rate": 2.6023281813626737e-05, + "loss": 0.1045, + "num_input_tokens_seen": 41929488, + "step": 72280 + }, + { + "epoch": 10.766309204647007, + "grad_norm": 0.050694819539785385, + "learning_rate": 2.602003510666822e-05, + "loss": 0.0009, + "num_input_tokens_seen": 41932432, + "step": 72285 + }, + { + "epoch": 10.767053917187965, + "grad_norm": 0.006116567179560661, + "learning_rate": 2.6016788382477238e-05, + "loss": 0.1348, + "num_input_tokens_seen": 41935472, + "step": 72290 + }, + { + "epoch": 10.767798629728924, + "grad_norm": 0.0012619197368621826, + "learning_rate": 2.6013541641108646e-05, + "loss": 0.0784, + "num_input_tokens_seen": 41938288, + "step": 72295 + }, + { + "epoch": 10.768543342269885, + "grad_norm": 0.3366459906101227, + "learning_rate": 2.6010294882617304e-05, + "loss": 0.0006, + "num_input_tokens_seen": 41941104, + "step": 72300 + }, + { + "epoch": 10.769288054810843, + "grad_norm": 0.07310701906681061, + "learning_rate": 2.6007048107058058e-05, + "loss": 0.0517, + "num_input_tokens_seen": 41943632, + "step": 72305 + }, + { + "epoch": 10.770032767351802, + "grad_norm": 23.64249610900879, + "learning_rate": 2.6003801314485755e-05, + "loss": 0.0152, + "num_input_tokens_seen": 41946384, + "step": 72310 + }, + { + "epoch": 10.770777479892761, + "grad_norm": 0.11803142726421356, + "learning_rate": 2.600055450495525e-05, + "loss": 0.1538, + "num_input_tokens_seen": 41949456, + "step": 72315 + }, + { + "epoch": 10.771522192433721, + "grad_norm": 0.004558885004371405, + "learning_rate": 2.5997307678521392e-05, + "loss": 0.2478, + "num_input_tokens_seen": 41952784, + "step": 72320 + }, + { + "epoch": 10.77226690497468, + "grad_norm": 6.479449272155762, + "learning_rate": 2.5994060835239036e-05, + "loss": 0.0587, + "num_input_tokens_seen": 41955600, + "step": 72325 + }, + { + "epoch": 10.773011617515639, + "grad_norm": 0.0038181585259735584, + "learning_rate": 2.5990813975163032e-05, + "loss": 0.1815, + "num_input_tokens_seen": 41958576, + "step": 72330 + }, + { + "epoch": 10.773756330056598, + "grad_norm": 3.8538804054260254, + "learning_rate": 2.598756709834823e-05, + "loss": 0.1811, + "num_input_tokens_seen": 41961552, + "step": 72335 + }, + { + "epoch": 10.774501042597556, + "grad_norm": 0.055296070873737335, + "learning_rate": 2.598432020484949e-05, + "loss": 0.0032, + "num_input_tokens_seen": 41964528, + "step": 72340 + }, + { + "epoch": 10.775245755138517, + "grad_norm": 29.985511779785156, + "learning_rate": 2.598107329472166e-05, + "loss": 0.3561, + "num_input_tokens_seen": 41967088, + "step": 72345 + }, + { + "epoch": 10.775990467679476, + "grad_norm": 0.02436150424182415, + "learning_rate": 2.5977826368019598e-05, + "loss": 0.148, + "num_input_tokens_seen": 41970000, + "step": 72350 + }, + { + "epoch": 10.776735180220435, + "grad_norm": 21.520832061767578, + "learning_rate": 2.5974579424798146e-05, + "loss": 0.0918, + "num_input_tokens_seen": 41972656, + "step": 72355 + }, + { + "epoch": 10.777479892761393, + "grad_norm": 0.3884986340999603, + "learning_rate": 2.5971332465112165e-05, + "loss": 0.0049, + "num_input_tokens_seen": 41975504, + "step": 72360 + }, + { + "epoch": 10.778224605302354, + "grad_norm": 0.016814807429909706, + "learning_rate": 2.5968085489016507e-05, + "loss": 0.0259, + "num_input_tokens_seen": 41978320, + "step": 72365 + }, + { + "epoch": 10.778969317843313, + "grad_norm": 0.008624543435871601, + "learning_rate": 2.5964838496566035e-05, + "loss": 0.0006, + "num_input_tokens_seen": 41981200, + "step": 72370 + }, + { + "epoch": 10.779714030384271, + "grad_norm": 0.5104418396949768, + "learning_rate": 2.596159148781559e-05, + "loss": 0.0294, + "num_input_tokens_seen": 41984080, + "step": 72375 + }, + { + "epoch": 10.78045874292523, + "grad_norm": 0.012643334455788136, + "learning_rate": 2.5958344462820045e-05, + "loss": 0.0022, + "num_input_tokens_seen": 41986896, + "step": 72380 + }, + { + "epoch": 10.78120345546619, + "grad_norm": 0.0028407408390194178, + "learning_rate": 2.5955097421634244e-05, + "loss": 0.0015, + "num_input_tokens_seen": 41989872, + "step": 72385 + }, + { + "epoch": 10.78194816800715, + "grad_norm": 0.0037793212104588747, + "learning_rate": 2.5951850364313036e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41992752, + "step": 72390 + }, + { + "epoch": 10.782692880548108, + "grad_norm": 0.013802842237055302, + "learning_rate": 2.5948603290911283e-05, + "loss": 0.0001, + "num_input_tokens_seen": 41995408, + "step": 72395 + }, + { + "epoch": 10.783437593089067, + "grad_norm": 0.012146087363362312, + "learning_rate": 2.594535620148384e-05, + "loss": 0.0003, + "num_input_tokens_seen": 41998224, + "step": 72400 + }, + { + "epoch": 10.784182305630027, + "grad_norm": 0.004569102544337511, + "learning_rate": 2.5942109096085566e-05, + "loss": 0.109, + "num_input_tokens_seen": 42001264, + "step": 72405 + }, + { + "epoch": 10.784927018170986, + "grad_norm": 0.011943460442125797, + "learning_rate": 2.5938861974771316e-05, + "loss": 0.0282, + "num_input_tokens_seen": 42004560, + "step": 72410 + }, + { + "epoch": 10.785671730711945, + "grad_norm": 0.0089491605758667, + "learning_rate": 2.5935614837595958e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42007696, + "step": 72415 + }, + { + "epoch": 10.786416443252904, + "grad_norm": 0.05731336772441864, + "learning_rate": 2.5932367684614328e-05, + "loss": 0.1131, + "num_input_tokens_seen": 42010608, + "step": 72420 + }, + { + "epoch": 10.787161155793864, + "grad_norm": 0.07048865407705307, + "learning_rate": 2.59291205158813e-05, + "loss": 0.2222, + "num_input_tokens_seen": 42013360, + "step": 72425 + }, + { + "epoch": 10.787905868334823, + "grad_norm": 0.005185809452086687, + "learning_rate": 2.5925873331451715e-05, + "loss": 0.2161, + "num_input_tokens_seen": 42016112, + "step": 72430 + }, + { + "epoch": 10.788650580875782, + "grad_norm": 0.00018376798834651709, + "learning_rate": 2.5922626131380444e-05, + "loss": 0.1688, + "num_input_tokens_seen": 42018928, + "step": 72435 + }, + { + "epoch": 10.78939529341674, + "grad_norm": 0.010789998807013035, + "learning_rate": 2.5919378915722347e-05, + "loss": 0.367, + "num_input_tokens_seen": 42021808, + "step": 72440 + }, + { + "epoch": 10.790140005957701, + "grad_norm": 0.010872488841414452, + "learning_rate": 2.5916131684532274e-05, + "loss": 0.0013, + "num_input_tokens_seen": 42025008, + "step": 72445 + }, + { + "epoch": 10.79088471849866, + "grad_norm": 28.015377044677734, + "learning_rate": 2.5912884437865093e-05, + "loss": 0.1271, + "num_input_tokens_seen": 42027824, + "step": 72450 + }, + { + "epoch": 10.791629431039619, + "grad_norm": 18.278335571289062, + "learning_rate": 2.5909637175775652e-05, + "loss": 0.1576, + "num_input_tokens_seen": 42030512, + "step": 72455 + }, + { + "epoch": 10.792374143580577, + "grad_norm": 0.008222785778343678, + "learning_rate": 2.5906389898318817e-05, + "loss": 0.1715, + "num_input_tokens_seen": 42033392, + "step": 72460 + }, + { + "epoch": 10.793118856121538, + "grad_norm": 0.31958305835723877, + "learning_rate": 2.5903142605549445e-05, + "loss": 0.0236, + "num_input_tokens_seen": 42036208, + "step": 72465 + }, + { + "epoch": 10.793863568662497, + "grad_norm": 0.4453946053981781, + "learning_rate": 2.58998952975224e-05, + "loss": 0.0015, + "num_input_tokens_seen": 42039504, + "step": 72470 + }, + { + "epoch": 10.794608281203455, + "grad_norm": 0.0028543814551085234, + "learning_rate": 2.5896647974292533e-05, + "loss": 0.0134, + "num_input_tokens_seen": 42042544, + "step": 72475 + }, + { + "epoch": 10.795352993744414, + "grad_norm": 0.004182612057775259, + "learning_rate": 2.589340063591471e-05, + "loss": 0.1661, + "num_input_tokens_seen": 42045296, + "step": 72480 + }, + { + "epoch": 10.796097706285373, + "grad_norm": 252.92568969726562, + "learning_rate": 2.5890153282443797e-05, + "loss": 0.1942, + "num_input_tokens_seen": 42048016, + "step": 72485 + }, + { + "epoch": 10.796842418826333, + "grad_norm": 3.650606393814087, + "learning_rate": 2.5886905913934643e-05, + "loss": 0.0576, + "num_input_tokens_seen": 42050928, + "step": 72490 + }, + { + "epoch": 10.797587131367292, + "grad_norm": 0.06938264518976212, + "learning_rate": 2.5883658530442117e-05, + "loss": 0.0015, + "num_input_tokens_seen": 42053872, + "step": 72495 + }, + { + "epoch": 10.798331843908251, + "grad_norm": 0.016185779124498367, + "learning_rate": 2.5880411132021083e-05, + "loss": 0.0228, + "num_input_tokens_seen": 42056720, + "step": 72500 + }, + { + "epoch": 10.799076556449211, + "grad_norm": 0.0028528778348118067, + "learning_rate": 2.5877163718726394e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42059568, + "step": 72505 + }, + { + "epoch": 10.79982126899017, + "grad_norm": 0.008459921926259995, + "learning_rate": 2.5873916290612915e-05, + "loss": 0.1129, + "num_input_tokens_seen": 42062320, + "step": 72510 + }, + { + "epoch": 10.800565981531129, + "grad_norm": 1.7995485067367554, + "learning_rate": 2.5870668847735512e-05, + "loss": 0.1032, + "num_input_tokens_seen": 42065104, + "step": 72515 + }, + { + "epoch": 10.801310694072088, + "grad_norm": 0.010348671115934849, + "learning_rate": 2.5867421390149037e-05, + "loss": 0.0561, + "num_input_tokens_seen": 42067760, + "step": 72520 + }, + { + "epoch": 10.802055406613047, + "grad_norm": 0.016154292970895767, + "learning_rate": 2.5864173917908363e-05, + "loss": 0.0179, + "num_input_tokens_seen": 42070608, + "step": 72525 + }, + { + "epoch": 10.802800119154007, + "grad_norm": 0.05808010324835777, + "learning_rate": 2.5860926431068344e-05, + "loss": 0.0133, + "num_input_tokens_seen": 42073584, + "step": 72530 + }, + { + "epoch": 10.803544831694966, + "grad_norm": 0.003359428374096751, + "learning_rate": 2.5857678929683855e-05, + "loss": 0.3797, + "num_input_tokens_seen": 42076496, + "step": 72535 + }, + { + "epoch": 10.804289544235925, + "grad_norm": 1.5222691297531128, + "learning_rate": 2.585443141380975e-05, + "loss": 0.0014, + "num_input_tokens_seen": 42079696, + "step": 72540 + }, + { + "epoch": 10.805034256776883, + "grad_norm": 49.82472610473633, + "learning_rate": 2.5851183883500895e-05, + "loss": 0.2105, + "num_input_tokens_seen": 42082640, + "step": 72545 + }, + { + "epoch": 10.805778969317844, + "grad_norm": 0.029895182698965073, + "learning_rate": 2.5847936338812158e-05, + "loss": 0.0489, + "num_input_tokens_seen": 42085328, + "step": 72550 + }, + { + "epoch": 10.806523681858803, + "grad_norm": 2.7056777477264404, + "learning_rate": 2.5844688779798393e-05, + "loss": 0.001, + "num_input_tokens_seen": 42088144, + "step": 72555 + }, + { + "epoch": 10.807268394399761, + "grad_norm": 0.011782140471041203, + "learning_rate": 2.5841441206514468e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42090992, + "step": 72560 + }, + { + "epoch": 10.80801310694072, + "grad_norm": 0.06752203404903412, + "learning_rate": 2.583819361901525e-05, + "loss": 0.1346, + "num_input_tokens_seen": 42093904, + "step": 72565 + }, + { + "epoch": 10.80875781948168, + "grad_norm": 0.011915773153305054, + "learning_rate": 2.5834946017355598e-05, + "loss": 0.0512, + "num_input_tokens_seen": 42096912, + "step": 72570 + }, + { + "epoch": 10.80950253202264, + "grad_norm": 0.0020401813089847565, + "learning_rate": 2.583169840159039e-05, + "loss": 0.204, + "num_input_tokens_seen": 42099760, + "step": 72575 + }, + { + "epoch": 10.810247244563598, + "grad_norm": 0.005008866544812918, + "learning_rate": 2.582845077177448e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42102992, + "step": 72580 + }, + { + "epoch": 10.810991957104557, + "grad_norm": 0.003810629015788436, + "learning_rate": 2.5825203127962737e-05, + "loss": 0.0539, + "num_input_tokens_seen": 42106096, + "step": 72585 + }, + { + "epoch": 10.811736669645517, + "grad_norm": 0.011509658768773079, + "learning_rate": 2.582195547021003e-05, + "loss": 0.0703, + "num_input_tokens_seen": 42108912, + "step": 72590 + }, + { + "epoch": 10.812481382186476, + "grad_norm": 4.5676140785217285, + "learning_rate": 2.581870779857121e-05, + "loss": 0.0419, + "num_input_tokens_seen": 42112112, + "step": 72595 + }, + { + "epoch": 10.813226094727435, + "grad_norm": 63.145503997802734, + "learning_rate": 2.581546011310116e-05, + "loss": 0.4778, + "num_input_tokens_seen": 42114800, + "step": 72600 + }, + { + "epoch": 10.813970807268394, + "grad_norm": 0.17718887329101562, + "learning_rate": 2.5812212413854738e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42117776, + "step": 72605 + }, + { + "epoch": 10.814715519809354, + "grad_norm": 24.590373992919922, + "learning_rate": 2.5808964700886812e-05, + "loss": 0.123, + "num_input_tokens_seen": 42120784, + "step": 72610 + }, + { + "epoch": 10.815460232350313, + "grad_norm": 0.06275801360607147, + "learning_rate": 2.5805716974252257e-05, + "loss": 0.0355, + "num_input_tokens_seen": 42123472, + "step": 72615 + }, + { + "epoch": 10.816204944891272, + "grad_norm": 0.017751118168234825, + "learning_rate": 2.5802469234005927e-05, + "loss": 0.1943, + "num_input_tokens_seen": 42126576, + "step": 72620 + }, + { + "epoch": 10.81694965743223, + "grad_norm": 0.0033481603022664785, + "learning_rate": 2.57992214802027e-05, + "loss": 0.3949, + "num_input_tokens_seen": 42129296, + "step": 72625 + }, + { + "epoch": 10.817694369973191, + "grad_norm": 0.003276729490607977, + "learning_rate": 2.579597371289743e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42132112, + "step": 72630 + }, + { + "epoch": 10.81843908251415, + "grad_norm": 15.10049057006836, + "learning_rate": 2.5792725932144996e-05, + "loss": 0.3538, + "num_input_tokens_seen": 42135120, + "step": 72635 + }, + { + "epoch": 10.819183795055109, + "grad_norm": 4.764449119567871, + "learning_rate": 2.5789478138000262e-05, + "loss": 0.0049, + "num_input_tokens_seen": 42137968, + "step": 72640 + }, + { + "epoch": 10.819928507596067, + "grad_norm": 0.3740707039833069, + "learning_rate": 2.5786230330518096e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42141008, + "step": 72645 + }, + { + "epoch": 10.820673220137028, + "grad_norm": 18.11852264404297, + "learning_rate": 2.5782982509753377e-05, + "loss": 0.111, + "num_input_tokens_seen": 42143664, + "step": 72650 + }, + { + "epoch": 10.821417932677987, + "grad_norm": 0.2861461043357849, + "learning_rate": 2.5779734675760957e-05, + "loss": 0.0128, + "num_input_tokens_seen": 42146448, + "step": 72655 + }, + { + "epoch": 10.822162645218945, + "grad_norm": 0.06057147681713104, + "learning_rate": 2.5776486828595715e-05, + "loss": 0.0102, + "num_input_tokens_seen": 42149456, + "step": 72660 + }, + { + "epoch": 10.822907357759904, + "grad_norm": 33.01739501953125, + "learning_rate": 2.5773238968312514e-05, + "loss": 0.1113, + "num_input_tokens_seen": 42152688, + "step": 72665 + }, + { + "epoch": 10.823652070300863, + "grad_norm": 26.54728889465332, + "learning_rate": 2.5769991094966228e-05, + "loss": 0.0659, + "num_input_tokens_seen": 42155824, + "step": 72670 + }, + { + "epoch": 10.824396782841823, + "grad_norm": 0.00563082518056035, + "learning_rate": 2.5766743208611726e-05, + "loss": 0.0005, + "num_input_tokens_seen": 42158576, + "step": 72675 + }, + { + "epoch": 10.825141495382782, + "grad_norm": 0.04838398098945618, + "learning_rate": 2.576349530930388e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42161424, + "step": 72680 + }, + { + "epoch": 10.825886207923741, + "grad_norm": 0.14668124914169312, + "learning_rate": 2.5760247397097553e-05, + "loss": 0.0349, + "num_input_tokens_seen": 42164240, + "step": 72685 + }, + { + "epoch": 10.826630920464702, + "grad_norm": 0.007017318159341812, + "learning_rate": 2.5756999472047617e-05, + "loss": 0.0132, + "num_input_tokens_seen": 42167888, + "step": 72690 + }, + { + "epoch": 10.82737563300566, + "grad_norm": 19.801523208618164, + "learning_rate": 2.575375153420894e-05, + "loss": 0.0124, + "num_input_tokens_seen": 42171024, + "step": 72695 + }, + { + "epoch": 10.828120345546619, + "grad_norm": 0.01879567839205265, + "learning_rate": 2.5750503583636402e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42174096, + "step": 72700 + }, + { + "epoch": 10.828865058087578, + "grad_norm": 384.9083557128906, + "learning_rate": 2.5747255620384868e-05, + "loss": 0.3926, + "num_input_tokens_seen": 42176880, + "step": 72705 + }, + { + "epoch": 10.829609770628537, + "grad_norm": 0.0002024822315433994, + "learning_rate": 2.5744007644509215e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42179664, + "step": 72710 + }, + { + "epoch": 10.830354483169497, + "grad_norm": 1.1420139074325562, + "learning_rate": 2.574075965606431e-05, + "loss": 0.0005, + "num_input_tokens_seen": 42182512, + "step": 72715 + }, + { + "epoch": 10.831099195710456, + "grad_norm": 0.01233147643506527, + "learning_rate": 2.5737511655105018e-05, + "loss": 0.3533, + "num_input_tokens_seen": 42185232, + "step": 72720 + }, + { + "epoch": 10.831843908251415, + "grad_norm": 0.04441395029425621, + "learning_rate": 2.573426364168622e-05, + "loss": 0.1721, + "num_input_tokens_seen": 42188304, + "step": 72725 + }, + { + "epoch": 10.832588620792373, + "grad_norm": 0.30714645981788635, + "learning_rate": 2.5731015615862774e-05, + "loss": 0.0735, + "num_input_tokens_seen": 42191088, + "step": 72730 + }, + { + "epoch": 10.833333333333334, + "grad_norm": 0.26367825269699097, + "learning_rate": 2.572776757768957e-05, + "loss": 0.1733, + "num_input_tokens_seen": 42193840, + "step": 72735 + }, + { + "epoch": 10.834078045874293, + "grad_norm": 0.00931645929813385, + "learning_rate": 2.5724519527221468e-05, + "loss": 0.1256, + "num_input_tokens_seen": 42196528, + "step": 72740 + }, + { + "epoch": 10.834822758415251, + "grad_norm": 0.0005022718105465174, + "learning_rate": 2.5721271464513354e-05, + "loss": 0.0178, + "num_input_tokens_seen": 42199248, + "step": 72745 + }, + { + "epoch": 10.83556747095621, + "grad_norm": 0.20191925764083862, + "learning_rate": 2.571802338962009e-05, + "loss": 0.0007, + "num_input_tokens_seen": 42202480, + "step": 72750 + }, + { + "epoch": 10.83631218349717, + "grad_norm": 0.18876731395721436, + "learning_rate": 2.5714775302596545e-05, + "loss": 0.0922, + "num_input_tokens_seen": 42205136, + "step": 72755 + }, + { + "epoch": 10.83705689603813, + "grad_norm": 0.010629833675920963, + "learning_rate": 2.57115272034976e-05, + "loss": 0.3869, + "num_input_tokens_seen": 42208304, + "step": 72760 + }, + { + "epoch": 10.837801608579088, + "grad_norm": 0.00029837965848855674, + "learning_rate": 2.5708279092378123e-05, + "loss": 0.1008, + "num_input_tokens_seen": 42211120, + "step": 72765 + }, + { + "epoch": 10.838546321120047, + "grad_norm": 0.4606710374355316, + "learning_rate": 2.5705030969292992e-05, + "loss": 0.0678, + "num_input_tokens_seen": 42213904, + "step": 72770 + }, + { + "epoch": 10.839291033661008, + "grad_norm": 0.0006208036793395877, + "learning_rate": 2.5701782834297078e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42216848, + "step": 72775 + }, + { + "epoch": 10.840035746201966, + "grad_norm": 36.81455612182617, + "learning_rate": 2.5698534687445263e-05, + "loss": 0.1631, + "num_input_tokens_seen": 42219728, + "step": 72780 + }, + { + "epoch": 10.840780458742925, + "grad_norm": 0.022597743198275566, + "learning_rate": 2.5695286528792413e-05, + "loss": 0.022, + "num_input_tokens_seen": 42222320, + "step": 72785 + }, + { + "epoch": 10.841525171283884, + "grad_norm": 0.002318797865882516, + "learning_rate": 2.56920383583934e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42224976, + "step": 72790 + }, + { + "epoch": 10.842269883824844, + "grad_norm": 0.057883113622665405, + "learning_rate": 2.56887901763031e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42227888, + "step": 72795 + }, + { + "epoch": 10.843014596365803, + "grad_norm": 0.02371908538043499, + "learning_rate": 2.5685541982576395e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42230960, + "step": 72800 + }, + { + "epoch": 10.843759308906762, + "grad_norm": 0.0032084190752357244, + "learning_rate": 2.5682293777268153e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42233680, + "step": 72805 + }, + { + "epoch": 10.84450402144772, + "grad_norm": 20.558109283447266, + "learning_rate": 2.567904556043325e-05, + "loss": 0.3458, + "num_input_tokens_seen": 42236560, + "step": 72810 + }, + { + "epoch": 10.845248733988681, + "grad_norm": 3.507911205291748, + "learning_rate": 2.5675797332126566e-05, + "loss": 0.0028, + "num_input_tokens_seen": 42239760, + "step": 72815 + }, + { + "epoch": 10.84599344652964, + "grad_norm": 0.15947942435741425, + "learning_rate": 2.567254909240297e-05, + "loss": 0.0426, + "num_input_tokens_seen": 42242640, + "step": 72820 + }, + { + "epoch": 10.846738159070599, + "grad_norm": 0.02664025127887726, + "learning_rate": 2.566930084131734e-05, + "loss": 0.3268, + "num_input_tokens_seen": 42245328, + "step": 72825 + }, + { + "epoch": 10.847482871611557, + "grad_norm": 13.95845890045166, + "learning_rate": 2.566605257892456e-05, + "loss": 0.1447, + "num_input_tokens_seen": 42248176, + "step": 72830 + }, + { + "epoch": 10.848227584152518, + "grad_norm": 0.0031883292831480503, + "learning_rate": 2.5662804305279485e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42251088, + "step": 72835 + }, + { + "epoch": 10.848972296693477, + "grad_norm": 0.023020436987280846, + "learning_rate": 2.5659556020437015e-05, + "loss": 0.1819, + "num_input_tokens_seen": 42254000, + "step": 72840 + }, + { + "epoch": 10.849717009234435, + "grad_norm": 0.011712776497006416, + "learning_rate": 2.5656307724452016e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42256688, + "step": 72845 + }, + { + "epoch": 10.850461721775394, + "grad_norm": 30.525318145751953, + "learning_rate": 2.565305941737936e-05, + "loss": 0.0231, + "num_input_tokens_seen": 42259248, + "step": 72850 + }, + { + "epoch": 10.851206434316353, + "grad_norm": 13.716314315795898, + "learning_rate": 2.5649811099273935e-05, + "loss": 0.0453, + "num_input_tokens_seen": 42262416, + "step": 72855 + }, + { + "epoch": 10.851951146857314, + "grad_norm": 0.0003261194797232747, + "learning_rate": 2.564656277019061e-05, + "loss": 0.0458, + "num_input_tokens_seen": 42265328, + "step": 72860 + }, + { + "epoch": 10.852695859398272, + "grad_norm": 0.01685267873108387, + "learning_rate": 2.5643314430184257e-05, + "loss": 0.0512, + "num_input_tokens_seen": 42268272, + "step": 72865 + }, + { + "epoch": 10.853440571939231, + "grad_norm": 10.436347961425781, + "learning_rate": 2.564006607930977e-05, + "loss": 0.104, + "num_input_tokens_seen": 42271184, + "step": 72870 + }, + { + "epoch": 10.85418528448019, + "grad_norm": 0.004606642294675112, + "learning_rate": 2.5636817717622015e-05, + "loss": 0.2158, + "num_input_tokens_seen": 42273744, + "step": 72875 + }, + { + "epoch": 10.85492999702115, + "grad_norm": 0.001568725798279047, + "learning_rate": 2.5633569345175873e-05, + "loss": 0.003, + "num_input_tokens_seen": 42276176, + "step": 72880 + }, + { + "epoch": 10.85567470956211, + "grad_norm": 0.030841311439871788, + "learning_rate": 2.5630320962026217e-05, + "loss": 0.1409, + "num_input_tokens_seen": 42278864, + "step": 72885 + }, + { + "epoch": 10.856419422103068, + "grad_norm": 0.0038466977421194315, + "learning_rate": 2.5627072568227927e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42281840, + "step": 72890 + }, + { + "epoch": 10.857164134644027, + "grad_norm": 0.23728691041469574, + "learning_rate": 2.5623824163835887e-05, + "loss": 0.0404, + "num_input_tokens_seen": 42285104, + "step": 72895 + }, + { + "epoch": 10.857908847184987, + "grad_norm": 0.007878442294895649, + "learning_rate": 2.5620575748904968e-05, + "loss": 0.0023, + "num_input_tokens_seen": 42288240, + "step": 72900 + }, + { + "epoch": 10.858653559725946, + "grad_norm": 0.0004114606126677245, + "learning_rate": 2.5617327323490055e-05, + "loss": 0.016, + "num_input_tokens_seen": 42291024, + "step": 72905 + }, + { + "epoch": 10.859398272266905, + "grad_norm": 0.0165998674929142, + "learning_rate": 2.5614078887646025e-05, + "loss": 0.0289, + "num_input_tokens_seen": 42294000, + "step": 72910 + }, + { + "epoch": 10.860142984807863, + "grad_norm": 0.3501468300819397, + "learning_rate": 2.5610830441427762e-05, + "loss": 0.0007, + "num_input_tokens_seen": 42296912, + "step": 72915 + }, + { + "epoch": 10.860887697348824, + "grad_norm": 14.355748176574707, + "learning_rate": 2.5607581984890134e-05, + "loss": 0.003, + "num_input_tokens_seen": 42300048, + "step": 72920 + }, + { + "epoch": 10.861632409889783, + "grad_norm": 0.005090495105832815, + "learning_rate": 2.5604333518088026e-05, + "loss": 0.0013, + "num_input_tokens_seen": 42302928, + "step": 72925 + }, + { + "epoch": 10.862377122430741, + "grad_norm": 0.006362839136272669, + "learning_rate": 2.560108504107631e-05, + "loss": 0.0146, + "num_input_tokens_seen": 42305808, + "step": 72930 + }, + { + "epoch": 10.8631218349717, + "grad_norm": 0.0014100251719355583, + "learning_rate": 2.5597836553909884e-05, + "loss": 0.0119, + "num_input_tokens_seen": 42308720, + "step": 72935 + }, + { + "epoch": 10.86386654751266, + "grad_norm": 0.00031869355007074773, + "learning_rate": 2.5594588056643608e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42311696, + "step": 72940 + }, + { + "epoch": 10.86461126005362, + "grad_norm": 58.48595428466797, + "learning_rate": 2.5591339549332383e-05, + "loss": 0.0047, + "num_input_tokens_seen": 42314384, + "step": 72945 + }, + { + "epoch": 10.865355972594578, + "grad_norm": 0.00021575496066361666, + "learning_rate": 2.5588091032031075e-05, + "loss": 0.1564, + "num_input_tokens_seen": 42316848, + "step": 72950 + }, + { + "epoch": 10.866100685135537, + "grad_norm": 0.024456365033984184, + "learning_rate": 2.5584842504794558e-05, + "loss": 0.1366, + "num_input_tokens_seen": 42319632, + "step": 72955 + }, + { + "epoch": 10.866845397676498, + "grad_norm": 0.0006190763087943196, + "learning_rate": 2.5581593967677724e-05, + "loss": 0.0064, + "num_input_tokens_seen": 42322384, + "step": 72960 + }, + { + "epoch": 10.867590110217456, + "grad_norm": 0.06726372241973877, + "learning_rate": 2.557834542073545e-05, + "loss": 0.0122, + "num_input_tokens_seen": 42325072, + "step": 72965 + }, + { + "epoch": 10.868334822758415, + "grad_norm": 0.12034330517053604, + "learning_rate": 2.557509686402262e-05, + "loss": 0.1663, + "num_input_tokens_seen": 42327984, + "step": 72970 + }, + { + "epoch": 10.869079535299374, + "grad_norm": 0.01800950989127159, + "learning_rate": 2.5571848297594116e-05, + "loss": 0.0563, + "num_input_tokens_seen": 42330608, + "step": 72975 + }, + { + "epoch": 10.869824247840334, + "grad_norm": 0.0036008183378726244, + "learning_rate": 2.5568599721504814e-05, + "loss": 0.0, + "num_input_tokens_seen": 42333424, + "step": 72980 + }, + { + "epoch": 10.870568960381293, + "grad_norm": 6.690559387207031, + "learning_rate": 2.5565351135809597e-05, + "loss": 0.1521, + "num_input_tokens_seen": 42336400, + "step": 72985 + }, + { + "epoch": 10.871313672922252, + "grad_norm": 11.946141242980957, + "learning_rate": 2.5562102540563355e-05, + "loss": 0.1719, + "num_input_tokens_seen": 42339568, + "step": 72990 + }, + { + "epoch": 10.87205838546321, + "grad_norm": 0.007672740146517754, + "learning_rate": 2.5558853935820948e-05, + "loss": 0.2251, + "num_input_tokens_seen": 42342320, + "step": 72995 + }, + { + "epoch": 10.872803098004171, + "grad_norm": 0.002812701277434826, + "learning_rate": 2.555560532163728e-05, + "loss": 0.1813, + "num_input_tokens_seen": 42345104, + "step": 73000 + }, + { + "epoch": 10.87354781054513, + "grad_norm": 0.01609046757221222, + "learning_rate": 2.555235669806722e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42348048, + "step": 73005 + }, + { + "epoch": 10.874292523086089, + "grad_norm": 0.0034084413200616837, + "learning_rate": 2.554910806516566e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42350992, + "step": 73010 + }, + { + "epoch": 10.875037235627047, + "grad_norm": 0.004209152888506651, + "learning_rate": 2.5545859422987478e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42354352, + "step": 73015 + }, + { + "epoch": 10.875781948168008, + "grad_norm": 0.009604232385754585, + "learning_rate": 2.554261077158755e-05, + "loss": 0.1139, + "num_input_tokens_seen": 42357392, + "step": 73020 + }, + { + "epoch": 10.876526660708967, + "grad_norm": 0.00408868258818984, + "learning_rate": 2.5539362111020765e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42360400, + "step": 73025 + }, + { + "epoch": 10.877271373249926, + "grad_norm": 0.005131447222083807, + "learning_rate": 2.5536113441342014e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42363344, + "step": 73030 + }, + { + "epoch": 10.878016085790884, + "grad_norm": 0.22662757337093353, + "learning_rate": 2.5532864762606164e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42366192, + "step": 73035 + }, + { + "epoch": 10.878760798331843, + "grad_norm": 0.03905077651143074, + "learning_rate": 2.55296160748681e-05, + "loss": 0.0735, + "num_input_tokens_seen": 42369168, + "step": 73040 + }, + { + "epoch": 10.879505510872804, + "grad_norm": 0.002847021911293268, + "learning_rate": 2.5526367378182725e-05, + "loss": 0.2434, + "num_input_tokens_seen": 42372176, + "step": 73045 + }, + { + "epoch": 10.880250223413762, + "grad_norm": 0.0012358268722891808, + "learning_rate": 2.55231186726049e-05, + "loss": 0.2032, + "num_input_tokens_seen": 42375024, + "step": 73050 + }, + { + "epoch": 10.880994935954721, + "grad_norm": 0.0010487624676898122, + "learning_rate": 2.5519869958189513e-05, + "loss": 0.2925, + "num_input_tokens_seen": 42377456, + "step": 73055 + }, + { + "epoch": 10.88173964849568, + "grad_norm": 0.0066488743759691715, + "learning_rate": 2.5516621234991456e-05, + "loss": 0.1689, + "num_input_tokens_seen": 42380336, + "step": 73060 + }, + { + "epoch": 10.88248436103664, + "grad_norm": 0.04101819917559624, + "learning_rate": 2.551337250306561e-05, + "loss": 0.0131, + "num_input_tokens_seen": 42383056, + "step": 73065 + }, + { + "epoch": 10.8832290735776, + "grad_norm": 37.68952941894531, + "learning_rate": 2.5510123762466853e-05, + "loss": 0.0127, + "num_input_tokens_seen": 42385808, + "step": 73070 + }, + { + "epoch": 10.883973786118558, + "grad_norm": 0.018751034513115883, + "learning_rate": 2.5506875013250075e-05, + "loss": 0.2414, + "num_input_tokens_seen": 42388688, + "step": 73075 + }, + { + "epoch": 10.884718498659517, + "grad_norm": 0.028558064252138138, + "learning_rate": 2.5503626255470164e-05, + "loss": 0.001, + "num_input_tokens_seen": 42391440, + "step": 73080 + }, + { + "epoch": 10.885463211200477, + "grad_norm": 0.0002709281980060041, + "learning_rate": 2.5500377489181992e-05, + "loss": 0.0854, + "num_input_tokens_seen": 42394544, + "step": 73085 + }, + { + "epoch": 10.886207923741436, + "grad_norm": 0.015149395912885666, + "learning_rate": 2.5497128714440456e-05, + "loss": 0.0077, + "num_input_tokens_seen": 42397264, + "step": 73090 + }, + { + "epoch": 10.886952636282395, + "grad_norm": 0.00379334413446486, + "learning_rate": 2.549387993130043e-05, + "loss": 0.1697, + "num_input_tokens_seen": 42400368, + "step": 73095 + }, + { + "epoch": 10.887697348823353, + "grad_norm": 0.023790080100297928, + "learning_rate": 2.5490631139816806e-05, + "loss": 0.0059, + "num_input_tokens_seen": 42403216, + "step": 73100 + }, + { + "epoch": 10.888442061364314, + "grad_norm": 15.341792106628418, + "learning_rate": 2.548738234004447e-05, + "loss": 0.0082, + "num_input_tokens_seen": 42406480, + "step": 73105 + }, + { + "epoch": 10.889186773905273, + "grad_norm": 1.0008184909820557, + "learning_rate": 2.5484133532038307e-05, + "loss": 0.0011, + "num_input_tokens_seen": 42409328, + "step": 73110 + }, + { + "epoch": 10.889931486446232, + "grad_norm": 0.052245236933231354, + "learning_rate": 2.5480884715853197e-05, + "loss": 0.064, + "num_input_tokens_seen": 42412304, + "step": 73115 + }, + { + "epoch": 10.89067619898719, + "grad_norm": 0.09268634766340256, + "learning_rate": 2.547763589154403e-05, + "loss": 0.0154, + "num_input_tokens_seen": 42415152, + "step": 73120 + }, + { + "epoch": 10.89142091152815, + "grad_norm": 0.00596768269315362, + "learning_rate": 2.5474387059165687e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42418288, + "step": 73125 + }, + { + "epoch": 10.89216562406911, + "grad_norm": 9.049357414245605, + "learning_rate": 2.547113821877306e-05, + "loss": 0.083, + "num_input_tokens_seen": 42421712, + "step": 73130 + }, + { + "epoch": 10.892910336610068, + "grad_norm": 0.6727374196052551, + "learning_rate": 2.5467889370421027e-05, + "loss": 0.0005, + "num_input_tokens_seen": 42424400, + "step": 73135 + }, + { + "epoch": 10.893655049151027, + "grad_norm": 0.0044225119054317474, + "learning_rate": 2.546464051416448e-05, + "loss": 0.0006, + "num_input_tokens_seen": 42427344, + "step": 73140 + }, + { + "epoch": 10.894399761691988, + "grad_norm": 18.862472534179688, + "learning_rate": 2.5461391650058307e-05, + "loss": 0.1505, + "num_input_tokens_seen": 42430064, + "step": 73145 + }, + { + "epoch": 10.895144474232946, + "grad_norm": 0.00525331124663353, + "learning_rate": 2.5458142778157396e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42432816, + "step": 73150 + }, + { + "epoch": 10.895889186773905, + "grad_norm": 0.013343781232833862, + "learning_rate": 2.545489389851662e-05, + "loss": 0.0267, + "num_input_tokens_seen": 42435664, + "step": 73155 + }, + { + "epoch": 10.896633899314864, + "grad_norm": 0.008594676852226257, + "learning_rate": 2.5451645011190872e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42438704, + "step": 73160 + }, + { + "epoch": 10.897378611855824, + "grad_norm": 53.91297912597656, + "learning_rate": 2.5448396116235046e-05, + "loss": 0.2376, + "num_input_tokens_seen": 42441776, + "step": 73165 + }, + { + "epoch": 10.898123324396783, + "grad_norm": 0.0034442299511283636, + "learning_rate": 2.5445147213704017e-05, + "loss": 0.1782, + "num_input_tokens_seen": 42444528, + "step": 73170 + }, + { + "epoch": 10.898868036937742, + "grad_norm": 0.014722920022904873, + "learning_rate": 2.5441898303652688e-05, + "loss": 0.0822, + "num_input_tokens_seen": 42447248, + "step": 73175 + }, + { + "epoch": 10.8996127494787, + "grad_norm": 0.003543406492099166, + "learning_rate": 2.5438649386135932e-05, + "loss": 0.2158, + "num_input_tokens_seen": 42450000, + "step": 73180 + }, + { + "epoch": 10.90035746201966, + "grad_norm": 0.06071540340781212, + "learning_rate": 2.5435400461208637e-05, + "loss": 0.1754, + "num_input_tokens_seen": 42453008, + "step": 73185 + }, + { + "epoch": 10.90110217456062, + "grad_norm": 5.217227458953857, + "learning_rate": 2.5432151528925702e-05, + "loss": 0.0108, + "num_input_tokens_seen": 42456336, + "step": 73190 + }, + { + "epoch": 10.901846887101579, + "grad_norm": 81.6197738647461, + "learning_rate": 2.5428902589341996e-05, + "loss": 0.2356, + "num_input_tokens_seen": 42459120, + "step": 73195 + }, + { + "epoch": 10.902591599642538, + "grad_norm": 0.15330898761749268, + "learning_rate": 2.542565364251242e-05, + "loss": 0.0566, + "num_input_tokens_seen": 42462192, + "step": 73200 + }, + { + "epoch": 10.903336312183498, + "grad_norm": 0.056324202567338943, + "learning_rate": 2.542240468849186e-05, + "loss": 0.154, + "num_input_tokens_seen": 42465072, + "step": 73205 + }, + { + "epoch": 10.904081024724457, + "grad_norm": 0.06381182372570038, + "learning_rate": 2.5419155727335204e-05, + "loss": 0.0383, + "num_input_tokens_seen": 42468016, + "step": 73210 + }, + { + "epoch": 10.904825737265416, + "grad_norm": 7.551848411560059, + "learning_rate": 2.5415906759097336e-05, + "loss": 0.1246, + "num_input_tokens_seen": 42470832, + "step": 73215 + }, + { + "epoch": 10.905570449806374, + "grad_norm": 0.0014924112474545836, + "learning_rate": 2.5412657783833143e-05, + "loss": 0.0089, + "num_input_tokens_seen": 42473776, + "step": 73220 + }, + { + "epoch": 10.906315162347333, + "grad_norm": 0.2862035632133484, + "learning_rate": 2.5409408801597517e-05, + "loss": 0.0016, + "num_input_tokens_seen": 42476592, + "step": 73225 + }, + { + "epoch": 10.907059874888294, + "grad_norm": 0.09503111243247986, + "learning_rate": 2.540615981244535e-05, + "loss": 0.1564, + "num_input_tokens_seen": 42479408, + "step": 73230 + }, + { + "epoch": 10.907804587429252, + "grad_norm": 0.014125632122159004, + "learning_rate": 2.5402910816431525e-05, + "loss": 0.0807, + "num_input_tokens_seen": 42482384, + "step": 73235 + }, + { + "epoch": 10.908549299970211, + "grad_norm": 0.003038277616724372, + "learning_rate": 2.5399661813610925e-05, + "loss": 0.0013, + "num_input_tokens_seen": 42485424, + "step": 73240 + }, + { + "epoch": 10.90929401251117, + "grad_norm": 0.008153866976499557, + "learning_rate": 2.5396412804038455e-05, + "loss": 0.0073, + "num_input_tokens_seen": 42488144, + "step": 73245 + }, + { + "epoch": 10.91003872505213, + "grad_norm": 0.25801023840904236, + "learning_rate": 2.5393163787768988e-05, + "loss": 0.0007, + "num_input_tokens_seen": 42491056, + "step": 73250 + }, + { + "epoch": 10.91078343759309, + "grad_norm": 0.0032259479630738497, + "learning_rate": 2.5389914764857413e-05, + "loss": 0.0722, + "num_input_tokens_seen": 42494064, + "step": 73255 + }, + { + "epoch": 10.911528150134048, + "grad_norm": 0.047697216272354126, + "learning_rate": 2.538666573535863e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42496880, + "step": 73260 + }, + { + "epoch": 10.912272862675007, + "grad_norm": 0.1734970360994339, + "learning_rate": 2.5383416699327524e-05, + "loss": 0.2722, + "num_input_tokens_seen": 42499632, + "step": 73265 + }, + { + "epoch": 10.913017575215967, + "grad_norm": 40.54851150512695, + "learning_rate": 2.5380167656818978e-05, + "loss": 0.1814, + "num_input_tokens_seen": 42502576, + "step": 73270 + }, + { + "epoch": 10.913762287756926, + "grad_norm": 0.036258943378925323, + "learning_rate": 2.537691860788789e-05, + "loss": 0.0436, + "num_input_tokens_seen": 42505616, + "step": 73275 + }, + { + "epoch": 10.914507000297885, + "grad_norm": 9.662229537963867, + "learning_rate": 2.5373669552589146e-05, + "loss": 0.1724, + "num_input_tokens_seen": 42508432, + "step": 73280 + }, + { + "epoch": 10.915251712838844, + "grad_norm": 0.06331650167703629, + "learning_rate": 2.537042049097763e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42511344, + "step": 73285 + }, + { + "epoch": 10.915996425379804, + "grad_norm": 0.00219872803427279, + "learning_rate": 2.5367171423108238e-05, + "loss": 0.0876, + "num_input_tokens_seen": 42514224, + "step": 73290 + }, + { + "epoch": 10.916741137920763, + "grad_norm": 0.004548132419586182, + "learning_rate": 2.5363922349035857e-05, + "loss": 0.025, + "num_input_tokens_seen": 42516848, + "step": 73295 + }, + { + "epoch": 10.917485850461722, + "grad_norm": 15.723210334777832, + "learning_rate": 2.5360673268815378e-05, + "loss": 0.2354, + "num_input_tokens_seen": 42520368, + "step": 73300 + }, + { + "epoch": 10.91823056300268, + "grad_norm": 70.54609680175781, + "learning_rate": 2.535742418250169e-05, + "loss": 0.2067, + "num_input_tokens_seen": 42523344, + "step": 73305 + }, + { + "epoch": 10.918975275543641, + "grad_norm": 0.16614390909671783, + "learning_rate": 2.535417509014969e-05, + "loss": 0.0155, + "num_input_tokens_seen": 42526352, + "step": 73310 + }, + { + "epoch": 10.9197199880846, + "grad_norm": 0.01921723037958145, + "learning_rate": 2.5350925991814263e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42529584, + "step": 73315 + }, + { + "epoch": 10.920464700625558, + "grad_norm": 0.018779626116156578, + "learning_rate": 2.5347676887550286e-05, + "loss": 0.0965, + "num_input_tokens_seen": 42532560, + "step": 73320 + }, + { + "epoch": 10.921209413166517, + "grad_norm": 0.10954595357179642, + "learning_rate": 2.534442777741267e-05, + "loss": 0.0152, + "num_input_tokens_seen": 42535632, + "step": 73325 + }, + { + "epoch": 10.921954125707478, + "grad_norm": 0.8808513879776001, + "learning_rate": 2.5341178661456293e-05, + "loss": 0.1809, + "num_input_tokens_seen": 42538800, + "step": 73330 + }, + { + "epoch": 10.922698838248436, + "grad_norm": 0.006809059530496597, + "learning_rate": 2.533792953973605e-05, + "loss": 0.0302, + "num_input_tokens_seen": 42541840, + "step": 73335 + }, + { + "epoch": 10.923443550789395, + "grad_norm": 0.014117204584181309, + "learning_rate": 2.533468041230683e-05, + "loss": 0.018, + "num_input_tokens_seen": 42544752, + "step": 73340 + }, + { + "epoch": 10.924188263330354, + "grad_norm": 1.5134990215301514, + "learning_rate": 2.5331431279223528e-05, + "loss": 0.001, + "num_input_tokens_seen": 42547536, + "step": 73345 + }, + { + "epoch": 10.924932975871315, + "grad_norm": 0.0032290779054164886, + "learning_rate": 2.5328182140541028e-05, + "loss": 0.1538, + "num_input_tokens_seen": 42550416, + "step": 73350 + }, + { + "epoch": 10.925677688412273, + "grad_norm": 0.004538299515843391, + "learning_rate": 2.5324932996314233e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42553360, + "step": 73355 + }, + { + "epoch": 10.926422400953232, + "grad_norm": 0.08051087707281113, + "learning_rate": 2.5321683846598015e-05, + "loss": 0.0736, + "num_input_tokens_seen": 42556368, + "step": 73360 + }, + { + "epoch": 10.92716711349419, + "grad_norm": 4.668212413787842, + "learning_rate": 2.531843469144728e-05, + "loss": 0.1267, + "num_input_tokens_seen": 42559280, + "step": 73365 + }, + { + "epoch": 10.92791182603515, + "grad_norm": 0.0027153820265084505, + "learning_rate": 2.5315185530916907e-05, + "loss": 0.0712, + "num_input_tokens_seen": 42562096, + "step": 73370 + }, + { + "epoch": 10.92865653857611, + "grad_norm": 152.57835388183594, + "learning_rate": 2.5311936365061804e-05, + "loss": 0.0164, + "num_input_tokens_seen": 42564880, + "step": 73375 + }, + { + "epoch": 10.929401251117069, + "grad_norm": 0.035898033529520035, + "learning_rate": 2.530868719393685e-05, + "loss": 0.006, + "num_input_tokens_seen": 42567696, + "step": 73380 + }, + { + "epoch": 10.930145963658028, + "grad_norm": 0.02217874862253666, + "learning_rate": 2.5305438017596937e-05, + "loss": 0.2397, + "num_input_tokens_seen": 42570608, + "step": 73385 + }, + { + "epoch": 10.930890676198988, + "grad_norm": 0.25541234016418457, + "learning_rate": 2.5302188836096963e-05, + "loss": 0.3048, + "num_input_tokens_seen": 42573392, + "step": 73390 + }, + { + "epoch": 10.931635388739947, + "grad_norm": 0.01186450570821762, + "learning_rate": 2.5298939649491816e-05, + "loss": 0.0014, + "num_input_tokens_seen": 42576560, + "step": 73395 + }, + { + "epoch": 10.932380101280906, + "grad_norm": 0.11715073138475418, + "learning_rate": 2.5295690457836384e-05, + "loss": 0.0996, + "num_input_tokens_seen": 42579600, + "step": 73400 + }, + { + "epoch": 10.933124813821864, + "grad_norm": 2.617295026779175, + "learning_rate": 2.529244126118556e-05, + "loss": 0.2012, + "num_input_tokens_seen": 42582480, + "step": 73405 + }, + { + "epoch": 10.933869526362823, + "grad_norm": 0.00359434075653553, + "learning_rate": 2.5289192059594253e-05, + "loss": 0.2455, + "num_input_tokens_seen": 42585424, + "step": 73410 + }, + { + "epoch": 10.934614238903784, + "grad_norm": 5.710209846496582, + "learning_rate": 2.5285942853117327e-05, + "loss": 0.0298, + "num_input_tokens_seen": 42588368, + "step": 73415 + }, + { + "epoch": 10.935358951444742, + "grad_norm": 0.036474503576755524, + "learning_rate": 2.5282693641809683e-05, + "loss": 0.0055, + "num_input_tokens_seen": 42591216, + "step": 73420 + }, + { + "epoch": 10.936103663985701, + "grad_norm": 0.015708424150943756, + "learning_rate": 2.5279444425726228e-05, + "loss": 0.0005, + "num_input_tokens_seen": 42594512, + "step": 73425 + }, + { + "epoch": 10.93684837652666, + "grad_norm": 0.0054261465556919575, + "learning_rate": 2.5276195204921837e-05, + "loss": 0.0005, + "num_input_tokens_seen": 42597488, + "step": 73430 + }, + { + "epoch": 10.93759308906762, + "grad_norm": 0.0025273978244513273, + "learning_rate": 2.5272945979451413e-05, + "loss": 0.1982, + "num_input_tokens_seen": 42600528, + "step": 73435 + }, + { + "epoch": 10.93833780160858, + "grad_norm": 0.0005597209674306214, + "learning_rate": 2.5269696749369844e-05, + "loss": 0.2931, + "num_input_tokens_seen": 42603248, + "step": 73440 + }, + { + "epoch": 10.939082514149538, + "grad_norm": 0.006700800731778145, + "learning_rate": 2.5266447514732023e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42606096, + "step": 73445 + }, + { + "epoch": 10.939827226690497, + "grad_norm": 53.325714111328125, + "learning_rate": 2.5263198275592835e-05, + "loss": 0.0307, + "num_input_tokens_seen": 42609008, + "step": 73450 + }, + { + "epoch": 10.940571939231457, + "grad_norm": 0.013851727358996868, + "learning_rate": 2.5259949032007186e-05, + "loss": 0.1869, + "num_input_tokens_seen": 42611760, + "step": 73455 + }, + { + "epoch": 10.941316651772416, + "grad_norm": 0.0013737056870013475, + "learning_rate": 2.5256699784029958e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42614768, + "step": 73460 + }, + { + "epoch": 10.942061364313375, + "grad_norm": 0.010967021808028221, + "learning_rate": 2.525345053171605e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42617552, + "step": 73465 + }, + { + "epoch": 10.942806076854334, + "grad_norm": 0.003258069744333625, + "learning_rate": 2.525020127512035e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42620336, + "step": 73470 + }, + { + "epoch": 10.943550789395294, + "grad_norm": 0.021393854171037674, + "learning_rate": 2.524695201429776e-05, + "loss": 0.138, + "num_input_tokens_seen": 42623152, + "step": 73475 + }, + { + "epoch": 10.944295501936253, + "grad_norm": 1.700624942779541, + "learning_rate": 2.5243702749303173e-05, + "loss": 0.0572, + "num_input_tokens_seen": 42625968, + "step": 73480 + }, + { + "epoch": 10.945040214477212, + "grad_norm": 0.5094318389892578, + "learning_rate": 2.5240453480191463e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42628816, + "step": 73485 + }, + { + "epoch": 10.94578492701817, + "grad_norm": 0.015793221071362495, + "learning_rate": 2.5237204207017533e-05, + "loss": 0.1043, + "num_input_tokens_seen": 42631792, + "step": 73490 + }, + { + "epoch": 10.946529639559131, + "grad_norm": 21.31588363647461, + "learning_rate": 2.523395492983629e-05, + "loss": 0.0017, + "num_input_tokens_seen": 42634544, + "step": 73495 + }, + { + "epoch": 10.94727435210009, + "grad_norm": 0.27655526995658875, + "learning_rate": 2.5230705648702608e-05, + "loss": 0.2699, + "num_input_tokens_seen": 42637488, + "step": 73500 + }, + { + "epoch": 10.948019064641048, + "grad_norm": 91.186767578125, + "learning_rate": 2.52274563636714e-05, + "loss": 0.5025, + "num_input_tokens_seen": 42640176, + "step": 73505 + }, + { + "epoch": 10.948763777182007, + "grad_norm": 8.991656303405762, + "learning_rate": 2.5224207074797533e-05, + "loss": 0.0514, + "num_input_tokens_seen": 42642960, + "step": 73510 + }, + { + "epoch": 10.949508489722968, + "grad_norm": 0.007097653578966856, + "learning_rate": 2.522095778213593e-05, + "loss": 0.0075, + "num_input_tokens_seen": 42646192, + "step": 73515 + }, + { + "epoch": 10.950253202263927, + "grad_norm": 19.699554443359375, + "learning_rate": 2.5217708485741458e-05, + "loss": 0.1219, + "num_input_tokens_seen": 42649424, + "step": 73520 + }, + { + "epoch": 10.950997914804885, + "grad_norm": 0.12698067724704742, + "learning_rate": 2.5214459185669028e-05, + "loss": 0.1323, + "num_input_tokens_seen": 42652784, + "step": 73525 + }, + { + "epoch": 10.951742627345844, + "grad_norm": 50.20087814331055, + "learning_rate": 2.5211209881973525e-05, + "loss": 0.1821, + "num_input_tokens_seen": 42655856, + "step": 73530 + }, + { + "epoch": 10.952487339886805, + "grad_norm": 0.002100586425513029, + "learning_rate": 2.5207960574709843e-05, + "loss": 0.3175, + "num_input_tokens_seen": 42658960, + "step": 73535 + }, + { + "epoch": 10.953232052427763, + "grad_norm": 0.010964876040816307, + "learning_rate": 2.520471126393289e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42661808, + "step": 73540 + }, + { + "epoch": 10.953976764968722, + "grad_norm": 0.0016726321773603559, + "learning_rate": 2.5201461949697534e-05, + "loss": 0.0016, + "num_input_tokens_seen": 42664656, + "step": 73545 + }, + { + "epoch": 10.95472147750968, + "grad_norm": 0.014532710425555706, + "learning_rate": 2.5198212632058694e-05, + "loss": 0.0009, + "num_input_tokens_seen": 42667728, + "step": 73550 + }, + { + "epoch": 10.95546619005064, + "grad_norm": 0.015238662250339985, + "learning_rate": 2.519496331107125e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42670480, + "step": 73555 + }, + { + "epoch": 10.9562109025916, + "grad_norm": 49.39706039428711, + "learning_rate": 2.51917139867901e-05, + "loss": 0.1534, + "num_input_tokens_seen": 42673008, + "step": 73560 + }, + { + "epoch": 10.956955615132559, + "grad_norm": 0.042121902108192444, + "learning_rate": 2.5188464659270133e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42676016, + "step": 73565 + }, + { + "epoch": 10.957700327673518, + "grad_norm": 208.54275512695312, + "learning_rate": 2.5185215328566247e-05, + "loss": 0.1759, + "num_input_tokens_seen": 42678928, + "step": 73570 + }, + { + "epoch": 10.958445040214476, + "grad_norm": 0.0005751871503889561, + "learning_rate": 2.5181965994733343e-05, + "loss": 0.1379, + "num_input_tokens_seen": 42681872, + "step": 73575 + }, + { + "epoch": 10.959189752755437, + "grad_norm": 0.6531841158866882, + "learning_rate": 2.5178716657826302e-05, + "loss": 0.0005, + "num_input_tokens_seen": 42684528, + "step": 73580 + }, + { + "epoch": 10.959934465296396, + "grad_norm": 0.009845231659710407, + "learning_rate": 2.5175467317900026e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42687216, + "step": 73585 + }, + { + "epoch": 10.960679177837354, + "grad_norm": 0.006332834251224995, + "learning_rate": 2.517221797500941e-05, + "loss": 0.1226, + "num_input_tokens_seen": 42690160, + "step": 73590 + }, + { + "epoch": 10.961423890378313, + "grad_norm": 314.24542236328125, + "learning_rate": 2.516896862920935e-05, + "loss": 0.5343, + "num_input_tokens_seen": 42693200, + "step": 73595 + }, + { + "epoch": 10.962168602919274, + "grad_norm": 0.00801398977637291, + "learning_rate": 2.5165719280554728e-05, + "loss": 0.0403, + "num_input_tokens_seen": 42695792, + "step": 73600 + }, + { + "epoch": 10.962913315460233, + "grad_norm": 0.011168669909238815, + "learning_rate": 2.5162469929100452e-05, + "loss": 0.163, + "num_input_tokens_seen": 42698832, + "step": 73605 + }, + { + "epoch": 10.963658028001191, + "grad_norm": 0.005735842511057854, + "learning_rate": 2.5159220574901417e-05, + "loss": 0.0005, + "num_input_tokens_seen": 42702000, + "step": 73610 + }, + { + "epoch": 10.96440274054215, + "grad_norm": 0.007815989665687084, + "learning_rate": 2.5155971218012503e-05, + "loss": 0.0112, + "num_input_tokens_seen": 42704880, + "step": 73615 + }, + { + "epoch": 10.96514745308311, + "grad_norm": 34.61449432373047, + "learning_rate": 2.5152721858488615e-05, + "loss": 0.0407, + "num_input_tokens_seen": 42707568, + "step": 73620 + }, + { + "epoch": 10.96589216562407, + "grad_norm": 0.016094038262963295, + "learning_rate": 2.5149472496384645e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42710320, + "step": 73625 + }, + { + "epoch": 10.966636878165028, + "grad_norm": 0.08529076725244522, + "learning_rate": 2.5146223131755493e-05, + "loss": 0.0025, + "num_input_tokens_seen": 42712912, + "step": 73630 + }, + { + "epoch": 10.967381590705987, + "grad_norm": 0.02928822673857212, + "learning_rate": 2.514297376465605e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42716208, + "step": 73635 + }, + { + "epoch": 10.968126303246947, + "grad_norm": 0.010629817843437195, + "learning_rate": 2.5139724395141207e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42719248, + "step": 73640 + }, + { + "epoch": 10.968871015787906, + "grad_norm": 0.5528386831283569, + "learning_rate": 2.513647502326587e-05, + "loss": 0.0087, + "num_input_tokens_seen": 42722288, + "step": 73645 + }, + { + "epoch": 10.969615728328865, + "grad_norm": 0.00669999560341239, + "learning_rate": 2.513322564908492e-05, + "loss": 0.4903, + "num_input_tokens_seen": 42725040, + "step": 73650 + }, + { + "epoch": 10.970360440869824, + "grad_norm": 0.0014225341146811843, + "learning_rate": 2.512997627265326e-05, + "loss": 0.1284, + "num_input_tokens_seen": 42728080, + "step": 73655 + }, + { + "epoch": 10.971105153410784, + "grad_norm": 0.11061467975378036, + "learning_rate": 2.5126726894025782e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42731312, + "step": 73660 + }, + { + "epoch": 10.971849865951743, + "grad_norm": 8.101181030273438, + "learning_rate": 2.5123477513257376e-05, + "loss": 0.0237, + "num_input_tokens_seen": 42734032, + "step": 73665 + }, + { + "epoch": 10.972594578492702, + "grad_norm": 0.04611128941178322, + "learning_rate": 2.5120228130402955e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42737104, + "step": 73670 + }, + { + "epoch": 10.97333929103366, + "grad_norm": 0.19468289613723755, + "learning_rate": 2.5116978745517394e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42740016, + "step": 73675 + }, + { + "epoch": 10.974084003574621, + "grad_norm": 40.400108337402344, + "learning_rate": 2.5113729358655602e-05, + "loss": 0.1479, + "num_input_tokens_seen": 42742896, + "step": 73680 + }, + { + "epoch": 10.97482871611558, + "grad_norm": 0.5020964741706848, + "learning_rate": 2.5110479969872463e-05, + "loss": 0.2045, + "num_input_tokens_seen": 42746096, + "step": 73685 + }, + { + "epoch": 10.975573428656539, + "grad_norm": 0.02996632643043995, + "learning_rate": 2.510723057922288e-05, + "loss": 0.0593, + "num_input_tokens_seen": 42748944, + "step": 73690 + }, + { + "epoch": 10.976318141197497, + "grad_norm": 0.2164691686630249, + "learning_rate": 2.510398118676174e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42751664, + "step": 73695 + }, + { + "epoch": 10.977062853738456, + "grad_norm": 45.83074188232422, + "learning_rate": 2.5100731792543948e-05, + "loss": 0.0178, + "num_input_tokens_seen": 42754640, + "step": 73700 + }, + { + "epoch": 10.977807566279417, + "grad_norm": 0.0010662466520443559, + "learning_rate": 2.5097482396624393e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42758256, + "step": 73705 + }, + { + "epoch": 10.978552278820375, + "grad_norm": 0.010792466811835766, + "learning_rate": 2.5094232999057975e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42761168, + "step": 73710 + }, + { + "epoch": 10.979296991361334, + "grad_norm": 0.006349856499582529, + "learning_rate": 2.5090983599899587e-05, + "loss": 0.1772, + "num_input_tokens_seen": 42763824, + "step": 73715 + }, + { + "epoch": 10.980041703902295, + "grad_norm": 9.236644744873047, + "learning_rate": 2.508773419920412e-05, + "loss": 0.2532, + "num_input_tokens_seen": 42766864, + "step": 73720 + }, + { + "epoch": 10.980786416443253, + "grad_norm": 1.7086496353149414, + "learning_rate": 2.508448479702647e-05, + "loss": 0.001, + "num_input_tokens_seen": 42770000, + "step": 73725 + }, + { + "epoch": 10.981531128984212, + "grad_norm": 1.6003772020339966, + "learning_rate": 2.5081235393421537e-05, + "loss": 0.1281, + "num_input_tokens_seen": 42772656, + "step": 73730 + }, + { + "epoch": 10.982275841525171, + "grad_norm": 6.892874717712402, + "learning_rate": 2.507798598844422e-05, + "loss": 0.2751, + "num_input_tokens_seen": 42775504, + "step": 73735 + }, + { + "epoch": 10.98302055406613, + "grad_norm": 0.232815682888031, + "learning_rate": 2.5074736582149405e-05, + "loss": 0.0058, + "num_input_tokens_seen": 42778256, + "step": 73740 + }, + { + "epoch": 10.98376526660709, + "grad_norm": 0.4885084331035614, + "learning_rate": 2.507148717459199e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42781264, + "step": 73745 + }, + { + "epoch": 10.984509979148049, + "grad_norm": 0.007508234586566687, + "learning_rate": 2.5068237765826875e-05, + "loss": 0.0387, + "num_input_tokens_seen": 42784080, + "step": 73750 + }, + { + "epoch": 10.985254691689008, + "grad_norm": 0.12814562022686005, + "learning_rate": 2.5064988355908952e-05, + "loss": 0.1547, + "num_input_tokens_seen": 42786800, + "step": 73755 + }, + { + "epoch": 10.985999404229966, + "grad_norm": 0.011164652183651924, + "learning_rate": 2.5061738944893115e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42789776, + "step": 73760 + }, + { + "epoch": 10.986744116770927, + "grad_norm": 14.679363250732422, + "learning_rate": 2.5058489532834262e-05, + "loss": 0.0766, + "num_input_tokens_seen": 42792848, + "step": 73765 + }, + { + "epoch": 10.987488829311886, + "grad_norm": 0.0005045998259447515, + "learning_rate": 2.5055240119787287e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42795888, + "step": 73770 + }, + { + "epoch": 10.988233541852845, + "grad_norm": 0.0028306206222623587, + "learning_rate": 2.5051990705807092e-05, + "loss": 0.177, + "num_input_tokens_seen": 42799184, + "step": 73775 + }, + { + "epoch": 10.988978254393803, + "grad_norm": 201.7808837890625, + "learning_rate": 2.504874129094856e-05, + "loss": 0.0457, + "num_input_tokens_seen": 42802064, + "step": 73780 + }, + { + "epoch": 10.989722966934764, + "grad_norm": 0.007132799830287695, + "learning_rate": 2.504549187526659e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42804784, + "step": 73785 + }, + { + "epoch": 10.990467679475723, + "grad_norm": 0.3624112904071808, + "learning_rate": 2.504224245881609e-05, + "loss": 0.2326, + "num_input_tokens_seen": 42807472, + "step": 73790 + }, + { + "epoch": 10.991212392016681, + "grad_norm": 0.027772199362516403, + "learning_rate": 2.5038993041651947e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42810640, + "step": 73795 + }, + { + "epoch": 10.99195710455764, + "grad_norm": 0.024942565709352493, + "learning_rate": 2.503574362382905e-05, + "loss": 0.0042, + "num_input_tokens_seen": 42813456, + "step": 73800 + }, + { + "epoch": 10.9927018170986, + "grad_norm": 0.004317055456340313, + "learning_rate": 2.5032494205402303e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42816144, + "step": 73805 + }, + { + "epoch": 10.99344652963956, + "grad_norm": 9.700166702270508, + "learning_rate": 2.5029244786426603e-05, + "loss": 0.111, + "num_input_tokens_seen": 42818992, + "step": 73810 + }, + { + "epoch": 10.994191242180518, + "grad_norm": 0.012487540952861309, + "learning_rate": 2.5025995366956835e-05, + "loss": 0.0592, + "num_input_tokens_seen": 42821776, + "step": 73815 + }, + { + "epoch": 10.994935954721477, + "grad_norm": 0.002494879998266697, + "learning_rate": 2.5022745947047904e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42824752, + "step": 73820 + }, + { + "epoch": 10.995680667262437, + "grad_norm": 0.19772127270698547, + "learning_rate": 2.5019496526754705e-05, + "loss": 0.0797, + "num_input_tokens_seen": 42827664, + "step": 73825 + }, + { + "epoch": 10.996425379803396, + "grad_norm": 0.2149389237165451, + "learning_rate": 2.501624710613213e-05, + "loss": 0.0009, + "num_input_tokens_seen": 42830704, + "step": 73830 + }, + { + "epoch": 10.997170092344355, + "grad_norm": 0.013914906419813633, + "learning_rate": 2.501299768523508e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42833424, + "step": 73835 + }, + { + "epoch": 10.997914804885314, + "grad_norm": 2.170823335647583, + "learning_rate": 2.5009748264118442e-05, + "loss": 0.0017, + "num_input_tokens_seen": 42836496, + "step": 73840 + }, + { + "epoch": 10.998659517426274, + "grad_norm": 0.00496708182618022, + "learning_rate": 2.500649884283713e-05, + "loss": 0.0314, + "num_input_tokens_seen": 42839184, + "step": 73845 + }, + { + "epoch": 10.999404229967233, + "grad_norm": 0.15966752171516418, + "learning_rate": 2.5003249421446012e-05, + "loss": 0.0184, + "num_input_tokens_seen": 42842416, + "step": 73850 + }, + { + "epoch": 11.0, + "eval_loss": 1.8252243995666504, + "eval_runtime": 51.2583, + "eval_samples_per_second": 58.215, + "eval_steps_per_second": 14.554, + "num_input_tokens_seen": 42844400, + "step": 73854 + }, + { + "epoch": 11.000148942508192, + "grad_norm": 0.012329431250691414, + "learning_rate": 2.5e-05, + "loss": 0.006, + "num_input_tokens_seen": 42845104, + "step": 73855 + }, + { + "epoch": 11.00089365504915, + "grad_norm": 0.0009801952401176095, + "learning_rate": 2.4996750578553997e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42847952, + "step": 73860 + }, + { + "epoch": 11.001638367590111, + "grad_norm": 0.002763089956715703, + "learning_rate": 2.499350115716288e-05, + "loss": 0.1784, + "num_input_tokens_seen": 42850800, + "step": 73865 + }, + { + "epoch": 11.00238308013107, + "grad_norm": 0.0013340600999072194, + "learning_rate": 2.4990251735881563e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42853648, + "step": 73870 + }, + { + "epoch": 11.003127792672029, + "grad_norm": 0.05643486976623535, + "learning_rate": 2.4987002314764926e-05, + "loss": 0.1596, + "num_input_tokens_seen": 42856656, + "step": 73875 + }, + { + "epoch": 11.003872505212987, + "grad_norm": 0.07716754078865051, + "learning_rate": 2.4983752893867877e-05, + "loss": 0.0109, + "num_input_tokens_seen": 42859504, + "step": 73880 + }, + { + "epoch": 11.004617217753948, + "grad_norm": 0.004730162210762501, + "learning_rate": 2.4980503473245298e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42862352, + "step": 73885 + }, + { + "epoch": 11.005361930294907, + "grad_norm": 0.04740314185619354, + "learning_rate": 2.4977254052952102e-05, + "loss": 0.0468, + "num_input_tokens_seen": 42865104, + "step": 73890 + }, + { + "epoch": 11.006106642835865, + "grad_norm": 0.0074986182153224945, + "learning_rate": 2.4974004633043168e-05, + "loss": 0.0297, + "num_input_tokens_seen": 42867792, + "step": 73895 + }, + { + "epoch": 11.006851355376824, + "grad_norm": 0.0029734154231846333, + "learning_rate": 2.4970755213573403e-05, + "loss": 0.0041, + "num_input_tokens_seen": 42870576, + "step": 73900 + }, + { + "epoch": 11.007596067917783, + "grad_norm": 0.002750941552221775, + "learning_rate": 2.4967505794597703e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42873360, + "step": 73905 + }, + { + "epoch": 11.008340780458743, + "grad_norm": 0.0014239868614822626, + "learning_rate": 2.4964256376170954e-05, + "loss": 0.0, + "num_input_tokens_seen": 42876304, + "step": 73910 + }, + { + "epoch": 11.009085492999702, + "grad_norm": 0.000876846956089139, + "learning_rate": 2.4961006958348066e-05, + "loss": 0.0018, + "num_input_tokens_seen": 42879024, + "step": 73915 + }, + { + "epoch": 11.009830205540661, + "grad_norm": 0.006547810975462198, + "learning_rate": 2.495775754118391e-05, + "loss": 0.0007, + "num_input_tokens_seen": 42882064, + "step": 73920 + }, + { + "epoch": 11.01057491808162, + "grad_norm": 4.659055709838867, + "learning_rate": 2.4954508124733413e-05, + "loss": 0.0226, + "num_input_tokens_seen": 42885008, + "step": 73925 + }, + { + "epoch": 11.01131963062258, + "grad_norm": 0.003633841872215271, + "learning_rate": 2.495125870905144e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42887888, + "step": 73930 + }, + { + "epoch": 11.012064343163539, + "grad_norm": 0.009335838258266449, + "learning_rate": 2.4948009294192913e-05, + "loss": 0.0191, + "num_input_tokens_seen": 42890800, + "step": 73935 + }, + { + "epoch": 11.012809055704498, + "grad_norm": 0.0007144618430174887, + "learning_rate": 2.494475988021272e-05, + "loss": 0.0005, + "num_input_tokens_seen": 42893776, + "step": 73940 + }, + { + "epoch": 11.013553768245457, + "grad_norm": 0.0026411954313516617, + "learning_rate": 2.4941510467165744e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42896560, + "step": 73945 + }, + { + "epoch": 11.014298480786417, + "grad_norm": 0.005905783269554377, + "learning_rate": 2.4938261055106894e-05, + "loss": 0.0005, + "num_input_tokens_seen": 42899216, + "step": 73950 + }, + { + "epoch": 11.015043193327376, + "grad_norm": 0.005491721443831921, + "learning_rate": 2.493501164409105e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42902000, + "step": 73955 + }, + { + "epoch": 11.015787905868335, + "grad_norm": 0.17204709351062775, + "learning_rate": 2.493176223417313e-05, + "loss": 0.0007, + "num_input_tokens_seen": 42904912, + "step": 73960 + }, + { + "epoch": 11.016532618409293, + "grad_norm": 0.004628367256373167, + "learning_rate": 2.4928512825408006e-05, + "loss": 0.1538, + "num_input_tokens_seen": 42907760, + "step": 73965 + }, + { + "epoch": 11.017277330950254, + "grad_norm": 0.00021517634741030633, + "learning_rate": 2.4925263417850598e-05, + "loss": 0.0012, + "num_input_tokens_seen": 42910512, + "step": 73970 + }, + { + "epoch": 11.018022043491213, + "grad_norm": 0.002401552861556411, + "learning_rate": 2.4922014011555784e-05, + "loss": 0.0012, + "num_input_tokens_seen": 42913456, + "step": 73975 + }, + { + "epoch": 11.018766756032171, + "grad_norm": 1.794092059135437, + "learning_rate": 2.4918764606578465e-05, + "loss": 0.0008, + "num_input_tokens_seen": 42916368, + "step": 73980 + }, + { + "epoch": 11.01951146857313, + "grad_norm": 0.008753074333071709, + "learning_rate": 2.491551520297354e-05, + "loss": 0.0763, + "num_input_tokens_seen": 42919152, + "step": 73985 + }, + { + "epoch": 11.02025618111409, + "grad_norm": 0.004605482332408428, + "learning_rate": 2.4912265800795885e-05, + "loss": 0.0706, + "num_input_tokens_seen": 42922032, + "step": 73990 + }, + { + "epoch": 11.02100089365505, + "grad_norm": 0.03460698947310448, + "learning_rate": 2.4909016400100423e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42925008, + "step": 73995 + }, + { + "epoch": 11.021745606196008, + "grad_norm": 0.013822389766573906, + "learning_rate": 2.490576700094203e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42927920, + "step": 74000 + }, + { + "epoch": 11.022490318736967, + "grad_norm": 0.01006000954657793, + "learning_rate": 2.490251760337561e-05, + "loss": 0.074, + "num_input_tokens_seen": 42931120, + "step": 74005 + }, + { + "epoch": 11.023235031277927, + "grad_norm": 0.00196037907153368, + "learning_rate": 2.4899268207456055e-05, + "loss": 0.1453, + "num_input_tokens_seen": 42933808, + "step": 74010 + }, + { + "epoch": 11.023979743818886, + "grad_norm": 0.11643645912408829, + "learning_rate": 2.4896018813238263e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42936720, + "step": 74015 + }, + { + "epoch": 11.024724456359845, + "grad_norm": 0.009168465621769428, + "learning_rate": 2.4892769420777134e-05, + "loss": 0.0006, + "num_input_tokens_seen": 42939952, + "step": 74020 + }, + { + "epoch": 11.025469168900804, + "grad_norm": 0.0003715700877364725, + "learning_rate": 2.4889520030127543e-05, + "loss": 0.0376, + "num_input_tokens_seen": 42942864, + "step": 74025 + }, + { + "epoch": 11.026213881441764, + "grad_norm": 0.0090469466522336, + "learning_rate": 2.488627064134441e-05, + "loss": 0.4846, + "num_input_tokens_seen": 42945744, + "step": 74030 + }, + { + "epoch": 11.026958593982723, + "grad_norm": 0.010513672605156898, + "learning_rate": 2.4883021254482612e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42948688, + "step": 74035 + }, + { + "epoch": 11.027703306523682, + "grad_norm": 0.000587046961300075, + "learning_rate": 2.487977186959705e-05, + "loss": 0.0, + "num_input_tokens_seen": 42951344, + "step": 74040 + }, + { + "epoch": 11.02844801906464, + "grad_norm": 0.006475236266851425, + "learning_rate": 2.487652248674262e-05, + "loss": 0.0004, + "num_input_tokens_seen": 42953968, + "step": 74045 + }, + { + "epoch": 11.029192731605601, + "grad_norm": 0.008357100188732147, + "learning_rate": 2.4873273105974227e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42956944, + "step": 74050 + }, + { + "epoch": 11.02993744414656, + "grad_norm": 104.29850769042969, + "learning_rate": 2.487002372734674e-05, + "loss": 0.2539, + "num_input_tokens_seen": 42960016, + "step": 74055 + }, + { + "epoch": 11.030682156687519, + "grad_norm": 0.0034814318642020226, + "learning_rate": 2.4866774350915084e-05, + "loss": 0.0045, + "num_input_tokens_seen": 42963088, + "step": 74060 + }, + { + "epoch": 11.031426869228477, + "grad_norm": 0.008529135026037693, + "learning_rate": 2.486352497673414e-05, + "loss": 0.0021, + "num_input_tokens_seen": 42965808, + "step": 74065 + }, + { + "epoch": 11.032171581769436, + "grad_norm": 0.0038259441498667, + "learning_rate": 2.4860275604858796e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42968912, + "step": 74070 + }, + { + "epoch": 11.032916294310397, + "grad_norm": 0.00464564049616456, + "learning_rate": 2.485702623534396e-05, + "loss": 0.2157, + "num_input_tokens_seen": 42971888, + "step": 74075 + }, + { + "epoch": 11.033661006851355, + "grad_norm": 0.02755182608962059, + "learning_rate": 2.485377686824451e-05, + "loss": 0.0093, + "num_input_tokens_seen": 42975056, + "step": 74080 + }, + { + "epoch": 11.034405719392314, + "grad_norm": 0.0009123384952545166, + "learning_rate": 2.485052750361536e-05, + "loss": 0.0915, + "num_input_tokens_seen": 42977904, + "step": 74085 + }, + { + "epoch": 11.035150431933273, + "grad_norm": 0.01029383484274149, + "learning_rate": 2.4847278141511387e-05, + "loss": 0.0003, + "num_input_tokens_seen": 42980496, + "step": 74090 + }, + { + "epoch": 11.035895144474233, + "grad_norm": 0.020767854526638985, + "learning_rate": 2.4844028781987506e-05, + "loss": 0.1193, + "num_input_tokens_seen": 42983440, + "step": 74095 + }, + { + "epoch": 11.036639857015192, + "grad_norm": 36.821800231933594, + "learning_rate": 2.48407794250986e-05, + "loss": 0.2094, + "num_input_tokens_seen": 42986160, + "step": 74100 + }, + { + "epoch": 11.037384569556151, + "grad_norm": 0.20572467148303986, + "learning_rate": 2.4837530070899557e-05, + "loss": 0.0005, + "num_input_tokens_seen": 42988720, + "step": 74105 + }, + { + "epoch": 11.03812928209711, + "grad_norm": 0.04408067464828491, + "learning_rate": 2.483428071944528e-05, + "loss": 0.0002, + "num_input_tokens_seen": 42991440, + "step": 74110 + }, + { + "epoch": 11.03887399463807, + "grad_norm": 0.003333666129037738, + "learning_rate": 2.483103137079066e-05, + "loss": 0.0001, + "num_input_tokens_seen": 42994352, + "step": 74115 + }, + { + "epoch": 11.039618707179029, + "grad_norm": 0.0011294480646029115, + "learning_rate": 2.4827782024990596e-05, + "loss": 0.0775, + "num_input_tokens_seen": 42997456, + "step": 74120 + }, + { + "epoch": 11.040363419719988, + "grad_norm": 0.002434808062389493, + "learning_rate": 2.4824532682099973e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43000240, + "step": 74125 + }, + { + "epoch": 11.041108132260947, + "grad_norm": 0.09528953582048416, + "learning_rate": 2.48212833421737e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43003312, + "step": 74130 + }, + { + "epoch": 11.041852844801907, + "grad_norm": 5.383506774902344, + "learning_rate": 2.4818034005266663e-05, + "loss": 0.159, + "num_input_tokens_seen": 43006096, + "step": 74135 + }, + { + "epoch": 11.042597557342866, + "grad_norm": 32.23161697387695, + "learning_rate": 2.481478467143376e-05, + "loss": 0.2549, + "num_input_tokens_seen": 43009232, + "step": 74140 + }, + { + "epoch": 11.043342269883825, + "grad_norm": 0.005216635297983885, + "learning_rate": 2.4811535340729876e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43012496, + "step": 74145 + }, + { + "epoch": 11.044086982424783, + "grad_norm": 0.015840867534279823, + "learning_rate": 2.4808286013209905e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43015312, + "step": 74150 + }, + { + "epoch": 11.044831694965744, + "grad_norm": 0.00982147827744484, + "learning_rate": 2.4805036688928758e-05, + "loss": 0.0008, + "num_input_tokens_seen": 43018192, + "step": 74155 + }, + { + "epoch": 11.045576407506703, + "grad_norm": 31.737529754638672, + "learning_rate": 2.4801787367941305e-05, + "loss": 0.0312, + "num_input_tokens_seen": 43021008, + "step": 74160 + }, + { + "epoch": 11.046321120047661, + "grad_norm": 145.87942504882812, + "learning_rate": 2.4798538050302468e-05, + "loss": 0.0886, + "num_input_tokens_seen": 43024112, + "step": 74165 + }, + { + "epoch": 11.04706583258862, + "grad_norm": 145.25961303710938, + "learning_rate": 2.4795288736067118e-05, + "loss": 0.1502, + "num_input_tokens_seen": 43026992, + "step": 74170 + }, + { + "epoch": 11.04781054512958, + "grad_norm": 0.006565435789525509, + "learning_rate": 2.4792039425290163e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43029744, + "step": 74175 + }, + { + "epoch": 11.04855525767054, + "grad_norm": 0.0001741445594234392, + "learning_rate": 2.4788790118026487e-05, + "loss": 0.0006, + "num_input_tokens_seen": 43032496, + "step": 74180 + }, + { + "epoch": 11.049299970211498, + "grad_norm": 0.002486143261194229, + "learning_rate": 2.4785540814330978e-05, + "loss": 0.0142, + "num_input_tokens_seen": 43035408, + "step": 74185 + }, + { + "epoch": 11.050044682752457, + "grad_norm": 0.0013337793061509728, + "learning_rate": 2.478229151425855e-05, + "loss": 0.0018, + "num_input_tokens_seen": 43038352, + "step": 74190 + }, + { + "epoch": 11.050789395293418, + "grad_norm": 0.018881814554333687, + "learning_rate": 2.4779042217864077e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43041264, + "step": 74195 + }, + { + "epoch": 11.051534107834376, + "grad_norm": 0.01765117608010769, + "learning_rate": 2.477579292520247e-05, + "loss": 0.0, + "num_input_tokens_seen": 43043952, + "step": 74200 + }, + { + "epoch": 11.052278820375335, + "grad_norm": 92.45701599121094, + "learning_rate": 2.477254363632861e-05, + "loss": 0.0946, + "num_input_tokens_seen": 43047216, + "step": 74205 + }, + { + "epoch": 11.053023532916294, + "grad_norm": 0.15025387704372406, + "learning_rate": 2.4769294351297398e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43049936, + "step": 74210 + }, + { + "epoch": 11.053768245457254, + "grad_norm": 8.831930160522461, + "learning_rate": 2.4766045070163713e-05, + "loss": 0.1814, + "num_input_tokens_seen": 43053104, + "step": 74215 + }, + { + "epoch": 11.054512957998213, + "grad_norm": 0.01587093248963356, + "learning_rate": 2.476279579298247e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43055856, + "step": 74220 + }, + { + "epoch": 11.055257670539172, + "grad_norm": 0.005842780228704214, + "learning_rate": 2.475954651980855e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43058704, + "step": 74225 + }, + { + "epoch": 11.05600238308013, + "grad_norm": 0.022406693547964096, + "learning_rate": 2.4756297250696837e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43061776, + "step": 74230 + }, + { + "epoch": 11.056747095621091, + "grad_norm": 0.002624309156090021, + "learning_rate": 2.4753047985702243e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43064816, + "step": 74235 + }, + { + "epoch": 11.05749180816205, + "grad_norm": 0.013400896452367306, + "learning_rate": 2.474979872487965e-05, + "loss": 0.1751, + "num_input_tokens_seen": 43067664, + "step": 74240 + }, + { + "epoch": 11.058236520703009, + "grad_norm": 0.010883960872888565, + "learning_rate": 2.474654946828396e-05, + "loss": 0.1564, + "num_input_tokens_seen": 43070640, + "step": 74245 + }, + { + "epoch": 11.058981233243967, + "grad_norm": 0.0012774023925885558, + "learning_rate": 2.474330021597004e-05, + "loss": 0.1128, + "num_input_tokens_seen": 43073360, + "step": 74250 + }, + { + "epoch": 11.059725945784926, + "grad_norm": 0.0020960732363164425, + "learning_rate": 2.474005096799282e-05, + "loss": 0.0285, + "num_input_tokens_seen": 43076144, + "step": 74255 + }, + { + "epoch": 11.060470658325887, + "grad_norm": 0.011852975934743881, + "learning_rate": 2.4736801724407174e-05, + "loss": 0.0044, + "num_input_tokens_seen": 43079056, + "step": 74260 + }, + { + "epoch": 11.061215370866845, + "grad_norm": 13.239788055419922, + "learning_rate": 2.4733552485267983e-05, + "loss": 0.3689, + "num_input_tokens_seen": 43081936, + "step": 74265 + }, + { + "epoch": 11.061960083407804, + "grad_norm": 0.007225588895380497, + "learning_rate": 2.4730303250630165e-05, + "loss": 0.0589, + "num_input_tokens_seen": 43084880, + "step": 74270 + }, + { + "epoch": 11.062704795948763, + "grad_norm": 0.1913686990737915, + "learning_rate": 2.4727054020548592e-05, + "loss": 0.1567, + "num_input_tokens_seen": 43087856, + "step": 74275 + }, + { + "epoch": 11.063449508489724, + "grad_norm": 0.052931200712919235, + "learning_rate": 2.4723804795078172e-05, + "loss": 0.1442, + "num_input_tokens_seen": 43090928, + "step": 74280 + }, + { + "epoch": 11.064194221030682, + "grad_norm": 0.0678781047463417, + "learning_rate": 2.4720555574273775e-05, + "loss": 0.2378, + "num_input_tokens_seen": 43094000, + "step": 74285 + }, + { + "epoch": 11.064938933571641, + "grad_norm": 0.0025508380495011806, + "learning_rate": 2.471730635819032e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43097040, + "step": 74290 + }, + { + "epoch": 11.0656836461126, + "grad_norm": 0.006938622798770666, + "learning_rate": 2.4714057146882676e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43099952, + "step": 74295 + }, + { + "epoch": 11.06642835865356, + "grad_norm": 0.01330790389329195, + "learning_rate": 2.4710807940405756e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43102768, + "step": 74300 + }, + { + "epoch": 11.067173071194519, + "grad_norm": 0.003721937770023942, + "learning_rate": 2.470755873881444e-05, + "loss": 0.0089, + "num_input_tokens_seen": 43105648, + "step": 74305 + }, + { + "epoch": 11.067917783735478, + "grad_norm": 0.009127799421548843, + "learning_rate": 2.470430954216362e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43108432, + "step": 74310 + }, + { + "epoch": 11.068662496276437, + "grad_norm": 74.3729248046875, + "learning_rate": 2.4701060350508194e-05, + "loss": 0.1566, + "num_input_tokens_seen": 43111472, + "step": 74315 + }, + { + "epoch": 11.069407208817397, + "grad_norm": 0.008742686361074448, + "learning_rate": 2.4697811163903036e-05, + "loss": 0.0026, + "num_input_tokens_seen": 43114192, + "step": 74320 + }, + { + "epoch": 11.070151921358356, + "grad_norm": 0.7562637329101562, + "learning_rate": 2.4694561982403065e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43117072, + "step": 74325 + }, + { + "epoch": 11.070896633899315, + "grad_norm": 0.025792105123400688, + "learning_rate": 2.4691312806063154e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43119984, + "step": 74330 + }, + { + "epoch": 11.071641346440273, + "grad_norm": 0.9577749967575073, + "learning_rate": 2.4688063634938198e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43122608, + "step": 74335 + }, + { + "epoch": 11.072386058981234, + "grad_norm": 0.012896085157990456, + "learning_rate": 2.46848144690831e-05, + "loss": 0.008, + "num_input_tokens_seen": 43125424, + "step": 74340 + }, + { + "epoch": 11.073130771522193, + "grad_norm": 0.004284312482923269, + "learning_rate": 2.468156530855273e-05, + "loss": 0.0108, + "num_input_tokens_seen": 43128080, + "step": 74345 + }, + { + "epoch": 11.073875484063151, + "grad_norm": 0.0016523887170478702, + "learning_rate": 2.4678316153401994e-05, + "loss": 0.0423, + "num_input_tokens_seen": 43130768, + "step": 74350 + }, + { + "epoch": 11.07462019660411, + "grad_norm": 0.0007738754502497613, + "learning_rate": 2.4675067003685776e-05, + "loss": 0.1456, + "num_input_tokens_seen": 43133616, + "step": 74355 + }, + { + "epoch": 11.07536490914507, + "grad_norm": 0.0027305292896926403, + "learning_rate": 2.4671817859458974e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43136560, + "step": 74360 + }, + { + "epoch": 11.07610962168603, + "grad_norm": 85.83502197265625, + "learning_rate": 2.4668568720776478e-05, + "loss": 0.0467, + "num_input_tokens_seen": 43139280, + "step": 74365 + }, + { + "epoch": 11.076854334226988, + "grad_norm": 0.008978687226772308, + "learning_rate": 2.466531958769317e-05, + "loss": 0.013, + "num_input_tokens_seen": 43142256, + "step": 74370 + }, + { + "epoch": 11.077599046767947, + "grad_norm": 0.007816578261554241, + "learning_rate": 2.466207046026395e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43145040, + "step": 74375 + }, + { + "epoch": 11.078343759308908, + "grad_norm": 0.003731310134753585, + "learning_rate": 2.4658821338543713e-05, + "loss": 0.0927, + "num_input_tokens_seen": 43147952, + "step": 74380 + }, + { + "epoch": 11.079088471849866, + "grad_norm": 0.021545004099607468, + "learning_rate": 2.465557222258734e-05, + "loss": 0.0146, + "num_input_tokens_seen": 43150704, + "step": 74385 + }, + { + "epoch": 11.079833184390825, + "grad_norm": 0.002561482833698392, + "learning_rate": 2.4652323112449716e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43153328, + "step": 74390 + }, + { + "epoch": 11.080577896931784, + "grad_norm": 0.01918194256722927, + "learning_rate": 2.464907400818575e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43156336, + "step": 74395 + }, + { + "epoch": 11.081322609472744, + "grad_norm": 0.000843303743749857, + "learning_rate": 2.4645824909850316e-05, + "loss": 0.1103, + "num_input_tokens_seen": 43159408, + "step": 74400 + }, + { + "epoch": 11.082067322013703, + "grad_norm": 22.356714248657227, + "learning_rate": 2.4642575817498313e-05, + "loss": 0.0709, + "num_input_tokens_seen": 43161936, + "step": 74405 + }, + { + "epoch": 11.082812034554662, + "grad_norm": 0.03985777869820595, + "learning_rate": 2.463932673118462e-05, + "loss": 0.0024, + "num_input_tokens_seen": 43164624, + "step": 74410 + }, + { + "epoch": 11.08355674709562, + "grad_norm": 0.009035813622176647, + "learning_rate": 2.463607765096415e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43167856, + "step": 74415 + }, + { + "epoch": 11.08430145963658, + "grad_norm": 0.003959218971431255, + "learning_rate": 2.4632828576891774e-05, + "loss": 0.163, + "num_input_tokens_seen": 43170576, + "step": 74420 + }, + { + "epoch": 11.08504617217754, + "grad_norm": 0.018885599449276924, + "learning_rate": 2.4629579509022374e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43173424, + "step": 74425 + }, + { + "epoch": 11.085790884718499, + "grad_norm": 0.07254242151975632, + "learning_rate": 2.4626330447410864e-05, + "loss": 0.0005, + "num_input_tokens_seen": 43176560, + "step": 74430 + }, + { + "epoch": 11.086535597259457, + "grad_norm": 0.0035155219957232475, + "learning_rate": 2.4623081392112117e-05, + "loss": 0.0691, + "num_input_tokens_seen": 43179696, + "step": 74435 + }, + { + "epoch": 11.087280309800416, + "grad_norm": 0.03912462294101715, + "learning_rate": 2.4619832343181028e-05, + "loss": 0.03, + "num_input_tokens_seen": 43182576, + "step": 74440 + }, + { + "epoch": 11.088025022341377, + "grad_norm": 156.07891845703125, + "learning_rate": 2.461658330067248e-05, + "loss": 0.08, + "num_input_tokens_seen": 43185680, + "step": 74445 + }, + { + "epoch": 11.088769734882336, + "grad_norm": 0.013596048578619957, + "learning_rate": 2.4613334264641373e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43188496, + "step": 74450 + }, + { + "epoch": 11.089514447423294, + "grad_norm": 0.3549831211566925, + "learning_rate": 2.4610085235142586e-05, + "loss": 0.0006, + "num_input_tokens_seen": 43191184, + "step": 74455 + }, + { + "epoch": 11.090259159964253, + "grad_norm": 0.003706513438373804, + "learning_rate": 2.4606836212231018e-05, + "loss": 0.0006, + "num_input_tokens_seen": 43193872, + "step": 74460 + }, + { + "epoch": 11.091003872505214, + "grad_norm": 0.0028391822706907988, + "learning_rate": 2.4603587195961554e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43196688, + "step": 74465 + }, + { + "epoch": 11.091748585046172, + "grad_norm": 0.005293973721563816, + "learning_rate": 2.460033818638908e-05, + "loss": 0.0, + "num_input_tokens_seen": 43199376, + "step": 74470 + }, + { + "epoch": 11.092493297587131, + "grad_norm": 0.19228005409240723, + "learning_rate": 2.4597089183568488e-05, + "loss": 0.1759, + "num_input_tokens_seen": 43202064, + "step": 74475 + }, + { + "epoch": 11.09323801012809, + "grad_norm": 0.043704260140657425, + "learning_rate": 2.4593840187554654e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43204912, + "step": 74480 + }, + { + "epoch": 11.09398272266905, + "grad_norm": 0.003799441969022155, + "learning_rate": 2.459059119840249e-05, + "loss": 0.1782, + "num_input_tokens_seen": 43207600, + "step": 74485 + }, + { + "epoch": 11.09472743521001, + "grad_norm": 0.007215180434286594, + "learning_rate": 2.4587342216166856e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43210704, + "step": 74490 + }, + { + "epoch": 11.095472147750968, + "grad_norm": 0.004420743323862553, + "learning_rate": 2.458409324090267e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43213680, + "step": 74495 + }, + { + "epoch": 11.096216860291927, + "grad_norm": 0.017043111845850945, + "learning_rate": 2.45808442726648e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43216656, + "step": 74500 + }, + { + "epoch": 11.096961572832887, + "grad_norm": 0.004121035803109407, + "learning_rate": 2.4577595311508143e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43219568, + "step": 74505 + }, + { + "epoch": 11.097706285373846, + "grad_norm": 0.0009097744477912784, + "learning_rate": 2.4574346357487588e-05, + "loss": 0.1191, + "num_input_tokens_seen": 43222288, + "step": 74510 + }, + { + "epoch": 11.098450997914805, + "grad_norm": 0.00578375905752182, + "learning_rate": 2.4571097410658006e-05, + "loss": 0.0035, + "num_input_tokens_seen": 43225200, + "step": 74515 + }, + { + "epoch": 11.099195710455763, + "grad_norm": 0.001601236523129046, + "learning_rate": 2.4567848471074307e-05, + "loss": 0.2983, + "num_input_tokens_seen": 43228112, + "step": 74520 + }, + { + "epoch": 11.099940422996724, + "grad_norm": 0.006381817162036896, + "learning_rate": 2.4564599538791362e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43231088, + "step": 74525 + }, + { + "epoch": 11.100685135537683, + "grad_norm": 0.5157651901245117, + "learning_rate": 2.4561350613864074e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43233968, + "step": 74530 + }, + { + "epoch": 11.101429848078642, + "grad_norm": 0.00862610898911953, + "learning_rate": 2.4558101696347315e-05, + "loss": 0.0495, + "num_input_tokens_seen": 43236816, + "step": 74535 + }, + { + "epoch": 11.1021745606196, + "grad_norm": 0.17004446685314178, + "learning_rate": 2.4554852786295985e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43239568, + "step": 74540 + }, + { + "epoch": 11.10291927316056, + "grad_norm": 0.011950075626373291, + "learning_rate": 2.4551603883764963e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43242480, + "step": 74545 + }, + { + "epoch": 11.10366398570152, + "grad_norm": 0.0021105112973600626, + "learning_rate": 2.454835498880913e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43245904, + "step": 74550 + }, + { + "epoch": 11.104408698242478, + "grad_norm": 0.01015334390103817, + "learning_rate": 2.454510610148339e-05, + "loss": 0.0823, + "num_input_tokens_seen": 43248816, + "step": 74555 + }, + { + "epoch": 11.105153410783437, + "grad_norm": 0.0012872408842667937, + "learning_rate": 2.454185722184261e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43251824, + "step": 74560 + }, + { + "epoch": 11.105898123324398, + "grad_norm": 0.009267371147871017, + "learning_rate": 2.4538608349941695e-05, + "loss": 0.087, + "num_input_tokens_seen": 43254576, + "step": 74565 + }, + { + "epoch": 11.106642835865356, + "grad_norm": 5.100061416625977, + "learning_rate": 2.453535948583552e-05, + "loss": 0.0373, + "num_input_tokens_seen": 43257360, + "step": 74570 + }, + { + "epoch": 11.107387548406315, + "grad_norm": 0.04045509546995163, + "learning_rate": 2.453211062957898e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43260240, + "step": 74575 + }, + { + "epoch": 11.108132260947274, + "grad_norm": 0.00236739800311625, + "learning_rate": 2.4528861781226942e-05, + "loss": 0.0767, + "num_input_tokens_seen": 43263280, + "step": 74580 + }, + { + "epoch": 11.108876973488233, + "grad_norm": 0.01753085106611252, + "learning_rate": 2.4525612940834315e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43266128, + "step": 74585 + }, + { + "epoch": 11.109621686029193, + "grad_norm": 0.06844035536050797, + "learning_rate": 2.452236410845598e-05, + "loss": 0.0008, + "num_input_tokens_seen": 43268944, + "step": 74590 + }, + { + "epoch": 11.110366398570152, + "grad_norm": 0.012383762747049332, + "learning_rate": 2.4519115284146806e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43271760, + "step": 74595 + }, + { + "epoch": 11.11111111111111, + "grad_norm": 0.001150181400589645, + "learning_rate": 2.45158664679617e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43274576, + "step": 74600 + }, + { + "epoch": 11.11185582365207, + "grad_norm": 0.01826268434524536, + "learning_rate": 2.4512617659955532e-05, + "loss": 0.0042, + "num_input_tokens_seen": 43277424, + "step": 74605 + }, + { + "epoch": 11.11260053619303, + "grad_norm": 0.022414807230234146, + "learning_rate": 2.45093688601832e-05, + "loss": 0.0956, + "num_input_tokens_seen": 43280144, + "step": 74610 + }, + { + "epoch": 11.113345248733989, + "grad_norm": 0.009276839904487133, + "learning_rate": 2.450612006869957e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43283312, + "step": 74615 + }, + { + "epoch": 11.114089961274948, + "grad_norm": 0.028262494131922722, + "learning_rate": 2.450287128555955e-05, + "loss": 0.002, + "num_input_tokens_seen": 43286320, + "step": 74620 + }, + { + "epoch": 11.114834673815906, + "grad_norm": 0.0008100806153379381, + "learning_rate": 2.4499622510818017e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43289040, + "step": 74625 + }, + { + "epoch": 11.115579386356867, + "grad_norm": 0.0011690851533785462, + "learning_rate": 2.4496373744529842e-05, + "loss": 0.0119, + "num_input_tokens_seen": 43291824, + "step": 74630 + }, + { + "epoch": 11.116324098897826, + "grad_norm": 0.03186088055372238, + "learning_rate": 2.449312498674993e-05, + "loss": 0.0651, + "num_input_tokens_seen": 43294608, + "step": 74635 + }, + { + "epoch": 11.117068811438784, + "grad_norm": 0.49731332063674927, + "learning_rate": 2.448987623753315e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43297808, + "step": 74640 + }, + { + "epoch": 11.117813523979743, + "grad_norm": 0.027750128880143166, + "learning_rate": 2.44866274969344e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43300464, + "step": 74645 + }, + { + "epoch": 11.118558236520704, + "grad_norm": 0.07503858953714371, + "learning_rate": 2.4483378765008543e-05, + "loss": 0.0191, + "num_input_tokens_seen": 43303344, + "step": 74650 + }, + { + "epoch": 11.119302949061662, + "grad_norm": 0.0008483863202854991, + "learning_rate": 2.4480130041810493e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43306256, + "step": 74655 + }, + { + "epoch": 11.120047661602621, + "grad_norm": 0.000741609197575599, + "learning_rate": 2.4476881327395108e-05, + "loss": 0.0, + "num_input_tokens_seen": 43309136, + "step": 74660 + }, + { + "epoch": 11.12079237414358, + "grad_norm": 0.0017004904802888632, + "learning_rate": 2.447363262181728e-05, + "loss": 0.0, + "num_input_tokens_seen": 43312016, + "step": 74665 + }, + { + "epoch": 11.12153708668454, + "grad_norm": 0.007105667609721422, + "learning_rate": 2.44703839251319e-05, + "loss": 0.0853, + "num_input_tokens_seen": 43314800, + "step": 74670 + }, + { + "epoch": 11.1222817992255, + "grad_norm": 0.0007778005092404783, + "learning_rate": 2.4467135237393842e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43317424, + "step": 74675 + }, + { + "epoch": 11.123026511766458, + "grad_norm": 0.003344890894368291, + "learning_rate": 2.4463886558658e-05, + "loss": 0.0008, + "num_input_tokens_seen": 43320240, + "step": 74680 + }, + { + "epoch": 11.123771224307417, + "grad_norm": 0.004016139078885317, + "learning_rate": 2.4460637888979234e-05, + "loss": 0.0192, + "num_input_tokens_seen": 43323280, + "step": 74685 + }, + { + "epoch": 11.124515936848377, + "grad_norm": 34.3465690612793, + "learning_rate": 2.4457389228412457e-05, + "loss": 0.0652, + "num_input_tokens_seen": 43326192, + "step": 74690 + }, + { + "epoch": 11.125260649389336, + "grad_norm": 86.59144592285156, + "learning_rate": 2.445414057701253e-05, + "loss": 0.0894, + "num_input_tokens_seen": 43329008, + "step": 74695 + }, + { + "epoch": 11.126005361930295, + "grad_norm": 0.0023068496957421303, + "learning_rate": 2.4450891934834345e-05, + "loss": 0.0, + "num_input_tokens_seen": 43331568, + "step": 74700 + }, + { + "epoch": 11.126750074471254, + "grad_norm": 1.634725570678711, + "learning_rate": 2.4447643301932785e-05, + "loss": 0.1316, + "num_input_tokens_seen": 43334512, + "step": 74705 + }, + { + "epoch": 11.127494787012214, + "grad_norm": 0.003266041399911046, + "learning_rate": 2.4444394678362727e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43337392, + "step": 74710 + }, + { + "epoch": 11.128239499553173, + "grad_norm": 0.023061322048306465, + "learning_rate": 2.444114606417906e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43340144, + "step": 74715 + }, + { + "epoch": 11.128984212094132, + "grad_norm": 25.452592849731445, + "learning_rate": 2.4437897459436654e-05, + "loss": 0.0908, + "num_input_tokens_seen": 43342960, + "step": 74720 + }, + { + "epoch": 11.12972892463509, + "grad_norm": 0.00633612647652626, + "learning_rate": 2.4434648864190405e-05, + "loss": 0.0, + "num_input_tokens_seen": 43346160, + "step": 74725 + }, + { + "epoch": 11.13047363717605, + "grad_norm": 0.0015380204422399402, + "learning_rate": 2.4431400278495188e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43348880, + "step": 74730 + }, + { + "epoch": 11.13121834971701, + "grad_norm": 0.0017116755479946733, + "learning_rate": 2.442815170240589e-05, + "loss": 0.0, + "num_input_tokens_seen": 43351504, + "step": 74735 + }, + { + "epoch": 11.131963062257968, + "grad_norm": 301.9273376464844, + "learning_rate": 2.442490313597738e-05, + "loss": 0.107, + "num_input_tokens_seen": 43354160, + "step": 74740 + }, + { + "epoch": 11.132707774798927, + "grad_norm": 0.003458768827840686, + "learning_rate": 2.4421654579264553e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43356784, + "step": 74745 + }, + { + "epoch": 11.133452487339888, + "grad_norm": 0.0047670817002654076, + "learning_rate": 2.4418406032322286e-05, + "loss": 0.0, + "num_input_tokens_seen": 43359440, + "step": 74750 + }, + { + "epoch": 11.134197199880846, + "grad_norm": 0.001110243028961122, + "learning_rate": 2.4415157495205445e-05, + "loss": 0.0, + "num_input_tokens_seen": 43361968, + "step": 74755 + }, + { + "epoch": 11.134941912421805, + "grad_norm": 0.00896318070590496, + "learning_rate": 2.4411908967968938e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43364656, + "step": 74760 + }, + { + "epoch": 11.135686624962764, + "grad_norm": 0.004241576883941889, + "learning_rate": 2.4408660450667626e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43367440, + "step": 74765 + }, + { + "epoch": 11.136431337503723, + "grad_norm": 0.006266876123845577, + "learning_rate": 2.4405411943356398e-05, + "loss": 0.0, + "num_input_tokens_seen": 43370544, + "step": 74770 + }, + { + "epoch": 11.137176050044683, + "grad_norm": 0.016813650727272034, + "learning_rate": 2.440216344609012e-05, + "loss": 0.1036, + "num_input_tokens_seen": 43373456, + "step": 74775 + }, + { + "epoch": 11.137920762585642, + "grad_norm": 0.004422828555107117, + "learning_rate": 2.4398914958923695e-05, + "loss": 0.0, + "num_input_tokens_seen": 43376336, + "step": 74780 + }, + { + "epoch": 11.1386654751266, + "grad_norm": 0.0006593713769689202, + "learning_rate": 2.439566648191199e-05, + "loss": 0.0, + "num_input_tokens_seen": 43379120, + "step": 74785 + }, + { + "epoch": 11.13941018766756, + "grad_norm": 0.002533335005864501, + "learning_rate": 2.439241801510987e-05, + "loss": 0.2098, + "num_input_tokens_seen": 43382224, + "step": 74790 + }, + { + "epoch": 11.14015490020852, + "grad_norm": 0.00020675804989878088, + "learning_rate": 2.4389169558572247e-05, + "loss": 0.0, + "num_input_tokens_seen": 43384880, + "step": 74795 + }, + { + "epoch": 11.140899612749479, + "grad_norm": 0.006953088100999594, + "learning_rate": 2.4385921112353978e-05, + "loss": 0.019, + "num_input_tokens_seen": 43387536, + "step": 74800 + }, + { + "epoch": 11.141644325290438, + "grad_norm": 0.003764311783015728, + "learning_rate": 2.438267267650995e-05, + "loss": 0.0, + "num_input_tokens_seen": 43390608, + "step": 74805 + }, + { + "epoch": 11.142389037831396, + "grad_norm": 0.007214795332401991, + "learning_rate": 2.4379424251095034e-05, + "loss": 0.0, + "num_input_tokens_seen": 43393584, + "step": 74810 + }, + { + "epoch": 11.143133750372357, + "grad_norm": 0.00071701843990013, + "learning_rate": 2.4376175836164122e-05, + "loss": 0.0267, + "num_input_tokens_seen": 43396368, + "step": 74815 + }, + { + "epoch": 11.143878462913316, + "grad_norm": 0.004893066361546516, + "learning_rate": 2.4372927431772076e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43399152, + "step": 74820 + }, + { + "epoch": 11.144623175454274, + "grad_norm": 0.019743330776691437, + "learning_rate": 2.4369679037973792e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43402448, + "step": 74825 + }, + { + "epoch": 11.145367887995233, + "grad_norm": 21.164880752563477, + "learning_rate": 2.436643065482414e-05, + "loss": 0.0066, + "num_input_tokens_seen": 43405360, + "step": 74830 + }, + { + "epoch": 11.146112600536194, + "grad_norm": 214.3655548095703, + "learning_rate": 2.4363182282377994e-05, + "loss": 0.2493, + "num_input_tokens_seen": 43408592, + "step": 74835 + }, + { + "epoch": 11.146857313077152, + "grad_norm": 0.0020863504614681005, + "learning_rate": 2.4359933920690242e-05, + "loss": 0.1533, + "num_input_tokens_seen": 43411856, + "step": 74840 + }, + { + "epoch": 11.147602025618111, + "grad_norm": 0.00012699443323072046, + "learning_rate": 2.4356685569815742e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43414704, + "step": 74845 + }, + { + "epoch": 11.14834673815907, + "grad_norm": 0.013386490754783154, + "learning_rate": 2.43534372298094e-05, + "loss": 0.0092, + "num_input_tokens_seen": 43417776, + "step": 74850 + }, + { + "epoch": 11.14909145070003, + "grad_norm": 25.270095825195312, + "learning_rate": 2.4350188900726068e-05, + "loss": 0.297, + "num_input_tokens_seen": 43421104, + "step": 74855 + }, + { + "epoch": 11.14983616324099, + "grad_norm": 0.003869588254019618, + "learning_rate": 2.4346940582620644e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43424016, + "step": 74860 + }, + { + "epoch": 11.150580875781948, + "grad_norm": 0.011355354450643063, + "learning_rate": 2.4343692275547993e-05, + "loss": 0.2509, + "num_input_tokens_seen": 43426864, + "step": 74865 + }, + { + "epoch": 11.151325588322907, + "grad_norm": 0.007795870304107666, + "learning_rate": 2.434044397956299e-05, + "loss": 0.003, + "num_input_tokens_seen": 43429776, + "step": 74870 + }, + { + "epoch": 11.152070300863867, + "grad_norm": 0.0023418855853378773, + "learning_rate": 2.433719569472052e-05, + "loss": 0.1347, + "num_input_tokens_seen": 43432528, + "step": 74875 + }, + { + "epoch": 11.152815013404826, + "grad_norm": 0.0008533561485819519, + "learning_rate": 2.433394742107545e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43435984, + "step": 74880 + }, + { + "epoch": 11.153559725945785, + "grad_norm": 0.0018133011180907488, + "learning_rate": 2.4330699158682666e-05, + "loss": 0.0008, + "num_input_tokens_seen": 43438800, + "step": 74885 + }, + { + "epoch": 11.154304438486744, + "grad_norm": 0.07982442528009415, + "learning_rate": 2.432745090759703e-05, + "loss": 0.0006, + "num_input_tokens_seen": 43441968, + "step": 74890 + }, + { + "epoch": 11.155049151027704, + "grad_norm": 0.018697043880820274, + "learning_rate": 2.432420266787344e-05, + "loss": 0.2203, + "num_input_tokens_seen": 43444880, + "step": 74895 + }, + { + "epoch": 11.155793863568663, + "grad_norm": 0.0036435488145798445, + "learning_rate": 2.4320954439566752e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43447792, + "step": 74900 + }, + { + "epoch": 11.156538576109622, + "grad_norm": 0.005602498073130846, + "learning_rate": 2.4317706222731853e-05, + "loss": 0.3736, + "num_input_tokens_seen": 43450704, + "step": 74905 + }, + { + "epoch": 11.15728328865058, + "grad_norm": 0.038147784769535065, + "learning_rate": 2.4314458017423618e-05, + "loss": 0.2847, + "num_input_tokens_seen": 43453456, + "step": 74910 + }, + { + "epoch": 11.158028001191541, + "grad_norm": 0.0032930488232523203, + "learning_rate": 2.43112098236969e-05, + "loss": 0.0153, + "num_input_tokens_seen": 43456304, + "step": 74915 + }, + { + "epoch": 11.1587727137325, + "grad_norm": 0.016423270106315613, + "learning_rate": 2.430796164160661e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43459184, + "step": 74920 + }, + { + "epoch": 11.159517426273458, + "grad_norm": 0.011161159723997116, + "learning_rate": 2.430471347120759e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43461936, + "step": 74925 + }, + { + "epoch": 11.160262138814417, + "grad_norm": 0.0033080349676311016, + "learning_rate": 2.4301465312554743e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43464656, + "step": 74930 + }, + { + "epoch": 11.161006851355376, + "grad_norm": 0.061409518122673035, + "learning_rate": 2.429821716570292e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43467632, + "step": 74935 + }, + { + "epoch": 11.161751563896336, + "grad_norm": 0.007461738772690296, + "learning_rate": 2.4294969030707013e-05, + "loss": 0.2127, + "num_input_tokens_seen": 43470352, + "step": 74940 + }, + { + "epoch": 11.162496276437295, + "grad_norm": 0.004423667211085558, + "learning_rate": 2.4291720907621886e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43473328, + "step": 74945 + }, + { + "epoch": 11.163240988978254, + "grad_norm": 0.004873564932495356, + "learning_rate": 2.4288472796502407e-05, + "loss": 0.001, + "num_input_tokens_seen": 43476176, + "step": 74950 + }, + { + "epoch": 11.163985701519213, + "grad_norm": 0.009816034696996212, + "learning_rate": 2.4285224697403464e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43479088, + "step": 74955 + }, + { + "epoch": 11.164730414060173, + "grad_norm": 1.0350427627563477, + "learning_rate": 2.4281976610379914e-05, + "loss": 0.0007, + "num_input_tokens_seen": 43482256, + "step": 74960 + }, + { + "epoch": 11.165475126601132, + "grad_norm": 76.19488525390625, + "learning_rate": 2.427872853548665e-05, + "loss": 0.225, + "num_input_tokens_seen": 43484880, + "step": 74965 + }, + { + "epoch": 11.16621983914209, + "grad_norm": 0.021986719220876694, + "learning_rate": 2.427548047277853e-05, + "loss": 0.0006, + "num_input_tokens_seen": 43487696, + "step": 74970 + }, + { + "epoch": 11.16696455168305, + "grad_norm": 0.006809759419411421, + "learning_rate": 2.4272232422310436e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43490480, + "step": 74975 + }, + { + "epoch": 11.16770926422401, + "grad_norm": 32.125572204589844, + "learning_rate": 2.4268984384137225e-05, + "loss": 0.0161, + "num_input_tokens_seen": 43493200, + "step": 74980 + }, + { + "epoch": 11.168453976764969, + "grad_norm": 0.042014069855213165, + "learning_rate": 2.426573635831379e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43496016, + "step": 74985 + }, + { + "epoch": 11.169198689305928, + "grad_norm": 0.0178294125944376, + "learning_rate": 2.426248834489499e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43498896, + "step": 74990 + }, + { + "epoch": 11.169943401846886, + "grad_norm": 0.010544156655669212, + "learning_rate": 2.4259240343935695e-05, + "loss": 0.3157, + "num_input_tokens_seen": 43501968, + "step": 74995 + }, + { + "epoch": 11.170688114387847, + "grad_norm": 0.6414217352867126, + "learning_rate": 2.4255992355490788e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43504496, + "step": 75000 + }, + { + "epoch": 11.171432826928806, + "grad_norm": 0.0008679882157593966, + "learning_rate": 2.425274437961513e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43507600, + "step": 75005 + }, + { + "epoch": 11.172177539469764, + "grad_norm": 0.011387777514755726, + "learning_rate": 2.42494964163636e-05, + "loss": 0.5317, + "num_input_tokens_seen": 43510704, + "step": 75010 + }, + { + "epoch": 11.172922252010723, + "grad_norm": 0.0020628320053219795, + "learning_rate": 2.4246248465791058e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43513584, + "step": 75015 + }, + { + "epoch": 11.173666964551684, + "grad_norm": 0.013143755495548248, + "learning_rate": 2.4243000527952388e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43516432, + "step": 75020 + }, + { + "epoch": 11.174411677092642, + "grad_norm": 0.00745210237801075, + "learning_rate": 2.423975260290246e-05, + "loss": 0.1222, + "num_input_tokens_seen": 43519312, + "step": 75025 + }, + { + "epoch": 11.175156389633601, + "grad_norm": 0.003752846736460924, + "learning_rate": 2.4236504690696125e-05, + "loss": 0.0, + "num_input_tokens_seen": 43522224, + "step": 75030 + }, + { + "epoch": 11.17590110217456, + "grad_norm": 0.007342053111642599, + "learning_rate": 2.423325679138828e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43524944, + "step": 75035 + }, + { + "epoch": 11.17664581471552, + "grad_norm": 0.01928076706826687, + "learning_rate": 2.4230008905033774e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43527536, + "step": 75040 + }, + { + "epoch": 11.17739052725648, + "grad_norm": 0.37153181433677673, + "learning_rate": 2.4226761031687496e-05, + "loss": 0.0005, + "num_input_tokens_seen": 43530160, + "step": 75045 + }, + { + "epoch": 11.178135239797438, + "grad_norm": 0.0008800805662758648, + "learning_rate": 2.4223513171404288e-05, + "loss": 0.2923, + "num_input_tokens_seen": 43533168, + "step": 75050 + }, + { + "epoch": 11.178879952338397, + "grad_norm": 0.013467837125062943, + "learning_rate": 2.4220265324239045e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43536016, + "step": 75055 + }, + { + "epoch": 11.179624664879357, + "grad_norm": 0.006920000072568655, + "learning_rate": 2.4217017490246626e-05, + "loss": 0.0266, + "num_input_tokens_seen": 43539312, + "step": 75060 + }, + { + "epoch": 11.180369377420316, + "grad_norm": 0.06735603511333466, + "learning_rate": 2.4213769669481906e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43542128, + "step": 75065 + }, + { + "epoch": 11.181114089961275, + "grad_norm": 0.004986779298633337, + "learning_rate": 2.421052186199974e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43545200, + "step": 75070 + }, + { + "epoch": 11.181858802502234, + "grad_norm": 153.2150115966797, + "learning_rate": 2.420727406785501e-05, + "loss": 0.335, + "num_input_tokens_seen": 43548048, + "step": 75075 + }, + { + "epoch": 11.182603515043194, + "grad_norm": 0.0012524049961939454, + "learning_rate": 2.4204026287102578e-05, + "loss": 0.1471, + "num_input_tokens_seen": 43550768, + "step": 75080 + }, + { + "epoch": 11.183348227584153, + "grad_norm": 78.7615966796875, + "learning_rate": 2.4200778519797306e-05, + "loss": 0.1413, + "num_input_tokens_seen": 43553840, + "step": 75085 + }, + { + "epoch": 11.184092940125112, + "grad_norm": 0.030176831409335136, + "learning_rate": 2.4197530765994082e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43557232, + "step": 75090 + }, + { + "epoch": 11.18483765266607, + "grad_norm": 61.28989028930664, + "learning_rate": 2.4194283025747748e-05, + "loss": 0.2336, + "num_input_tokens_seen": 43560240, + "step": 75095 + }, + { + "epoch": 11.18558236520703, + "grad_norm": 0.026908904314041138, + "learning_rate": 2.4191035299113194e-05, + "loss": 0.2408, + "num_input_tokens_seen": 43563056, + "step": 75100 + }, + { + "epoch": 11.18632707774799, + "grad_norm": 0.007032355759292841, + "learning_rate": 2.418778758614526e-05, + "loss": 0.3535, + "num_input_tokens_seen": 43566000, + "step": 75105 + }, + { + "epoch": 11.187071790288948, + "grad_norm": 0.030028540641069412, + "learning_rate": 2.4184539886898846e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43568944, + "step": 75110 + }, + { + "epoch": 11.187816502829907, + "grad_norm": 0.019163774326443672, + "learning_rate": 2.41812922014288e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43571664, + "step": 75115 + }, + { + "epoch": 11.188561215370866, + "grad_norm": 0.004902444779872894, + "learning_rate": 2.417804452978998e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43575184, + "step": 75120 + }, + { + "epoch": 11.189305927911827, + "grad_norm": 0.011373880319297314, + "learning_rate": 2.417479687203727e-05, + "loss": 0.0289, + "num_input_tokens_seen": 43577936, + "step": 75125 + }, + { + "epoch": 11.190050640452785, + "grad_norm": 0.007723797578364611, + "learning_rate": 2.4171549228225524e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43580912, + "step": 75130 + }, + { + "epoch": 11.190795352993744, + "grad_norm": 0.08104433119297028, + "learning_rate": 2.4168301598409617e-05, + "loss": 0.2598, + "num_input_tokens_seen": 43583984, + "step": 75135 + }, + { + "epoch": 11.191540065534703, + "grad_norm": 0.015529963187873363, + "learning_rate": 2.41650539826444e-05, + "loss": 0.0006, + "num_input_tokens_seen": 43587024, + "step": 75140 + }, + { + "epoch": 11.192284778075663, + "grad_norm": 0.008565345779061317, + "learning_rate": 2.416180638098476e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43589872, + "step": 75145 + }, + { + "epoch": 11.193029490616622, + "grad_norm": 0.028447004035115242, + "learning_rate": 2.4158558793485545e-05, + "loss": 0.0827, + "num_input_tokens_seen": 43592976, + "step": 75150 + }, + { + "epoch": 11.19377420315758, + "grad_norm": 0.010187827050685883, + "learning_rate": 2.4155311220201616e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43595696, + "step": 75155 + }, + { + "epoch": 11.19451891569854, + "grad_norm": 0.04696992412209511, + "learning_rate": 2.4152063661187855e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43598448, + "step": 75160 + }, + { + "epoch": 11.1952636282395, + "grad_norm": 0.03978525474667549, + "learning_rate": 2.414881611649911e-05, + "loss": 0.1534, + "num_input_tokens_seen": 43601712, + "step": 75165 + }, + { + "epoch": 11.196008340780459, + "grad_norm": 11.60384750366211, + "learning_rate": 2.414556858619026e-05, + "loss": 0.0746, + "num_input_tokens_seen": 43604752, + "step": 75170 + }, + { + "epoch": 11.196753053321418, + "grad_norm": 0.044900745153427124, + "learning_rate": 2.4142321070316147e-05, + "loss": 0.0011, + "num_input_tokens_seen": 43607536, + "step": 75175 + }, + { + "epoch": 11.197497765862376, + "grad_norm": 0.007367162965238094, + "learning_rate": 2.4139073568931658e-05, + "loss": 0.1356, + "num_input_tokens_seen": 43610544, + "step": 75180 + }, + { + "epoch": 11.198242478403337, + "grad_norm": 0.010808026418089867, + "learning_rate": 2.4135826082091636e-05, + "loss": 0.0038, + "num_input_tokens_seen": 43613232, + "step": 75185 + }, + { + "epoch": 11.198987190944296, + "grad_norm": 0.004840823356062174, + "learning_rate": 2.4132578609850966e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43616272, + "step": 75190 + }, + { + "epoch": 11.199731903485254, + "grad_norm": 0.02305670641362667, + "learning_rate": 2.41293311522645e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43619376, + "step": 75195 + }, + { + "epoch": 11.200476616026213, + "grad_norm": 0.029465898871421814, + "learning_rate": 2.4126083709387094e-05, + "loss": 0.0009, + "num_input_tokens_seen": 43622416, + "step": 75200 + }, + { + "epoch": 11.201221328567174, + "grad_norm": 0.012444994412362576, + "learning_rate": 2.4122836281273618e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43625488, + "step": 75205 + }, + { + "epoch": 11.201966041108133, + "grad_norm": 0.00045090526691637933, + "learning_rate": 2.411958886797892e-05, + "loss": 0.0454, + "num_input_tokens_seen": 43628464, + "step": 75210 + }, + { + "epoch": 11.202710753649091, + "grad_norm": 0.013346012681722641, + "learning_rate": 2.4116341469557888e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43631504, + "step": 75215 + }, + { + "epoch": 11.20345546619005, + "grad_norm": 0.11153256148099899, + "learning_rate": 2.4113094086065356e-05, + "loss": 0.0008, + "num_input_tokens_seen": 43634448, + "step": 75220 + }, + { + "epoch": 11.20420017873101, + "grad_norm": 0.0053972890600562096, + "learning_rate": 2.4109846717556206e-05, + "loss": 0.2763, + "num_input_tokens_seen": 43637072, + "step": 75225 + }, + { + "epoch": 11.20494489127197, + "grad_norm": 0.0017070581670850515, + "learning_rate": 2.4106599364085296e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43639920, + "step": 75230 + }, + { + "epoch": 11.205689603812928, + "grad_norm": 0.0017161418218165636, + "learning_rate": 2.4103352025707473e-05, + "loss": 0.2233, + "num_input_tokens_seen": 43642736, + "step": 75235 + }, + { + "epoch": 11.206434316353887, + "grad_norm": 0.00736803887411952, + "learning_rate": 2.4100104702477614e-05, + "loss": 0.0014, + "num_input_tokens_seen": 43645360, + "step": 75240 + }, + { + "epoch": 11.207179028894847, + "grad_norm": 0.0016175491036847234, + "learning_rate": 2.4096857394450557e-05, + "loss": 0.0491, + "num_input_tokens_seen": 43648368, + "step": 75245 + }, + { + "epoch": 11.207923741435806, + "grad_norm": 0.004565535578876734, + "learning_rate": 2.4093610101681192e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43651472, + "step": 75250 + }, + { + "epoch": 11.208668453976765, + "grad_norm": 0.03596959635615349, + "learning_rate": 2.4090362824224347e-05, + "loss": 0.0043, + "num_input_tokens_seen": 43654256, + "step": 75255 + }, + { + "epoch": 11.209413166517724, + "grad_norm": 0.0054730139672756195, + "learning_rate": 2.4087115562134913e-05, + "loss": 0.0005, + "num_input_tokens_seen": 43657616, + "step": 75260 + }, + { + "epoch": 11.210157879058684, + "grad_norm": 0.04767651483416557, + "learning_rate": 2.4083868315467725e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43660880, + "step": 75265 + }, + { + "epoch": 11.210902591599643, + "grad_norm": 0.07957921922206879, + "learning_rate": 2.4080621084277656e-05, + "loss": 0.0014, + "num_input_tokens_seen": 43663824, + "step": 75270 + }, + { + "epoch": 11.211647304140602, + "grad_norm": 0.016091734170913696, + "learning_rate": 2.4077373868619562e-05, + "loss": 0.113, + "num_input_tokens_seen": 43666608, + "step": 75275 + }, + { + "epoch": 11.21239201668156, + "grad_norm": 0.01802576333284378, + "learning_rate": 2.4074126668548287e-05, + "loss": 0.2126, + "num_input_tokens_seen": 43669232, + "step": 75280 + }, + { + "epoch": 11.21313672922252, + "grad_norm": 0.00098310480825603, + "learning_rate": 2.4070879484118712e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43671888, + "step": 75285 + }, + { + "epoch": 11.21388144176348, + "grad_norm": 0.01295414101332426, + "learning_rate": 2.4067632315385675e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43674768, + "step": 75290 + }, + { + "epoch": 11.214626154304439, + "grad_norm": 0.008722190745174885, + "learning_rate": 2.4064385162404048e-05, + "loss": 0.0006, + "num_input_tokens_seen": 43677552, + "step": 75295 + }, + { + "epoch": 11.215370866845397, + "grad_norm": 0.019083097577095032, + "learning_rate": 2.406113802522868e-05, + "loss": 0.4544, + "num_input_tokens_seen": 43680464, + "step": 75300 + }, + { + "epoch": 11.216115579386356, + "grad_norm": 256.4896240234375, + "learning_rate": 2.4057890903914437e-05, + "loss": 0.1909, + "num_input_tokens_seen": 43683376, + "step": 75305 + }, + { + "epoch": 11.216860291927317, + "grad_norm": 0.01328712422400713, + "learning_rate": 2.405464379851617e-05, + "loss": 0.0007, + "num_input_tokens_seen": 43686288, + "step": 75310 + }, + { + "epoch": 11.217605004468275, + "grad_norm": 0.007221274543553591, + "learning_rate": 2.4051396709088726e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43689040, + "step": 75315 + }, + { + "epoch": 11.218349717009234, + "grad_norm": 0.002085914835333824, + "learning_rate": 2.4048149635686977e-05, + "loss": 0.2107, + "num_input_tokens_seen": 43691856, + "step": 75320 + }, + { + "epoch": 11.219094429550193, + "grad_norm": 0.01983252353966236, + "learning_rate": 2.4044902578365765e-05, + "loss": 0.0682, + "num_input_tokens_seen": 43694672, + "step": 75325 + }, + { + "epoch": 11.219839142091153, + "grad_norm": 0.004652504809200764, + "learning_rate": 2.404165553717996e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43697776, + "step": 75330 + }, + { + "epoch": 11.220583854632112, + "grad_norm": 0.025262556970119476, + "learning_rate": 2.4038408512184406e-05, + "loss": 0.1381, + "num_input_tokens_seen": 43700656, + "step": 75335 + }, + { + "epoch": 11.221328567173071, + "grad_norm": 0.015132003463804722, + "learning_rate": 2.4035161503433974e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43703504, + "step": 75340 + }, + { + "epoch": 11.22207327971403, + "grad_norm": 0.0022254353389143944, + "learning_rate": 2.4031914510983492e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43706544, + "step": 75345 + }, + { + "epoch": 11.22281799225499, + "grad_norm": 0.09230975806713104, + "learning_rate": 2.402866753488784e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43709424, + "step": 75350 + }, + { + "epoch": 11.223562704795949, + "grad_norm": 12.922164916992188, + "learning_rate": 2.4025420575201867e-05, + "loss": 0.2311, + "num_input_tokens_seen": 43712336, + "step": 75355 + }, + { + "epoch": 11.224307417336908, + "grad_norm": 0.0013991622254252434, + "learning_rate": 2.402217363198041e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43715152, + "step": 75360 + }, + { + "epoch": 11.225052129877866, + "grad_norm": 0.1284113973379135, + "learning_rate": 2.4018926705278347e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43717936, + "step": 75365 + }, + { + "epoch": 11.225796842418827, + "grad_norm": 10.487860679626465, + "learning_rate": 2.4015679795150513e-05, + "loss": 0.2234, + "num_input_tokens_seen": 43720528, + "step": 75370 + }, + { + "epoch": 11.226541554959786, + "grad_norm": 0.052108269184827805, + "learning_rate": 2.4012432901651778e-05, + "loss": 0.0825, + "num_input_tokens_seen": 43723536, + "step": 75375 + }, + { + "epoch": 11.227286267500745, + "grad_norm": 0.024768348783254623, + "learning_rate": 2.400918602483697e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43726992, + "step": 75380 + }, + { + "epoch": 11.228030980041703, + "grad_norm": 0.07067868858575821, + "learning_rate": 2.400593916476097e-05, + "loss": 0.1004, + "num_input_tokens_seen": 43729776, + "step": 75385 + }, + { + "epoch": 11.228775692582664, + "grad_norm": 0.019814863801002502, + "learning_rate": 2.4002692321478617e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43732464, + "step": 75390 + }, + { + "epoch": 11.229520405123623, + "grad_norm": 0.07343466579914093, + "learning_rate": 2.399944549504476e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43735120, + "step": 75395 + }, + { + "epoch": 11.230265117664581, + "grad_norm": 0.009188130497932434, + "learning_rate": 2.399619868551425e-05, + "loss": 0.001, + "num_input_tokens_seen": 43737872, + "step": 75400 + }, + { + "epoch": 11.23100983020554, + "grad_norm": 0.000255913328146562, + "learning_rate": 2.3992951892941948e-05, + "loss": 0.0058, + "num_input_tokens_seen": 43740944, + "step": 75405 + }, + { + "epoch": 11.2317545427465, + "grad_norm": 0.006260246969759464, + "learning_rate": 2.3989705117382705e-05, + "loss": 0.1166, + "num_input_tokens_seen": 43743920, + "step": 75410 + }, + { + "epoch": 11.23249925528746, + "grad_norm": 25.961597442626953, + "learning_rate": 2.3986458358891353e-05, + "loss": 0.227, + "num_input_tokens_seen": 43746864, + "step": 75415 + }, + { + "epoch": 11.233243967828418, + "grad_norm": 0.09162471443414688, + "learning_rate": 2.3983211617522768e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43749808, + "step": 75420 + }, + { + "epoch": 11.233988680369377, + "grad_norm": 0.24866710603237152, + "learning_rate": 2.3979964893331787e-05, + "loss": 0.0007, + "num_input_tokens_seen": 43752688, + "step": 75425 + }, + { + "epoch": 11.234733392910337, + "grad_norm": 0.02516757883131504, + "learning_rate": 2.397671818637327e-05, + "loss": 0.0714, + "num_input_tokens_seen": 43755376, + "step": 75430 + }, + { + "epoch": 11.235478105451296, + "grad_norm": 34.350914001464844, + "learning_rate": 2.3973471496702052e-05, + "loss": 0.1598, + "num_input_tokens_seen": 43758416, + "step": 75435 + }, + { + "epoch": 11.236222817992255, + "grad_norm": 0.001735585043206811, + "learning_rate": 2.397022482437298e-05, + "loss": 0.001, + "num_input_tokens_seen": 43761552, + "step": 75440 + }, + { + "epoch": 11.236967530533214, + "grad_norm": 0.005260074511170387, + "learning_rate": 2.3966978169440927e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43764432, + "step": 75445 + }, + { + "epoch": 11.237712243074174, + "grad_norm": 0.15935435891151428, + "learning_rate": 2.396373153196072e-05, + "loss": 0.0006, + "num_input_tokens_seen": 43767440, + "step": 75450 + }, + { + "epoch": 11.238456955615133, + "grad_norm": 0.10629360377788544, + "learning_rate": 2.3960484911987223e-05, + "loss": 0.0005, + "num_input_tokens_seen": 43770288, + "step": 75455 + }, + { + "epoch": 11.239201668156092, + "grad_norm": 0.017730645835399628, + "learning_rate": 2.395723830957527e-05, + "loss": 0.1908, + "num_input_tokens_seen": 43773296, + "step": 75460 + }, + { + "epoch": 11.23994638069705, + "grad_norm": 0.11279254406690598, + "learning_rate": 2.3953991724779723e-05, + "loss": 0.0019, + "num_input_tokens_seen": 43776304, + "step": 75465 + }, + { + "epoch": 11.24069109323801, + "grad_norm": 0.16384907066822052, + "learning_rate": 2.395074515765542e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43779056, + "step": 75470 + }, + { + "epoch": 11.24143580577897, + "grad_norm": 0.01852215640246868, + "learning_rate": 2.3947498608257204e-05, + "loss": 0.1677, + "num_input_tokens_seen": 43782032, + "step": 75475 + }, + { + "epoch": 11.242180518319929, + "grad_norm": 0.00819967407733202, + "learning_rate": 2.394425207663994e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43784976, + "step": 75480 + }, + { + "epoch": 11.242925230860887, + "grad_norm": 0.004314773250371218, + "learning_rate": 2.3941005562858452e-05, + "loss": 0.0039, + "num_input_tokens_seen": 43788080, + "step": 75485 + }, + { + "epoch": 11.243669943401846, + "grad_norm": 0.022769495844841003, + "learning_rate": 2.393775906696761e-05, + "loss": 0.0736, + "num_input_tokens_seen": 43791184, + "step": 75490 + }, + { + "epoch": 11.244414655942807, + "grad_norm": 0.028143566101789474, + "learning_rate": 2.393451258902224e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43794096, + "step": 75495 + }, + { + "epoch": 11.245159368483765, + "grad_norm": 0.009759509935975075, + "learning_rate": 2.3931266129077206e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43796976, + "step": 75500 + }, + { + "epoch": 11.245904081024724, + "grad_norm": 0.01201801747083664, + "learning_rate": 2.3928019687187338e-05, + "loss": 0.1099, + "num_input_tokens_seen": 43800752, + "step": 75505 + }, + { + "epoch": 11.246648793565683, + "grad_norm": 0.022514915093779564, + "learning_rate": 2.392477326340749e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43803696, + "step": 75510 + }, + { + "epoch": 11.247393506106643, + "grad_norm": 4.164835453033447, + "learning_rate": 2.3921526857792508e-05, + "loss": 0.006, + "num_input_tokens_seen": 43806800, + "step": 75515 + }, + { + "epoch": 11.248138218647602, + "grad_norm": 0.00813613273203373, + "learning_rate": 2.3918280470397226e-05, + "loss": 0.2018, + "num_input_tokens_seen": 43809680, + "step": 75520 + }, + { + "epoch": 11.248882931188561, + "grad_norm": 0.09725439548492432, + "learning_rate": 2.3915034101276504e-05, + "loss": 0.0095, + "num_input_tokens_seen": 43812720, + "step": 75525 + }, + { + "epoch": 11.24962764372952, + "grad_norm": 0.005965046584606171, + "learning_rate": 2.3911787750485172e-05, + "loss": 0.0134, + "num_input_tokens_seen": 43815632, + "step": 75530 + }, + { + "epoch": 11.25037235627048, + "grad_norm": 0.004664409905672073, + "learning_rate": 2.3908541418078087e-05, + "loss": 0.1112, + "num_input_tokens_seen": 43819184, + "step": 75535 + }, + { + "epoch": 11.251117068811439, + "grad_norm": 0.0034497373271733522, + "learning_rate": 2.3905295104110076e-05, + "loss": 0.0008, + "num_input_tokens_seen": 43821808, + "step": 75540 + }, + { + "epoch": 11.251861781352398, + "grad_norm": 0.0807151347398758, + "learning_rate": 2.3902048808636e-05, + "loss": 0.0593, + "num_input_tokens_seen": 43824848, + "step": 75545 + }, + { + "epoch": 11.252606493893357, + "grad_norm": 0.0011869427980855107, + "learning_rate": 2.3898802531710693e-05, + "loss": 0.166, + "num_input_tokens_seen": 43827792, + "step": 75550 + }, + { + "epoch": 11.253351206434317, + "grad_norm": 0.0013843858614563942, + "learning_rate": 2.389555627338899e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43830736, + "step": 75555 + }, + { + "epoch": 11.254095918975276, + "grad_norm": 0.00932077132165432, + "learning_rate": 2.3892310033725747e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43833904, + "step": 75560 + }, + { + "epoch": 11.254840631516235, + "grad_norm": 0.0010105154942721128, + "learning_rate": 2.38890638127758e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43836592, + "step": 75565 + }, + { + "epoch": 11.255585344057193, + "grad_norm": 1.6783301830291748, + "learning_rate": 2.3885817610593994e-05, + "loss": 0.1142, + "num_input_tokens_seen": 43839728, + "step": 75570 + }, + { + "epoch": 11.256330056598154, + "grad_norm": 0.007734478451311588, + "learning_rate": 2.3882571427235156e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43843120, + "step": 75575 + }, + { + "epoch": 11.257074769139113, + "grad_norm": 14.744132995605469, + "learning_rate": 2.3879325262754152e-05, + "loss": 0.1972, + "num_input_tokens_seen": 43846000, + "step": 75580 + }, + { + "epoch": 11.257819481680071, + "grad_norm": 0.004645278211683035, + "learning_rate": 2.3876079117205795e-05, + "loss": 0.0006, + "num_input_tokens_seen": 43848912, + "step": 75585 + }, + { + "epoch": 11.25856419422103, + "grad_norm": 0.004785486496984959, + "learning_rate": 2.387283299064495e-05, + "loss": 0.0081, + "num_input_tokens_seen": 43851856, + "step": 75590 + }, + { + "epoch": 11.25930890676199, + "grad_norm": 0.00696165207773447, + "learning_rate": 2.3869586883126445e-05, + "loss": 0.0133, + "num_input_tokens_seen": 43854992, + "step": 75595 + }, + { + "epoch": 11.26005361930295, + "grad_norm": 0.00427012937143445, + "learning_rate": 2.3866340794705117e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43857936, + "step": 75600 + }, + { + "epoch": 11.260798331843908, + "grad_norm": 0.002250207355245948, + "learning_rate": 2.3863094725435813e-05, + "loss": 0.0067, + "num_input_tokens_seen": 43860688, + "step": 75605 + }, + { + "epoch": 11.261543044384867, + "grad_norm": 0.0022583073005080223, + "learning_rate": 2.3859848675373358e-05, + "loss": 0.0453, + "num_input_tokens_seen": 43863696, + "step": 75610 + }, + { + "epoch": 11.262287756925826, + "grad_norm": 0.004022941458970308, + "learning_rate": 2.385660264457261e-05, + "loss": 0.001, + "num_input_tokens_seen": 43866768, + "step": 75615 + }, + { + "epoch": 11.263032469466786, + "grad_norm": 0.005835664924234152, + "learning_rate": 2.385335663308839e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43869648, + "step": 75620 + }, + { + "epoch": 11.263777182007745, + "grad_norm": 0.009480842389166355, + "learning_rate": 2.3850110640975555e-05, + "loss": 0.0001, + "num_input_tokens_seen": 43872528, + "step": 75625 + }, + { + "epoch": 11.264521894548704, + "grad_norm": 0.13523660600185394, + "learning_rate": 2.3846864668288933e-05, + "loss": 0.2745, + "num_input_tokens_seen": 43875632, + "step": 75630 + }, + { + "epoch": 11.265266607089663, + "grad_norm": 0.16522879898548126, + "learning_rate": 2.3843618715083353e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43878608, + "step": 75635 + }, + { + "epoch": 11.266011319630623, + "grad_norm": 0.016076598316431046, + "learning_rate": 2.3840372781413667e-05, + "loss": 0.1064, + "num_input_tokens_seen": 43881840, + "step": 75640 + }, + { + "epoch": 11.266756032171582, + "grad_norm": 0.002462349832057953, + "learning_rate": 2.3837126867334687e-05, + "loss": 0.1816, + "num_input_tokens_seen": 43884944, + "step": 75645 + }, + { + "epoch": 11.26750074471254, + "grad_norm": 0.007576014380902052, + "learning_rate": 2.3833880972901285e-05, + "loss": 0.3616, + "num_input_tokens_seen": 43888208, + "step": 75650 + }, + { + "epoch": 11.2682454572535, + "grad_norm": 11.744864463806152, + "learning_rate": 2.3830635098168267e-05, + "loss": 0.172, + "num_input_tokens_seen": 43891152, + "step": 75655 + }, + { + "epoch": 11.26899016979446, + "grad_norm": 0.09633654356002808, + "learning_rate": 2.3827389243190486e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43893840, + "step": 75660 + }, + { + "epoch": 11.269734882335419, + "grad_norm": 0.007801926229149103, + "learning_rate": 2.3824143408022773e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43896656, + "step": 75665 + }, + { + "epoch": 11.270479594876377, + "grad_norm": 0.07332679629325867, + "learning_rate": 2.3820897592719964e-05, + "loss": 0.0007, + "num_input_tokens_seen": 43899632, + "step": 75670 + }, + { + "epoch": 11.271224307417336, + "grad_norm": 0.4758346378803253, + "learning_rate": 2.3817651797336894e-05, + "loss": 0.0008, + "num_input_tokens_seen": 43902256, + "step": 75675 + }, + { + "epoch": 11.271969019958297, + "grad_norm": 0.06543999910354614, + "learning_rate": 2.3814406021928382e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43905456, + "step": 75680 + }, + { + "epoch": 11.272713732499255, + "grad_norm": 0.11654675006866455, + "learning_rate": 2.381116026654929e-05, + "loss": 0.139, + "num_input_tokens_seen": 43908496, + "step": 75685 + }, + { + "epoch": 11.273458445040214, + "grad_norm": 0.017610779032111168, + "learning_rate": 2.3807914531254417e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43911408, + "step": 75690 + }, + { + "epoch": 11.274203157581173, + "grad_norm": 0.005843372084200382, + "learning_rate": 2.3804668816098635e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43914416, + "step": 75695 + }, + { + "epoch": 11.274947870122134, + "grad_norm": 0.1059187799692154, + "learning_rate": 2.3801423121136752e-05, + "loss": 0.1423, + "num_input_tokens_seen": 43917360, + "step": 75700 + }, + { + "epoch": 11.275692582663092, + "grad_norm": 0.021966490894556046, + "learning_rate": 2.379817744642361e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43920560, + "step": 75705 + }, + { + "epoch": 11.276437295204051, + "grad_norm": 0.06653071939945221, + "learning_rate": 2.379493179201403e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43923504, + "step": 75710 + }, + { + "epoch": 11.27718200774501, + "grad_norm": 0.09797196090221405, + "learning_rate": 2.3791686157962866e-05, + "loss": 0.0512, + "num_input_tokens_seen": 43926288, + "step": 75715 + }, + { + "epoch": 11.27792672028597, + "grad_norm": 0.008127759210765362, + "learning_rate": 2.378844054432493e-05, + "loss": 0.1102, + "num_input_tokens_seen": 43929264, + "step": 75720 + }, + { + "epoch": 11.278671432826929, + "grad_norm": 0.026044294238090515, + "learning_rate": 2.378519495115506e-05, + "loss": 0.1438, + "num_input_tokens_seen": 43931888, + "step": 75725 + }, + { + "epoch": 11.279416145367888, + "grad_norm": 0.008871830999851227, + "learning_rate": 2.3781949378508085e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43934928, + "step": 75730 + }, + { + "epoch": 11.280160857908847, + "grad_norm": 0.0026440946385264397, + "learning_rate": 2.3778703826438833e-05, + "loss": 0.2375, + "num_input_tokens_seen": 43937616, + "step": 75735 + }, + { + "epoch": 11.280905570449807, + "grad_norm": 0.013804974965751171, + "learning_rate": 2.377545829500215e-05, + "loss": 0.0004, + "num_input_tokens_seen": 43940368, + "step": 75740 + }, + { + "epoch": 11.281650282990766, + "grad_norm": 0.18941178917884827, + "learning_rate": 2.377221278425284e-05, + "loss": 0.0008, + "num_input_tokens_seen": 43943024, + "step": 75745 + }, + { + "epoch": 11.282394995531725, + "grad_norm": 0.040232714265584946, + "learning_rate": 2.376896729424576e-05, + "loss": 0.1164, + "num_input_tokens_seen": 43946128, + "step": 75750 + }, + { + "epoch": 11.283139708072683, + "grad_norm": 0.007406687829643488, + "learning_rate": 2.3765721825035724e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43948944, + "step": 75755 + }, + { + "epoch": 11.283884420613644, + "grad_norm": 0.10577766597270966, + "learning_rate": 2.376247637667756e-05, + "loss": 0.0005, + "num_input_tokens_seen": 43951632, + "step": 75760 + }, + { + "epoch": 11.284629133154603, + "grad_norm": 0.009321678429841995, + "learning_rate": 2.3759230949226103e-05, + "loss": 0.0192, + "num_input_tokens_seen": 43954640, + "step": 75765 + }, + { + "epoch": 11.285373845695561, + "grad_norm": 0.047497138381004333, + "learning_rate": 2.375598554273617e-05, + "loss": 0.0581, + "num_input_tokens_seen": 43957424, + "step": 75770 + }, + { + "epoch": 11.28611855823652, + "grad_norm": 6.822856903076172, + "learning_rate": 2.3752740157262607e-05, + "loss": 0.3754, + "num_input_tokens_seen": 43960336, + "step": 75775 + }, + { + "epoch": 11.28686327077748, + "grad_norm": 0.021356023848056793, + "learning_rate": 2.374949479286022e-05, + "loss": 0.0003, + "num_input_tokens_seen": 43963376, + "step": 75780 + }, + { + "epoch": 11.28760798331844, + "grad_norm": 0.011074787005782127, + "learning_rate": 2.374624944958386e-05, + "loss": 0.0922, + "num_input_tokens_seen": 43966096, + "step": 75785 + }, + { + "epoch": 11.288352695859398, + "grad_norm": 0.009211627766489983, + "learning_rate": 2.3743004127488332e-05, + "loss": 0.0009, + "num_input_tokens_seen": 43969136, + "step": 75790 + }, + { + "epoch": 11.289097408400357, + "grad_norm": 0.015819167718291283, + "learning_rate": 2.373975882662848e-05, + "loss": 0.0006, + "num_input_tokens_seen": 43972176, + "step": 75795 + }, + { + "epoch": 11.289842120941316, + "grad_norm": 0.5053892135620117, + "learning_rate": 2.3736513547059124e-05, + "loss": 0.0031, + "num_input_tokens_seen": 43975056, + "step": 75800 + }, + { + "epoch": 11.290586833482276, + "grad_norm": 0.005890072789043188, + "learning_rate": 2.373326828883507e-05, + "loss": 0.1317, + "num_input_tokens_seen": 43977872, + "step": 75805 + }, + { + "epoch": 11.291331546023235, + "grad_norm": 0.016212541610002518, + "learning_rate": 2.3730023052011178e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43980656, + "step": 75810 + }, + { + "epoch": 11.292076258564194, + "grad_norm": 7.026090145111084, + "learning_rate": 2.3726777836642243e-05, + "loss": 0.0234, + "num_input_tokens_seen": 43983920, + "step": 75815 + }, + { + "epoch": 11.292820971105153, + "grad_norm": 42.111724853515625, + "learning_rate": 2.3723532642783114e-05, + "loss": 0.2193, + "num_input_tokens_seen": 43986864, + "step": 75820 + }, + { + "epoch": 11.293565683646113, + "grad_norm": 0.033250439912080765, + "learning_rate": 2.3720287470488596e-05, + "loss": 0.0052, + "num_input_tokens_seen": 43989712, + "step": 75825 + }, + { + "epoch": 11.294310396187072, + "grad_norm": 0.016709666699171066, + "learning_rate": 2.3717042319813527e-05, + "loss": 0.2328, + "num_input_tokens_seen": 43993136, + "step": 75830 + }, + { + "epoch": 11.29505510872803, + "grad_norm": 0.07288090139627457, + "learning_rate": 2.3713797190812726e-05, + "loss": 0.0046, + "num_input_tokens_seen": 43995920, + "step": 75835 + }, + { + "epoch": 11.29579982126899, + "grad_norm": 0.011146405711770058, + "learning_rate": 2.3710552083540998e-05, + "loss": 0.0002, + "num_input_tokens_seen": 43998608, + "step": 75840 + }, + { + "epoch": 11.29654453380995, + "grad_norm": 0.012859448790550232, + "learning_rate": 2.3707306998053198e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44001456, + "step": 75845 + }, + { + "epoch": 11.297289246350909, + "grad_norm": 0.0031196139752864838, + "learning_rate": 2.370406193440412e-05, + "loss": 0.1051, + "num_input_tokens_seen": 44004432, + "step": 75850 + }, + { + "epoch": 11.298033958891867, + "grad_norm": 0.006661758292466402, + "learning_rate": 2.3700816892648608e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44007280, + "step": 75855 + }, + { + "epoch": 11.298778671432826, + "grad_norm": 18.47831916809082, + "learning_rate": 2.369757187284147e-05, + "loss": 0.1111, + "num_input_tokens_seen": 44010224, + "step": 75860 + }, + { + "epoch": 11.299523383973787, + "grad_norm": 0.3372849225997925, + "learning_rate": 2.3694326875037536e-05, + "loss": 0.1597, + "num_input_tokens_seen": 44013360, + "step": 75865 + }, + { + "epoch": 11.300268096514746, + "grad_norm": 0.002102768514305353, + "learning_rate": 2.3691081899291613e-05, + "loss": 0.0033, + "num_input_tokens_seen": 44016176, + "step": 75870 + }, + { + "epoch": 11.301012809055704, + "grad_norm": 0.18132299184799194, + "learning_rate": 2.3687836945658542e-05, + "loss": 0.0496, + "num_input_tokens_seen": 44019088, + "step": 75875 + }, + { + "epoch": 11.301757521596663, + "grad_norm": 0.007147507276386023, + "learning_rate": 2.3684592014193133e-05, + "loss": 0.0768, + "num_input_tokens_seen": 44021936, + "step": 75880 + }, + { + "epoch": 11.302502234137624, + "grad_norm": 0.005147925578057766, + "learning_rate": 2.3681347104950193e-05, + "loss": 0.2761, + "num_input_tokens_seen": 44024784, + "step": 75885 + }, + { + "epoch": 11.303246946678582, + "grad_norm": 0.007281903177499771, + "learning_rate": 2.3678102217984564e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44027632, + "step": 75890 + }, + { + "epoch": 11.303991659219541, + "grad_norm": 0.039910588413476944, + "learning_rate": 2.3674857353351048e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44030416, + "step": 75895 + }, + { + "epoch": 11.3047363717605, + "grad_norm": 4.3317365646362305, + "learning_rate": 2.3671612511104476e-05, + "loss": 0.0017, + "num_input_tokens_seen": 44033424, + "step": 75900 + }, + { + "epoch": 11.30548108430146, + "grad_norm": 0.00551245454698801, + "learning_rate": 2.366836769129965e-05, + "loss": 0.0025, + "num_input_tokens_seen": 44036112, + "step": 75905 + }, + { + "epoch": 11.30622579684242, + "grad_norm": 0.027760563418269157, + "learning_rate": 2.366512289399141e-05, + "loss": 0.1973, + "num_input_tokens_seen": 44038992, + "step": 75910 + }, + { + "epoch": 11.306970509383378, + "grad_norm": 0.018500661477446556, + "learning_rate": 2.3661878119234562e-05, + "loss": 0.1042, + "num_input_tokens_seen": 44041552, + "step": 75915 + }, + { + "epoch": 11.307715221924337, + "grad_norm": 0.0056099314242601395, + "learning_rate": 2.3658633367083914e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44044272, + "step": 75920 + }, + { + "epoch": 11.308459934465297, + "grad_norm": 0.11419722437858582, + "learning_rate": 2.3655388637594298e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44046960, + "step": 75925 + }, + { + "epoch": 11.309204647006256, + "grad_norm": 0.009655018337070942, + "learning_rate": 2.3652143930820523e-05, + "loss": 0.0706, + "num_input_tokens_seen": 44049872, + "step": 75930 + }, + { + "epoch": 11.309949359547215, + "grad_norm": 0.011402702890336514, + "learning_rate": 2.364889924681741e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44052432, + "step": 75935 + }, + { + "epoch": 11.310694072088173, + "grad_norm": 0.5767307877540588, + "learning_rate": 2.364565458563976e-05, + "loss": 0.1403, + "num_input_tokens_seen": 44055760, + "step": 75940 + }, + { + "epoch": 11.311438784629134, + "grad_norm": 0.005940394941717386, + "learning_rate": 2.364240994734241e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44058800, + "step": 75945 + }, + { + "epoch": 11.312183497170093, + "grad_norm": 0.0014570036437362432, + "learning_rate": 2.3639165331980157e-05, + "loss": 0.0009, + "num_input_tokens_seen": 44061616, + "step": 75950 + }, + { + "epoch": 11.312928209711052, + "grad_norm": 0.005124179180711508, + "learning_rate": 2.3635920739607828e-05, + "loss": 0.2607, + "num_input_tokens_seen": 44064720, + "step": 75955 + }, + { + "epoch": 11.31367292225201, + "grad_norm": 0.03356840834021568, + "learning_rate": 2.3632676170280235e-05, + "loss": 0.0009, + "num_input_tokens_seen": 44067856, + "step": 75960 + }, + { + "epoch": 11.31441763479297, + "grad_norm": 17.231632232666016, + "learning_rate": 2.362943162405218e-05, + "loss": 0.3049, + "num_input_tokens_seen": 44070736, + "step": 75965 + }, + { + "epoch": 11.31516234733393, + "grad_norm": 0.007466605864465237, + "learning_rate": 2.3626187100978496e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44073648, + "step": 75970 + }, + { + "epoch": 11.315907059874888, + "grad_norm": 3.3469486236572266, + "learning_rate": 2.362294260111397e-05, + "loss": 0.051, + "num_input_tokens_seen": 44076112, + "step": 75975 + }, + { + "epoch": 11.316651772415847, + "grad_norm": 0.008361120708286762, + "learning_rate": 2.3619698124513438e-05, + "loss": 0.0354, + "num_input_tokens_seen": 44078992, + "step": 75980 + }, + { + "epoch": 11.317396484956806, + "grad_norm": 17.633054733276367, + "learning_rate": 2.3616453671231694e-05, + "loss": 0.0407, + "num_input_tokens_seen": 44081936, + "step": 75985 + }, + { + "epoch": 11.318141197497766, + "grad_norm": 39.76454162597656, + "learning_rate": 2.3613209241323574e-05, + "loss": 0.0796, + "num_input_tokens_seen": 44085136, + "step": 75990 + }, + { + "epoch": 11.318885910038725, + "grad_norm": 0.04261808097362518, + "learning_rate": 2.360996483484387e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44087888, + "step": 75995 + }, + { + "epoch": 11.319630622579684, + "grad_norm": 0.004892365075647831, + "learning_rate": 2.360672045184739e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44090736, + "step": 76000 + }, + { + "epoch": 11.320375335120643, + "grad_norm": 0.0012789571192115545, + "learning_rate": 2.3603476092388963e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44093776, + "step": 76005 + }, + { + "epoch": 11.321120047661603, + "grad_norm": 0.011084990575909615, + "learning_rate": 2.3600231756523373e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44096720, + "step": 76010 + }, + { + "epoch": 11.321864760202562, + "grad_norm": 0.011545833200216293, + "learning_rate": 2.3596987444305456e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44099536, + "step": 76015 + }, + { + "epoch": 11.32260947274352, + "grad_norm": 0.01246478222310543, + "learning_rate": 2.359374315579e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44102320, + "step": 76020 + }, + { + "epoch": 11.32335418528448, + "grad_norm": 0.000312662567012012, + "learning_rate": 2.3590498891031838e-05, + "loss": 0.002, + "num_input_tokens_seen": 44105136, + "step": 76025 + }, + { + "epoch": 11.32409889782544, + "grad_norm": 0.06231144443154335, + "learning_rate": 2.3587254650085757e-05, + "loss": 0.2253, + "num_input_tokens_seen": 44108080, + "step": 76030 + }, + { + "epoch": 11.324843610366399, + "grad_norm": 0.003913932014256716, + "learning_rate": 2.3584010433006577e-05, + "loss": 0.001, + "num_input_tokens_seen": 44111120, + "step": 76035 + }, + { + "epoch": 11.325588322907358, + "grad_norm": 0.05780377984046936, + "learning_rate": 2.3580766239849102e-05, + "loss": 0.031, + "num_input_tokens_seen": 44113840, + "step": 76040 + }, + { + "epoch": 11.326333035448316, + "grad_norm": 16.843542098999023, + "learning_rate": 2.3577522070668128e-05, + "loss": 0.0374, + "num_input_tokens_seen": 44116752, + "step": 76045 + }, + { + "epoch": 11.327077747989277, + "grad_norm": 9.552102088928223, + "learning_rate": 2.3574277925518488e-05, + "loss": 0.3231, + "num_input_tokens_seen": 44119568, + "step": 76050 + }, + { + "epoch": 11.327822460530236, + "grad_norm": 0.0012167624663561583, + "learning_rate": 2.357103380445496e-05, + "loss": 0.0007, + "num_input_tokens_seen": 44122576, + "step": 76055 + }, + { + "epoch": 11.328567173071194, + "grad_norm": 0.017460832372307777, + "learning_rate": 2.356778970753237e-05, + "loss": 0.0676, + "num_input_tokens_seen": 44125424, + "step": 76060 + }, + { + "epoch": 11.329311885612153, + "grad_norm": 0.0015300125814974308, + "learning_rate": 2.3564545634805516e-05, + "loss": 0.2386, + "num_input_tokens_seen": 44128528, + "step": 76065 + }, + { + "epoch": 11.330056598153114, + "grad_norm": 0.03982268646359444, + "learning_rate": 2.356130158632921e-05, + "loss": 0.0005, + "num_input_tokens_seen": 44131472, + "step": 76070 + }, + { + "epoch": 11.330801310694072, + "grad_norm": 0.0032517947256565094, + "learning_rate": 2.3558057562158247e-05, + "loss": 0.0131, + "num_input_tokens_seen": 44134512, + "step": 76075 + }, + { + "epoch": 11.331546023235031, + "grad_norm": 0.007411811966449022, + "learning_rate": 2.355481356234743e-05, + "loss": 0.0005, + "num_input_tokens_seen": 44137264, + "step": 76080 + }, + { + "epoch": 11.33229073577599, + "grad_norm": 0.11214381456375122, + "learning_rate": 2.355156958695158e-05, + "loss": 0.0006, + "num_input_tokens_seen": 44140496, + "step": 76085 + }, + { + "epoch": 11.33303544831695, + "grad_norm": 0.014226089231669903, + "learning_rate": 2.354832563602548e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44143184, + "step": 76090 + }, + { + "epoch": 11.33378016085791, + "grad_norm": 10.065576553344727, + "learning_rate": 2.3545081709623953e-05, + "loss": 0.141, + "num_input_tokens_seen": 44146000, + "step": 76095 + }, + { + "epoch": 11.334524873398868, + "grad_norm": 9.847402572631836, + "learning_rate": 2.354183780780178e-05, + "loss": 0.1601, + "num_input_tokens_seen": 44148752, + "step": 76100 + }, + { + "epoch": 11.335269585939827, + "grad_norm": 0.024494128301739693, + "learning_rate": 2.3538593930613784e-05, + "loss": 0.0101, + "num_input_tokens_seen": 44151920, + "step": 76105 + }, + { + "epoch": 11.336014298480787, + "grad_norm": 0.10002743452787399, + "learning_rate": 2.353535007811475e-05, + "loss": 0.0055, + "num_input_tokens_seen": 44154832, + "step": 76110 + }, + { + "epoch": 11.336759011021746, + "grad_norm": 0.01068259496241808, + "learning_rate": 2.3532106250359498e-05, + "loss": 0.2213, + "num_input_tokens_seen": 44157808, + "step": 76115 + }, + { + "epoch": 11.337503723562705, + "grad_norm": 0.05508003383874893, + "learning_rate": 2.3528862447402817e-05, + "loss": 0.0006, + "num_input_tokens_seen": 44160592, + "step": 76120 + }, + { + "epoch": 11.338248436103664, + "grad_norm": 0.0598532035946846, + "learning_rate": 2.3525618669299505e-05, + "loss": 0.1663, + "num_input_tokens_seen": 44163728, + "step": 76125 + }, + { + "epoch": 11.338993148644622, + "grad_norm": 0.009395002387464046, + "learning_rate": 2.3522374916104377e-05, + "loss": 0.2065, + "num_input_tokens_seen": 44166224, + "step": 76130 + }, + { + "epoch": 11.339737861185583, + "grad_norm": 0.020369810983538628, + "learning_rate": 2.3519131187872207e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44169136, + "step": 76135 + }, + { + "epoch": 11.340482573726542, + "grad_norm": 0.2682128846645355, + "learning_rate": 2.3515887484657823e-05, + "loss": 0.1457, + "num_input_tokens_seen": 44171920, + "step": 76140 + }, + { + "epoch": 11.3412272862675, + "grad_norm": 0.027781931683421135, + "learning_rate": 2.3512643806516e-05, + "loss": 0.0465, + "num_input_tokens_seen": 44174800, + "step": 76145 + }, + { + "epoch": 11.341971998808459, + "grad_norm": 0.021241413429379463, + "learning_rate": 2.3509400153501556e-05, + "loss": 0.1534, + "num_input_tokens_seen": 44177616, + "step": 76150 + }, + { + "epoch": 11.34271671134942, + "grad_norm": 0.0042065163142979145, + "learning_rate": 2.3506156525669286e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44180432, + "step": 76155 + }, + { + "epoch": 11.343461423890378, + "grad_norm": 16.756961822509766, + "learning_rate": 2.3502912923073976e-05, + "loss": 0.1658, + "num_input_tokens_seen": 44183280, + "step": 76160 + }, + { + "epoch": 11.344206136431337, + "grad_norm": 0.006900534965097904, + "learning_rate": 2.349966934577044e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44186352, + "step": 76165 + }, + { + "epoch": 11.344950848972296, + "grad_norm": 0.005154238548129797, + "learning_rate": 2.349642579381345e-05, + "loss": 0.2449, + "num_input_tokens_seen": 44189200, + "step": 76170 + }, + { + "epoch": 11.345695561513256, + "grad_norm": 1.3470773696899414, + "learning_rate": 2.349318226725783e-05, + "loss": 0.0005, + "num_input_tokens_seen": 44192144, + "step": 76175 + }, + { + "epoch": 11.346440274054215, + "grad_norm": 0.013217175379395485, + "learning_rate": 2.3489938766158354e-05, + "loss": 0.0013, + "num_input_tokens_seen": 44195184, + "step": 76180 + }, + { + "epoch": 11.347184986595174, + "grad_norm": 0.0012715827906504273, + "learning_rate": 2.3486695290569838e-05, + "loss": 0.0246, + "num_input_tokens_seen": 44198064, + "step": 76185 + }, + { + "epoch": 11.347929699136133, + "grad_norm": 0.05161653831601143, + "learning_rate": 2.348345184054706e-05, + "loss": 0.0017, + "num_input_tokens_seen": 44200944, + "step": 76190 + }, + { + "epoch": 11.348674411677093, + "grad_norm": 0.12476936727762222, + "learning_rate": 2.3480208416144832e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44203664, + "step": 76195 + }, + { + "epoch": 11.349419124218052, + "grad_norm": 0.008639196865260601, + "learning_rate": 2.3476965017417935e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44206416, + "step": 76200 + }, + { + "epoch": 11.35016383675901, + "grad_norm": 0.2552105188369751, + "learning_rate": 2.3473721644421155e-05, + "loss": 0.0007, + "num_input_tokens_seen": 44209552, + "step": 76205 + }, + { + "epoch": 11.35090854929997, + "grad_norm": 0.0029568346217274666, + "learning_rate": 2.3470478297209307e-05, + "loss": 0.1689, + "num_input_tokens_seen": 44212432, + "step": 76210 + }, + { + "epoch": 11.35165326184093, + "grad_norm": 0.03954337537288666, + "learning_rate": 2.3467234975837162e-05, + "loss": 0.0272, + "num_input_tokens_seen": 44215408, + "step": 76215 + }, + { + "epoch": 11.352397974381889, + "grad_norm": 0.004033967386931181, + "learning_rate": 2.3463991680359536e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44218320, + "step": 76220 + }, + { + "epoch": 11.353142686922848, + "grad_norm": 0.0028792403172701597, + "learning_rate": 2.346074841083121e-05, + "loss": 0.0015, + "num_input_tokens_seen": 44221072, + "step": 76225 + }, + { + "epoch": 11.353887399463806, + "grad_norm": 62.755401611328125, + "learning_rate": 2.345750516730697e-05, + "loss": 0.2095, + "num_input_tokens_seen": 44224144, + "step": 76230 + }, + { + "epoch": 11.354632112004767, + "grad_norm": 0.0076406775042414665, + "learning_rate": 2.3454261949841622e-05, + "loss": 0.0253, + "num_input_tokens_seen": 44226768, + "step": 76235 + }, + { + "epoch": 11.355376824545726, + "grad_norm": 0.002469029277563095, + "learning_rate": 2.3451018758489932e-05, + "loss": 0.0009, + "num_input_tokens_seen": 44229552, + "step": 76240 + }, + { + "epoch": 11.356121537086684, + "grad_norm": 0.008737596683204174, + "learning_rate": 2.3447775593306716e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44232752, + "step": 76245 + }, + { + "epoch": 11.356866249627643, + "grad_norm": 0.010473373346030712, + "learning_rate": 2.3444532454346745e-05, + "loss": 0.0416, + "num_input_tokens_seen": 44235760, + "step": 76250 + }, + { + "epoch": 11.357610962168604, + "grad_norm": 0.4300236105918884, + "learning_rate": 2.3441289341664822e-05, + "loss": 0.111, + "num_input_tokens_seen": 44238928, + "step": 76255 + }, + { + "epoch": 11.358355674709562, + "grad_norm": 0.0003822463913820684, + "learning_rate": 2.3438046255315735e-05, + "loss": 0.0, + "num_input_tokens_seen": 44242032, + "step": 76260 + }, + { + "epoch": 11.359100387250521, + "grad_norm": 0.014949607662856579, + "learning_rate": 2.3434803195354268e-05, + "loss": 0.1409, + "num_input_tokens_seen": 44244720, + "step": 76265 + }, + { + "epoch": 11.35984509979148, + "grad_norm": 0.006796491798013449, + "learning_rate": 2.3431560161835204e-05, + "loss": 0.1688, + "num_input_tokens_seen": 44247504, + "step": 76270 + }, + { + "epoch": 11.36058981233244, + "grad_norm": 0.25095632672309875, + "learning_rate": 2.3428317154813344e-05, + "loss": 0.2252, + "num_input_tokens_seen": 44250544, + "step": 76275 + }, + { + "epoch": 11.3613345248734, + "grad_norm": 0.7199733853340149, + "learning_rate": 2.342507417434347e-05, + "loss": 0.1951, + "num_input_tokens_seen": 44253584, + "step": 76280 + }, + { + "epoch": 11.362079237414358, + "grad_norm": 0.00872697401791811, + "learning_rate": 2.3421831220480357e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44256528, + "step": 76285 + }, + { + "epoch": 11.362823949955317, + "grad_norm": 32.11474609375, + "learning_rate": 2.341858829327881e-05, + "loss": 0.1658, + "num_input_tokens_seen": 44259408, + "step": 76290 + }, + { + "epoch": 11.363568662496277, + "grad_norm": 0.006766339298337698, + "learning_rate": 2.34153453927936e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44262256, + "step": 76295 + }, + { + "epoch": 11.364313375037236, + "grad_norm": 0.05761308595538139, + "learning_rate": 2.341210251907953e-05, + "loss": 0.0006, + "num_input_tokens_seen": 44265104, + "step": 76300 + }, + { + "epoch": 11.365058087578195, + "grad_norm": 0.02169845625758171, + "learning_rate": 2.340885967219136e-05, + "loss": 0.0338, + "num_input_tokens_seen": 44267824, + "step": 76305 + }, + { + "epoch": 11.365802800119154, + "grad_norm": 0.0021321780513972044, + "learning_rate": 2.3405616852183902e-05, + "loss": 0.0078, + "num_input_tokens_seen": 44270928, + "step": 76310 + }, + { + "epoch": 11.366547512660112, + "grad_norm": 0.008732498623430729, + "learning_rate": 2.3402374059111912e-05, + "loss": 0.003, + "num_input_tokens_seen": 44273712, + "step": 76315 + }, + { + "epoch": 11.367292225201073, + "grad_norm": 0.003083673072978854, + "learning_rate": 2.3399131293030204e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44276304, + "step": 76320 + }, + { + "epoch": 11.368036937742032, + "grad_norm": 10.915775299072266, + "learning_rate": 2.339588855399354e-05, + "loss": 0.6047, + "num_input_tokens_seen": 44279056, + "step": 76325 + }, + { + "epoch": 11.36878165028299, + "grad_norm": 0.19120532274246216, + "learning_rate": 2.3392645842056707e-05, + "loss": 0.127, + "num_input_tokens_seen": 44282000, + "step": 76330 + }, + { + "epoch": 11.36952636282395, + "grad_norm": 0.012862416915595531, + "learning_rate": 2.338940315727449e-05, + "loss": 0.1752, + "num_input_tokens_seen": 44284944, + "step": 76335 + }, + { + "epoch": 11.37027107536491, + "grad_norm": 21.23243522644043, + "learning_rate": 2.3386160499701663e-05, + "loss": 0.096, + "num_input_tokens_seen": 44288048, + "step": 76340 + }, + { + "epoch": 11.371015787905868, + "grad_norm": 0.031176352873444557, + "learning_rate": 2.3382917869393027e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44290896, + "step": 76345 + }, + { + "epoch": 11.371760500446827, + "grad_norm": 0.012834112159907818, + "learning_rate": 2.3379675266403335e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44294192, + "step": 76350 + }, + { + "epoch": 11.372505212987786, + "grad_norm": 0.04528094828128815, + "learning_rate": 2.3376432690787396e-05, + "loss": 0.0973, + "num_input_tokens_seen": 44297296, + "step": 76355 + }, + { + "epoch": 11.373249925528746, + "grad_norm": 0.0022785670589655638, + "learning_rate": 2.3373190142599973e-05, + "loss": 0.0005, + "num_input_tokens_seen": 44300016, + "step": 76360 + }, + { + "epoch": 11.373994638069705, + "grad_norm": 0.6146847605705261, + "learning_rate": 2.3369947621895845e-05, + "loss": 0.1943, + "num_input_tokens_seen": 44302864, + "step": 76365 + }, + { + "epoch": 11.374739350610664, + "grad_norm": 0.021256931126117706, + "learning_rate": 2.3366705128729805e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44305616, + "step": 76370 + }, + { + "epoch": 11.375484063151623, + "grad_norm": 0.2542828917503357, + "learning_rate": 2.3363462663156606e-05, + "loss": 0.001, + "num_input_tokens_seen": 44308816, + "step": 76375 + }, + { + "epoch": 11.376228775692583, + "grad_norm": 0.049481429159641266, + "learning_rate": 2.3360220225231057e-05, + "loss": 0.1103, + "num_input_tokens_seen": 44311824, + "step": 76380 + }, + { + "epoch": 11.376973488233542, + "grad_norm": 143.51303100585938, + "learning_rate": 2.335697781500791e-05, + "loss": 0.142, + "num_input_tokens_seen": 44314864, + "step": 76385 + }, + { + "epoch": 11.3777182007745, + "grad_norm": 0.37358805537223816, + "learning_rate": 2.3353735432541957e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44317776, + "step": 76390 + }, + { + "epoch": 11.37846291331546, + "grad_norm": 40.69533920288086, + "learning_rate": 2.335049307788797e-05, + "loss": 0.2073, + "num_input_tokens_seen": 44320688, + "step": 76395 + }, + { + "epoch": 11.37920762585642, + "grad_norm": 0.010502141900360584, + "learning_rate": 2.334725075110073e-05, + "loss": 0.0238, + "num_input_tokens_seen": 44323632, + "step": 76400 + }, + { + "epoch": 11.379952338397379, + "grad_norm": 0.03794795647263527, + "learning_rate": 2.3344008452235008e-05, + "loss": 0.0006, + "num_input_tokens_seen": 44326416, + "step": 76405 + }, + { + "epoch": 11.380697050938338, + "grad_norm": 0.0121386107057333, + "learning_rate": 2.3340766181345572e-05, + "loss": 0.0854, + "num_input_tokens_seen": 44329328, + "step": 76410 + }, + { + "epoch": 11.381441763479296, + "grad_norm": 0.047576531767845154, + "learning_rate": 2.3337523938487214e-05, + "loss": 0.0083, + "num_input_tokens_seen": 44332176, + "step": 76415 + }, + { + "epoch": 11.382186476020257, + "grad_norm": 0.007219336926937103, + "learning_rate": 2.3334281723714694e-05, + "loss": 0.0008, + "num_input_tokens_seen": 44335088, + "step": 76420 + }, + { + "epoch": 11.382931188561216, + "grad_norm": 0.019124170765280724, + "learning_rate": 2.3331039537082796e-05, + "loss": 0.0015, + "num_input_tokens_seen": 44338096, + "step": 76425 + }, + { + "epoch": 11.383675901102174, + "grad_norm": 0.0063271899707615376, + "learning_rate": 2.332779737864628e-05, + "loss": 0.0005, + "num_input_tokens_seen": 44341040, + "step": 76430 + }, + { + "epoch": 11.384420613643133, + "grad_norm": 0.002700116951018572, + "learning_rate": 2.3324555248459938e-05, + "loss": 0.145, + "num_input_tokens_seen": 44344240, + "step": 76435 + }, + { + "epoch": 11.385165326184094, + "grad_norm": 0.01887996681034565, + "learning_rate": 2.3321313146578532e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44346928, + "step": 76440 + }, + { + "epoch": 11.385910038725052, + "grad_norm": 0.027086833491921425, + "learning_rate": 2.3318071073056826e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44349872, + "step": 76445 + }, + { + "epoch": 11.386654751266011, + "grad_norm": 13.756084442138672, + "learning_rate": 2.3314829027949606e-05, + "loss": 0.2649, + "num_input_tokens_seen": 44352624, + "step": 76450 + }, + { + "epoch": 11.38739946380697, + "grad_norm": 0.0052628242410719395, + "learning_rate": 2.3311587011311634e-05, + "loss": 0.0012, + "num_input_tokens_seen": 44355600, + "step": 76455 + }, + { + "epoch": 11.38814417634793, + "grad_norm": 0.018343722447752953, + "learning_rate": 2.330834502319769e-05, + "loss": 0.0008, + "num_input_tokens_seen": 44358384, + "step": 76460 + }, + { + "epoch": 11.38888888888889, + "grad_norm": 0.036444053053855896, + "learning_rate": 2.3305103063662522e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44361584, + "step": 76465 + }, + { + "epoch": 11.389633601429848, + "grad_norm": 0.06960591673851013, + "learning_rate": 2.330186113276093e-05, + "loss": 0.2322, + "num_input_tokens_seen": 44364976, + "step": 76470 + }, + { + "epoch": 11.390378313970807, + "grad_norm": 0.006406502798199654, + "learning_rate": 2.3298619230547656e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44367856, + "step": 76475 + }, + { + "epoch": 11.391123026511767, + "grad_norm": 0.01080517191439867, + "learning_rate": 2.329537735707749e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44370608, + "step": 76480 + }, + { + "epoch": 11.391867739052726, + "grad_norm": 7.44996452331543, + "learning_rate": 2.3292135512405198e-05, + "loss": 0.0431, + "num_input_tokens_seen": 44373456, + "step": 76485 + }, + { + "epoch": 11.392612451593685, + "grad_norm": 3.2427706718444824, + "learning_rate": 2.3288893696585528e-05, + "loss": 0.0475, + "num_input_tokens_seen": 44376368, + "step": 76490 + }, + { + "epoch": 11.393357164134644, + "grad_norm": 20.265378952026367, + "learning_rate": 2.328565190967327e-05, + "loss": 0.0112, + "num_input_tokens_seen": 44379504, + "step": 76495 + }, + { + "epoch": 11.394101876675602, + "grad_norm": 0.04682615026831627, + "learning_rate": 2.3282410151723167e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44382256, + "step": 76500 + }, + { + "epoch": 11.394846589216563, + "grad_norm": 0.056320056319236755, + "learning_rate": 2.327916842279001e-05, + "loss": 0.0034, + "num_input_tokens_seen": 44385264, + "step": 76505 + }, + { + "epoch": 11.395591301757522, + "grad_norm": 33.595947265625, + "learning_rate": 2.3275926722928542e-05, + "loss": 0.0802, + "num_input_tokens_seen": 44387920, + "step": 76510 + }, + { + "epoch": 11.39633601429848, + "grad_norm": 0.0016830215463414788, + "learning_rate": 2.327268505219355e-05, + "loss": 0.011, + "num_input_tokens_seen": 44390704, + "step": 76515 + }, + { + "epoch": 11.39708072683944, + "grad_norm": 0.007295523304492235, + "learning_rate": 2.326944341063979e-05, + "loss": 0.0012, + "num_input_tokens_seen": 44393840, + "step": 76520 + }, + { + "epoch": 11.3978254393804, + "grad_norm": 0.002308064606040716, + "learning_rate": 2.326620179832202e-05, + "loss": 0.0005, + "num_input_tokens_seen": 44397040, + "step": 76525 + }, + { + "epoch": 11.398570151921358, + "grad_norm": 0.0006179005722515285, + "learning_rate": 2.3262960215295014e-05, + "loss": 0.0011, + "num_input_tokens_seen": 44400304, + "step": 76530 + }, + { + "epoch": 11.399314864462317, + "grad_norm": 0.0026578400284051895, + "learning_rate": 2.3259718661613518e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44403024, + "step": 76535 + }, + { + "epoch": 11.400059577003276, + "grad_norm": 0.010314581915736198, + "learning_rate": 2.3256477137332315e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44406320, + "step": 76540 + }, + { + "epoch": 11.400804289544237, + "grad_norm": 0.39584678411483765, + "learning_rate": 2.325323564250615e-05, + "loss": 0.4663, + "num_input_tokens_seen": 44409488, + "step": 76545 + }, + { + "epoch": 11.401549002085195, + "grad_norm": 0.012267413549125195, + "learning_rate": 2.324999417718981e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44412368, + "step": 76550 + }, + { + "epoch": 11.402293714626154, + "grad_norm": 0.004122724756598473, + "learning_rate": 2.3246752741438026e-05, + "loss": 0.0921, + "num_input_tokens_seen": 44415344, + "step": 76555 + }, + { + "epoch": 11.403038427167113, + "grad_norm": 0.00522263441234827, + "learning_rate": 2.324351133530558e-05, + "loss": 0.1813, + "num_input_tokens_seen": 44418160, + "step": 76560 + }, + { + "epoch": 11.403783139708073, + "grad_norm": 0.10572753101587296, + "learning_rate": 2.3240269958847226e-05, + "loss": 0.166, + "num_input_tokens_seen": 44421200, + "step": 76565 + }, + { + "epoch": 11.404527852249032, + "grad_norm": 0.011183897033333778, + "learning_rate": 2.3237028612117712e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44424048, + "step": 76570 + }, + { + "epoch": 11.40527256478999, + "grad_norm": 0.0015150101389735937, + "learning_rate": 2.3233787295171818e-05, + "loss": 0.0008, + "num_input_tokens_seen": 44426736, + "step": 76575 + }, + { + "epoch": 11.40601727733095, + "grad_norm": 0.012353903613984585, + "learning_rate": 2.323054600806428e-05, + "loss": 0.0009, + "num_input_tokens_seen": 44429712, + "step": 76580 + }, + { + "epoch": 11.40676198987191, + "grad_norm": 0.07290713489055634, + "learning_rate": 2.322730475084988e-05, + "loss": 0.0654, + "num_input_tokens_seen": 44432656, + "step": 76585 + }, + { + "epoch": 11.407506702412869, + "grad_norm": 0.015712816268205643, + "learning_rate": 2.3224063523583363e-05, + "loss": 0.1659, + "num_input_tokens_seen": 44435472, + "step": 76590 + }, + { + "epoch": 11.408251414953828, + "grad_norm": 0.004006551578640938, + "learning_rate": 2.322082232631949e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44438128, + "step": 76595 + }, + { + "epoch": 11.408996127494786, + "grad_norm": 0.0016673763748258352, + "learning_rate": 2.3217581159113016e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44440848, + "step": 76600 + }, + { + "epoch": 11.409740840035747, + "grad_norm": 0.0006981572369113564, + "learning_rate": 2.3214340022018688e-05, + "loss": 0.1906, + "num_input_tokens_seen": 44443792, + "step": 76605 + }, + { + "epoch": 11.410485552576706, + "grad_norm": 0.013815406709909439, + "learning_rate": 2.321109891509128e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44446416, + "step": 76610 + }, + { + "epoch": 11.411230265117664, + "grad_norm": 0.021804533898830414, + "learning_rate": 2.3207857838385524e-05, + "loss": 0.0864, + "num_input_tokens_seen": 44448944, + "step": 76615 + }, + { + "epoch": 11.411974977658623, + "grad_norm": 0.44836920499801636, + "learning_rate": 2.32046167919562e-05, + "loss": 0.0013, + "num_input_tokens_seen": 44452304, + "step": 76620 + }, + { + "epoch": 11.412719690199584, + "grad_norm": 0.0013542165979743004, + "learning_rate": 2.320137577585805e-05, + "loss": 0.0942, + "num_input_tokens_seen": 44455184, + "step": 76625 + }, + { + "epoch": 11.413464402740543, + "grad_norm": 0.0017617723206058145, + "learning_rate": 2.319813479014583e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44458224, + "step": 76630 + }, + { + "epoch": 11.414209115281501, + "grad_norm": 0.001386379124596715, + "learning_rate": 2.319489383487428e-05, + "loss": 0.0006, + "num_input_tokens_seen": 44461072, + "step": 76635 + }, + { + "epoch": 11.41495382782246, + "grad_norm": 0.13767197728157043, + "learning_rate": 2.3191652910098174e-05, + "loss": 0.2793, + "num_input_tokens_seen": 44463920, + "step": 76640 + }, + { + "epoch": 11.41569854036342, + "grad_norm": 0.007642973680049181, + "learning_rate": 2.3188412015872258e-05, + "loss": 0.0005, + "num_input_tokens_seen": 44466800, + "step": 76645 + }, + { + "epoch": 11.41644325290438, + "grad_norm": 0.03938139230012894, + "learning_rate": 2.3185171152251265e-05, + "loss": 0.0185, + "num_input_tokens_seen": 44469648, + "step": 76650 + }, + { + "epoch": 11.417187965445338, + "grad_norm": 0.003477145219221711, + "learning_rate": 2.3181930319289975e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44473072, + "step": 76655 + }, + { + "epoch": 11.417932677986297, + "grad_norm": 0.007500484585762024, + "learning_rate": 2.3178689517043116e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44476176, + "step": 76660 + }, + { + "epoch": 11.418677390527257, + "grad_norm": 0.0009840327547863126, + "learning_rate": 2.3175448745565454e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44479024, + "step": 76665 + }, + { + "epoch": 11.419422103068216, + "grad_norm": 0.021037790924310684, + "learning_rate": 2.317220800491172e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44482064, + "step": 76670 + }, + { + "epoch": 11.420166815609175, + "grad_norm": 0.004547678399831057, + "learning_rate": 2.3168967295136685e-05, + "loss": 0.0015, + "num_input_tokens_seen": 44485264, + "step": 76675 + }, + { + "epoch": 11.420911528150134, + "grad_norm": 0.0009820472914725542, + "learning_rate": 2.3165726616295083e-05, + "loss": 0.0, + "num_input_tokens_seen": 44488080, + "step": 76680 + }, + { + "epoch": 11.421656240691092, + "grad_norm": 0.0017430315492674708, + "learning_rate": 2.316248596844166e-05, + "loss": 0.1068, + "num_input_tokens_seen": 44490736, + "step": 76685 + }, + { + "epoch": 11.422400953232053, + "grad_norm": 0.003391234204173088, + "learning_rate": 2.3159245351631176e-05, + "loss": 0.0893, + "num_input_tokens_seen": 44493968, + "step": 76690 + }, + { + "epoch": 11.423145665773012, + "grad_norm": 0.03057353012263775, + "learning_rate": 2.315600476591837e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44497168, + "step": 76695 + }, + { + "epoch": 11.42389037831397, + "grad_norm": 0.4824768602848053, + "learning_rate": 2.3152764211357988e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44499824, + "step": 76700 + }, + { + "epoch": 11.42463509085493, + "grad_norm": 0.0018973445985466242, + "learning_rate": 2.314952368800477e-05, + "loss": 0.0285, + "num_input_tokens_seen": 44502768, + "step": 76705 + }, + { + "epoch": 11.42537980339589, + "grad_norm": 0.0029523868579417467, + "learning_rate": 2.3146283195913482e-05, + "loss": 0.0501, + "num_input_tokens_seen": 44505456, + "step": 76710 + }, + { + "epoch": 11.426124515936849, + "grad_norm": 0.24473145604133606, + "learning_rate": 2.3143042735138848e-05, + "loss": 0.0205, + "num_input_tokens_seen": 44508144, + "step": 76715 + }, + { + "epoch": 11.426869228477807, + "grad_norm": 0.01222192496061325, + "learning_rate": 2.3139802305735618e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44511056, + "step": 76720 + }, + { + "epoch": 11.427613941018766, + "grad_norm": 0.05385104939341545, + "learning_rate": 2.3136561907758543e-05, + "loss": 0.1805, + "num_input_tokens_seen": 44514000, + "step": 76725 + }, + { + "epoch": 11.428358653559727, + "grad_norm": 0.03721287101507187, + "learning_rate": 2.3133321541262356e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44517104, + "step": 76730 + }, + { + "epoch": 11.429103366100685, + "grad_norm": 7.431882858276367, + "learning_rate": 2.3130081206301812e-05, + "loss": 0.03, + "num_input_tokens_seen": 44519696, + "step": 76735 + }, + { + "epoch": 11.429848078641644, + "grad_norm": 0.22516104578971863, + "learning_rate": 2.3126840902931633e-05, + "loss": 0.085, + "num_input_tokens_seen": 44522384, + "step": 76740 + }, + { + "epoch": 11.430592791182603, + "grad_norm": 0.0030331164598464966, + "learning_rate": 2.312360063120658e-05, + "loss": 0.2544, + "num_input_tokens_seen": 44525232, + "step": 76745 + }, + { + "epoch": 11.431337503723563, + "grad_norm": 0.0029574621003121138, + "learning_rate": 2.3120360391181388e-05, + "loss": 0.0138, + "num_input_tokens_seen": 44528176, + "step": 76750 + }, + { + "epoch": 11.432082216264522, + "grad_norm": 0.2612142264842987, + "learning_rate": 2.31171201829108e-05, + "loss": 0.0006, + "num_input_tokens_seen": 44531152, + "step": 76755 + }, + { + "epoch": 11.432826928805481, + "grad_norm": 0.1126304566860199, + "learning_rate": 2.3113880006449547e-05, + "loss": 0.1317, + "num_input_tokens_seen": 44534160, + "step": 76760 + }, + { + "epoch": 11.43357164134644, + "grad_norm": 0.0017519730608910322, + "learning_rate": 2.3110639861852373e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44536688, + "step": 76765 + }, + { + "epoch": 11.4343163538874, + "grad_norm": 11.082038879394531, + "learning_rate": 2.3107399749174027e-05, + "loss": 0.1775, + "num_input_tokens_seen": 44539536, + "step": 76770 + }, + { + "epoch": 11.435061066428359, + "grad_norm": 0.21248216927051544, + "learning_rate": 2.3104159668469226e-05, + "loss": 0.0006, + "num_input_tokens_seen": 44542320, + "step": 76775 + }, + { + "epoch": 11.435805778969318, + "grad_norm": 0.05444657430052757, + "learning_rate": 2.3100919619792733e-05, + "loss": 0.0381, + "num_input_tokens_seen": 44545104, + "step": 76780 + }, + { + "epoch": 11.436550491510276, + "grad_norm": 12.6268892288208, + "learning_rate": 2.3097679603199267e-05, + "loss": 0.0313, + "num_input_tokens_seen": 44548080, + "step": 76785 + }, + { + "epoch": 11.437295204051237, + "grad_norm": 2.3808019161224365, + "learning_rate": 2.3094439618743572e-05, + "loss": 0.0241, + "num_input_tokens_seen": 44551312, + "step": 76790 + }, + { + "epoch": 11.438039916592196, + "grad_norm": 0.033909160643815994, + "learning_rate": 2.3091199666480377e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44554480, + "step": 76795 + }, + { + "epoch": 11.438784629133155, + "grad_norm": 0.008825031109154224, + "learning_rate": 2.3087959746464432e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44557296, + "step": 76800 + }, + { + "epoch": 11.439529341674113, + "grad_norm": 0.05802488699555397, + "learning_rate": 2.3084719858750464e-05, + "loss": 0.0513, + "num_input_tokens_seen": 44560208, + "step": 76805 + }, + { + "epoch": 11.440274054215074, + "grad_norm": 0.0006726934225298464, + "learning_rate": 2.3081480003393198e-05, + "loss": 0.183, + "num_input_tokens_seen": 44563152, + "step": 76810 + }, + { + "epoch": 11.441018766756033, + "grad_norm": 0.01808520033955574, + "learning_rate": 2.3078240180447384e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44565968, + "step": 76815 + }, + { + "epoch": 11.441763479296991, + "grad_norm": 0.001622881623916328, + "learning_rate": 2.307500038996775e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44568560, + "step": 76820 + }, + { + "epoch": 11.44250819183795, + "grad_norm": 0.0063048419542610645, + "learning_rate": 2.3071760632009028e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44571312, + "step": 76825 + }, + { + "epoch": 11.443252904378909, + "grad_norm": 0.05818604677915573, + "learning_rate": 2.3068520906625943e-05, + "loss": 0.0026, + "num_input_tokens_seen": 44573936, + "step": 76830 + }, + { + "epoch": 11.44399761691987, + "grad_norm": 0.008780961856245995, + "learning_rate": 2.306528121387324e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44577008, + "step": 76835 + }, + { + "epoch": 11.444742329460828, + "grad_norm": 0.06957795470952988, + "learning_rate": 2.306204155380565e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44579856, + "step": 76840 + }, + { + "epoch": 11.445487042001787, + "grad_norm": 0.057054538279771805, + "learning_rate": 2.3058801926477885e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44582992, + "step": 76845 + }, + { + "epoch": 11.446231754542746, + "grad_norm": 0.0019359526922926307, + "learning_rate": 2.3055562331944703e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44585904, + "step": 76850 + }, + { + "epoch": 11.446976467083706, + "grad_norm": 7.943784236907959, + "learning_rate": 2.3052322770260808e-05, + "loss": 0.0029, + "num_input_tokens_seen": 44588976, + "step": 76855 + }, + { + "epoch": 11.447721179624665, + "grad_norm": 0.016481122002005577, + "learning_rate": 2.3049083241480948e-05, + "loss": 0.0016, + "num_input_tokens_seen": 44591664, + "step": 76860 + }, + { + "epoch": 11.448465892165624, + "grad_norm": 0.031031077727675438, + "learning_rate": 2.3045843745659834e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44594576, + "step": 76865 + }, + { + "epoch": 11.449210604706582, + "grad_norm": 0.006610472686588764, + "learning_rate": 2.3042604282852215e-05, + "loss": 0.2096, + "num_input_tokens_seen": 44597552, + "step": 76870 + }, + { + "epoch": 11.449955317247543, + "grad_norm": 0.0896843895316124, + "learning_rate": 2.3039364853112794e-05, + "loss": 0.0317, + "num_input_tokens_seen": 44600368, + "step": 76875 + }, + { + "epoch": 11.450700029788502, + "grad_norm": 0.0007801431347616017, + "learning_rate": 2.3036125456496324e-05, + "loss": 0.0995, + "num_input_tokens_seen": 44603440, + "step": 76880 + }, + { + "epoch": 11.45144474232946, + "grad_norm": 0.012122096493840218, + "learning_rate": 2.303288609305752e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44606288, + "step": 76885 + }, + { + "epoch": 11.45218945487042, + "grad_norm": 0.015407298691570759, + "learning_rate": 2.3029646762851096e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44609040, + "step": 76890 + }, + { + "epoch": 11.45293416741138, + "grad_norm": 0.008606181479990482, + "learning_rate": 2.3026407465931797e-05, + "loss": 0.0005, + "num_input_tokens_seen": 44611984, + "step": 76895 + }, + { + "epoch": 11.453678879952339, + "grad_norm": 0.005616229027509689, + "learning_rate": 2.3023168202354324e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44615152, + "step": 76900 + }, + { + "epoch": 11.454423592493297, + "grad_norm": 10.223382949829102, + "learning_rate": 2.301992897217343e-05, + "loss": 0.0806, + "num_input_tokens_seen": 44618064, + "step": 76905 + }, + { + "epoch": 11.455168305034256, + "grad_norm": 0.022146519273519516, + "learning_rate": 2.3016689775443806e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44620976, + "step": 76910 + }, + { + "epoch": 11.455913017575217, + "grad_norm": 0.006812945939600468, + "learning_rate": 2.3013450612220207e-05, + "loss": 0.1912, + "num_input_tokens_seen": 44623664, + "step": 76915 + }, + { + "epoch": 11.456657730116175, + "grad_norm": 0.0514771044254303, + "learning_rate": 2.3010211482557335e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44626544, + "step": 76920 + }, + { + "epoch": 11.457402442657134, + "grad_norm": 0.0064034233801066875, + "learning_rate": 2.3006972386509925e-05, + "loss": 0.0032, + "num_input_tokens_seen": 44629584, + "step": 76925 + }, + { + "epoch": 11.458147155198093, + "grad_norm": 93.24760437011719, + "learning_rate": 2.3003733324132693e-05, + "loss": 0.1541, + "num_input_tokens_seen": 44632592, + "step": 76930 + }, + { + "epoch": 11.458891867739053, + "grad_norm": 0.01988011598587036, + "learning_rate": 2.300049429548034e-05, + "loss": 0.2352, + "num_input_tokens_seen": 44635344, + "step": 76935 + }, + { + "epoch": 11.459636580280012, + "grad_norm": 0.0925959050655365, + "learning_rate": 2.299725530060762e-05, + "loss": 0.0163, + "num_input_tokens_seen": 44638160, + "step": 76940 + }, + { + "epoch": 11.460381292820971, + "grad_norm": 0.0219105314463377, + "learning_rate": 2.2994016339569224e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44641072, + "step": 76945 + }, + { + "epoch": 11.46112600536193, + "grad_norm": 0.01295513566583395, + "learning_rate": 2.2990777412419892e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44643856, + "step": 76950 + }, + { + "epoch": 11.46187071790289, + "grad_norm": 0.023803306743502617, + "learning_rate": 2.298753851921433e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44647056, + "step": 76955 + }, + { + "epoch": 11.462615430443849, + "grad_norm": 2.888948917388916, + "learning_rate": 2.2984299660007263e-05, + "loss": 0.0011, + "num_input_tokens_seen": 44650096, + "step": 76960 + }, + { + "epoch": 11.463360142984808, + "grad_norm": 0.03359922766685486, + "learning_rate": 2.2981060834853406e-05, + "loss": 0.2194, + "num_input_tokens_seen": 44653168, + "step": 76965 + }, + { + "epoch": 11.464104855525767, + "grad_norm": 0.016730155795812607, + "learning_rate": 2.2977822043807466e-05, + "loss": 0.0451, + "num_input_tokens_seen": 44656272, + "step": 76970 + }, + { + "epoch": 11.464849568066727, + "grad_norm": 0.0039491914212703705, + "learning_rate": 2.2974583286924176e-05, + "loss": 0.1208, + "num_input_tokens_seen": 44659088, + "step": 76975 + }, + { + "epoch": 11.465594280607686, + "grad_norm": 0.13344673812389374, + "learning_rate": 2.297134456425823e-05, + "loss": 0.0007, + "num_input_tokens_seen": 44662224, + "step": 76980 + }, + { + "epoch": 11.466338993148645, + "grad_norm": 2.610919237136841, + "learning_rate": 2.2968105875864368e-05, + "loss": 0.1506, + "num_input_tokens_seen": 44664848, + "step": 76985 + }, + { + "epoch": 11.467083705689603, + "grad_norm": 0.1203191950917244, + "learning_rate": 2.2964867221797286e-05, + "loss": 0.0386, + "num_input_tokens_seen": 44667888, + "step": 76990 + }, + { + "epoch": 11.467828418230564, + "grad_norm": 13.98852252960205, + "learning_rate": 2.296162860211171e-05, + "loss": 0.1386, + "num_input_tokens_seen": 44670768, + "step": 76995 + }, + { + "epoch": 11.468573130771523, + "grad_norm": 0.22512002289295197, + "learning_rate": 2.2958390016862335e-05, + "loss": 0.0032, + "num_input_tokens_seen": 44673520, + "step": 77000 + }, + { + "epoch": 11.469317843312481, + "grad_norm": 0.017766552045941353, + "learning_rate": 2.29551514661039e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44676528, + "step": 77005 + }, + { + "epoch": 11.47006255585344, + "grad_norm": 0.023518210276961327, + "learning_rate": 2.2951912949891098e-05, + "loss": 0.1352, + "num_input_tokens_seen": 44679536, + "step": 77010 + }, + { + "epoch": 11.470807268394399, + "grad_norm": 0.10665545612573624, + "learning_rate": 2.294867446827864e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44682320, + "step": 77015 + }, + { + "epoch": 11.47155198093536, + "grad_norm": 0.028281619772315025, + "learning_rate": 2.294543602132125e-05, + "loss": 0.0613, + "num_input_tokens_seen": 44685232, + "step": 77020 + }, + { + "epoch": 11.472296693476318, + "grad_norm": 0.046124447137117386, + "learning_rate": 2.2942197609073624e-05, + "loss": 0.1705, + "num_input_tokens_seen": 44687856, + "step": 77025 + }, + { + "epoch": 11.473041406017277, + "grad_norm": 0.013022051192820072, + "learning_rate": 2.2938959231590483e-05, + "loss": 0.1599, + "num_input_tokens_seen": 44691056, + "step": 77030 + }, + { + "epoch": 11.473786118558236, + "grad_norm": 0.00800981093198061, + "learning_rate": 2.2935720888926522e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44693808, + "step": 77035 + }, + { + "epoch": 11.474530831099196, + "grad_norm": 0.001542345737107098, + "learning_rate": 2.2932482581136466e-05, + "loss": 0.0057, + "num_input_tokens_seen": 44696496, + "step": 77040 + }, + { + "epoch": 11.475275543640155, + "grad_norm": 0.1508546769618988, + "learning_rate": 2.292924430827502e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44699376, + "step": 77045 + }, + { + "epoch": 11.476020256181114, + "grad_norm": 0.006356789730489254, + "learning_rate": 2.292600607039687e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44702256, + "step": 77050 + }, + { + "epoch": 11.476764968722073, + "grad_norm": 0.03978722169995308, + "learning_rate": 2.2922767867556755e-05, + "loss": 0.0927, + "num_input_tokens_seen": 44704944, + "step": 77055 + }, + { + "epoch": 11.477509681263033, + "grad_norm": 0.005131382495164871, + "learning_rate": 2.291952969980936e-05, + "loss": 0.1593, + "num_input_tokens_seen": 44707888, + "step": 77060 + }, + { + "epoch": 11.478254393803992, + "grad_norm": 0.0032236985862255096, + "learning_rate": 2.29162915672094e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44710448, + "step": 77065 + }, + { + "epoch": 11.47899910634495, + "grad_norm": 0.00284197018481791, + "learning_rate": 2.2913053469811568e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44713360, + "step": 77070 + }, + { + "epoch": 11.47974381888591, + "grad_norm": 0.8567109107971191, + "learning_rate": 2.2909815407670584e-05, + "loss": 0.0008, + "num_input_tokens_seen": 44716080, + "step": 77075 + }, + { + "epoch": 11.48048853142687, + "grad_norm": 5.166972637176514, + "learning_rate": 2.2906577380841143e-05, + "loss": 0.0034, + "num_input_tokens_seen": 44718928, + "step": 77080 + }, + { + "epoch": 11.481233243967829, + "grad_norm": 52.88304901123047, + "learning_rate": 2.290333938937795e-05, + "loss": 0.1376, + "num_input_tokens_seen": 44721776, + "step": 77085 + }, + { + "epoch": 11.481977956508787, + "grad_norm": 69.7605209350586, + "learning_rate": 2.2900101433335704e-05, + "loss": 0.0491, + "num_input_tokens_seen": 44724560, + "step": 77090 + }, + { + "epoch": 11.482722669049746, + "grad_norm": 0.0013608152512460947, + "learning_rate": 2.289686351276911e-05, + "loss": 0.0312, + "num_input_tokens_seen": 44727312, + "step": 77095 + }, + { + "epoch": 11.483467381590707, + "grad_norm": 0.0059738559648394585, + "learning_rate": 2.2893625627732877e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44730128, + "step": 77100 + }, + { + "epoch": 11.484212094131665, + "grad_norm": 0.002222377108410001, + "learning_rate": 2.2890387778281686e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44733040, + "step": 77105 + }, + { + "epoch": 11.484956806672624, + "grad_norm": 0.002774893306195736, + "learning_rate": 2.2887149964470258e-05, + "loss": 0.101, + "num_input_tokens_seen": 44735984, + "step": 77110 + }, + { + "epoch": 11.485701519213583, + "grad_norm": 0.0275233406573534, + "learning_rate": 2.2883912186353282e-05, + "loss": 0.001, + "num_input_tokens_seen": 44738672, + "step": 77115 + }, + { + "epoch": 11.486446231754543, + "grad_norm": 24.200838088989258, + "learning_rate": 2.288067444398546e-05, + "loss": 0.2127, + "num_input_tokens_seen": 44741648, + "step": 77120 + }, + { + "epoch": 11.487190944295502, + "grad_norm": 0.03279358521103859, + "learning_rate": 2.2877436737421494e-05, + "loss": 0.0525, + "num_input_tokens_seen": 44744432, + "step": 77125 + }, + { + "epoch": 11.487935656836461, + "grad_norm": 1.1832369565963745, + "learning_rate": 2.287419906671606e-05, + "loss": 0.1092, + "num_input_tokens_seen": 44747280, + "step": 77130 + }, + { + "epoch": 11.48868036937742, + "grad_norm": 0.018571147695183754, + "learning_rate": 2.287096143192389e-05, + "loss": 0.0708, + "num_input_tokens_seen": 44750032, + "step": 77135 + }, + { + "epoch": 11.48942508191838, + "grad_norm": 0.0021261400543153286, + "learning_rate": 2.286772383309965e-05, + "loss": 0.2411, + "num_input_tokens_seen": 44752944, + "step": 77140 + }, + { + "epoch": 11.490169794459339, + "grad_norm": 21.63136863708496, + "learning_rate": 2.286448627029806e-05, + "loss": 0.084, + "num_input_tokens_seen": 44756016, + "step": 77145 + }, + { + "epoch": 11.490914507000298, + "grad_norm": 1.0715131759643555, + "learning_rate": 2.2861248743573794e-05, + "loss": 0.0928, + "num_input_tokens_seen": 44758768, + "step": 77150 + }, + { + "epoch": 11.491659219541257, + "grad_norm": 1.1485161781311035, + "learning_rate": 2.2858011252981566e-05, + "loss": 0.0065, + "num_input_tokens_seen": 44761552, + "step": 77155 + }, + { + "epoch": 11.492403932082217, + "grad_norm": 9.440690040588379, + "learning_rate": 2.285477379857605e-05, + "loss": 0.0026, + "num_input_tokens_seen": 44764624, + "step": 77160 + }, + { + "epoch": 11.493148644623176, + "grad_norm": 0.1445510983467102, + "learning_rate": 2.2851536380411958e-05, + "loss": 0.116, + "num_input_tokens_seen": 44768112, + "step": 77165 + }, + { + "epoch": 11.493893357164135, + "grad_norm": 0.0722789540886879, + "learning_rate": 2.284829899854398e-05, + "loss": 0.2005, + "num_input_tokens_seen": 44770800, + "step": 77170 + }, + { + "epoch": 11.494638069705093, + "grad_norm": 0.0015630522975698113, + "learning_rate": 2.284506165302679e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44773808, + "step": 77175 + }, + { + "epoch": 11.495382782246054, + "grad_norm": 0.1586277335882187, + "learning_rate": 2.2841824343915103e-05, + "loss": 0.0378, + "num_input_tokens_seen": 44776464, + "step": 77180 + }, + { + "epoch": 11.496127494787013, + "grad_norm": 0.0012943230103701353, + "learning_rate": 2.28385870712636e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44779216, + "step": 77185 + }, + { + "epoch": 11.496872207327971, + "grad_norm": 0.0427863746881485, + "learning_rate": 2.283534983512697e-05, + "loss": 0.0158, + "num_input_tokens_seen": 44781968, + "step": 77190 + }, + { + "epoch": 11.49761691986893, + "grad_norm": 0.0023325702641159296, + "learning_rate": 2.2832112635559897e-05, + "loss": 0.0021, + "num_input_tokens_seen": 44784816, + "step": 77195 + }, + { + "epoch": 11.498361632409889, + "grad_norm": 24.522750854492188, + "learning_rate": 2.282887547261709e-05, + "loss": 0.0976, + "num_input_tokens_seen": 44787952, + "step": 77200 + }, + { + "epoch": 11.49910634495085, + "grad_norm": 0.00023291828983929008, + "learning_rate": 2.2825638346353223e-05, + "loss": 0.0006, + "num_input_tokens_seen": 44791184, + "step": 77205 + }, + { + "epoch": 11.499851057491808, + "grad_norm": 91.54508209228516, + "learning_rate": 2.2822401256822974e-05, + "loss": 0.0453, + "num_input_tokens_seen": 44794160, + "step": 77210 + }, + { + "epoch": 11.500595770032767, + "grad_norm": 0.0010726226028054953, + "learning_rate": 2.2819164204081057e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44796784, + "step": 77215 + }, + { + "epoch": 11.501340482573726, + "grad_norm": 0.0009500583983026445, + "learning_rate": 2.281592718818214e-05, + "loss": 0.183, + "num_input_tokens_seen": 44799600, + "step": 77220 + }, + { + "epoch": 11.502085195114686, + "grad_norm": 0.022356398403644562, + "learning_rate": 2.2812690209180914e-05, + "loss": 0.0545, + "num_input_tokens_seen": 44802256, + "step": 77225 + }, + { + "epoch": 11.502829907655645, + "grad_norm": 2.806716203689575, + "learning_rate": 2.2809453267132054e-05, + "loss": 0.0417, + "num_input_tokens_seen": 44805136, + "step": 77230 + }, + { + "epoch": 11.503574620196604, + "grad_norm": 0.008639559149742126, + "learning_rate": 2.2806216362090267e-05, + "loss": 0.0015, + "num_input_tokens_seen": 44807952, + "step": 77235 + }, + { + "epoch": 11.504319332737563, + "grad_norm": 0.0075584109872579575, + "learning_rate": 2.2802979494110213e-05, + "loss": 0.2708, + "num_input_tokens_seen": 44810992, + "step": 77240 + }, + { + "epoch": 11.505064045278523, + "grad_norm": 0.0014223976759240031, + "learning_rate": 2.27997426632466e-05, + "loss": 0.0104, + "num_input_tokens_seen": 44814064, + "step": 77245 + }, + { + "epoch": 11.505808757819482, + "grad_norm": 0.015115912072360516, + "learning_rate": 2.2796505869554098e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44816976, + "step": 77250 + }, + { + "epoch": 11.50655347036044, + "grad_norm": 0.0023261355236172676, + "learning_rate": 2.2793269113087385e-05, + "loss": 0.2163, + "num_input_tokens_seen": 44819856, + "step": 77255 + }, + { + "epoch": 11.5072981829014, + "grad_norm": 0.004346985835582018, + "learning_rate": 2.279003239390115e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44822608, + "step": 77260 + }, + { + "epoch": 11.50804289544236, + "grad_norm": 73.13685607910156, + "learning_rate": 2.2786795712050065e-05, + "loss": 0.4532, + "num_input_tokens_seen": 44825360, + "step": 77265 + }, + { + "epoch": 11.508787607983319, + "grad_norm": 2.240025758743286, + "learning_rate": 2.2783559067588822e-05, + "loss": 0.0016, + "num_input_tokens_seen": 44828080, + "step": 77270 + }, + { + "epoch": 11.509532320524277, + "grad_norm": 0.0005102552240714431, + "learning_rate": 2.278032246057209e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44831120, + "step": 77275 + }, + { + "epoch": 11.510277033065236, + "grad_norm": 0.002123373793438077, + "learning_rate": 2.2777085891054566e-05, + "loss": 0.012, + "num_input_tokens_seen": 44834160, + "step": 77280 + }, + { + "epoch": 11.511021745606197, + "grad_norm": 0.0035578752867877483, + "learning_rate": 2.277384935909091e-05, + "loss": 0.4287, + "num_input_tokens_seen": 44836912, + "step": 77285 + }, + { + "epoch": 11.511766458147155, + "grad_norm": 0.008454577066004276, + "learning_rate": 2.277061286473581e-05, + "loss": 0.0013, + "num_input_tokens_seen": 44839856, + "step": 77290 + }, + { + "epoch": 11.512511170688114, + "grad_norm": 0.012392417527735233, + "learning_rate": 2.2767376408043935e-05, + "loss": 0.2096, + "num_input_tokens_seen": 44842768, + "step": 77295 + }, + { + "epoch": 11.513255883229073, + "grad_norm": 0.005254592280834913, + "learning_rate": 2.2764139989069962e-05, + "loss": 0.1657, + "num_input_tokens_seen": 44845392, + "step": 77300 + }, + { + "epoch": 11.514000595770034, + "grad_norm": 0.0026620542630553246, + "learning_rate": 2.276090360786858e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44848432, + "step": 77305 + }, + { + "epoch": 11.514745308310992, + "grad_norm": 0.018399693071842194, + "learning_rate": 2.2757667264494448e-05, + "loss": 0.0037, + "num_input_tokens_seen": 44851152, + "step": 77310 + }, + { + "epoch": 11.515490020851951, + "grad_norm": 0.00922385323792696, + "learning_rate": 2.275443095900226e-05, + "loss": 0.0763, + "num_input_tokens_seen": 44854224, + "step": 77315 + }, + { + "epoch": 11.51623473339291, + "grad_norm": 0.01935805380344391, + "learning_rate": 2.2751194691446666e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44857136, + "step": 77320 + }, + { + "epoch": 11.51697944593387, + "grad_norm": 0.00034066769876517355, + "learning_rate": 2.2747958461882365e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44860048, + "step": 77325 + }, + { + "epoch": 11.517724158474829, + "grad_norm": 0.03315957635641098, + "learning_rate": 2.2744722270364012e-05, + "loss": 0.0829, + "num_input_tokens_seen": 44862960, + "step": 77330 + }, + { + "epoch": 11.518468871015788, + "grad_norm": 0.011890720576047897, + "learning_rate": 2.274148611694628e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44865904, + "step": 77335 + }, + { + "epoch": 11.519213583556747, + "grad_norm": 0.004792372230440378, + "learning_rate": 2.2738250001683846e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44868816, + "step": 77340 + }, + { + "epoch": 11.519958296097705, + "grad_norm": 15.021851539611816, + "learning_rate": 2.2735013924631378e-05, + "loss": 0.025, + "num_input_tokens_seen": 44871984, + "step": 77345 + }, + { + "epoch": 11.520703008638666, + "grad_norm": 0.0175413116812706, + "learning_rate": 2.273177788584355e-05, + "loss": 0.3659, + "num_input_tokens_seen": 44875088, + "step": 77350 + }, + { + "epoch": 11.521447721179625, + "grad_norm": 110.78495025634766, + "learning_rate": 2.272854188537503e-05, + "loss": 0.2792, + "num_input_tokens_seen": 44877872, + "step": 77355 + }, + { + "epoch": 11.522192433720583, + "grad_norm": 0.010309564881026745, + "learning_rate": 2.272530592328049e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44880912, + "step": 77360 + }, + { + "epoch": 11.522937146261544, + "grad_norm": 0.331079363822937, + "learning_rate": 2.272206999961459e-05, + "loss": 0.1673, + "num_input_tokens_seen": 44883696, + "step": 77365 + }, + { + "epoch": 11.523681858802503, + "grad_norm": 0.033587340265512466, + "learning_rate": 2.2718834114432e-05, + "loss": 0.0198, + "num_input_tokens_seen": 44886352, + "step": 77370 + }, + { + "epoch": 11.524426571343461, + "grad_norm": 0.005871076602488756, + "learning_rate": 2.2715598267787394e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44888912, + "step": 77375 + }, + { + "epoch": 11.52517128388442, + "grad_norm": 0.0010787725914269686, + "learning_rate": 2.2712362459735425e-05, + "loss": 0.0108, + "num_input_tokens_seen": 44891632, + "step": 77380 + }, + { + "epoch": 11.525915996425379, + "grad_norm": 0.6855552792549133, + "learning_rate": 2.2709126690330778e-05, + "loss": 0.0029, + "num_input_tokens_seen": 44894608, + "step": 77385 + }, + { + "epoch": 11.52666070896634, + "grad_norm": 0.005980890244245529, + "learning_rate": 2.27058909596281e-05, + "loss": 0.1129, + "num_input_tokens_seen": 44897744, + "step": 77390 + }, + { + "epoch": 11.527405421507298, + "grad_norm": 0.014782676473259926, + "learning_rate": 2.2702655267682068e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44900656, + "step": 77395 + }, + { + "epoch": 11.528150134048257, + "grad_norm": 0.008718930184841156, + "learning_rate": 2.2699419614547333e-05, + "loss": 0.0114, + "num_input_tokens_seen": 44903600, + "step": 77400 + }, + { + "epoch": 11.528894846589216, + "grad_norm": 0.0009656331385485828, + "learning_rate": 2.2696184000278573e-05, + "loss": 0.0617, + "num_input_tokens_seen": 44906384, + "step": 77405 + }, + { + "epoch": 11.529639559130176, + "grad_norm": 0.004850190132856369, + "learning_rate": 2.2692948424930445e-05, + "loss": 0.0618, + "num_input_tokens_seen": 44909328, + "step": 77410 + }, + { + "epoch": 11.530384271671135, + "grad_norm": 0.009004710242152214, + "learning_rate": 2.2689712888557603e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44912624, + "step": 77415 + }, + { + "epoch": 11.531128984212094, + "grad_norm": 0.0017307234229519963, + "learning_rate": 2.268647739121471e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44915760, + "step": 77420 + }, + { + "epoch": 11.531873696753053, + "grad_norm": 0.0020595313981175423, + "learning_rate": 2.2683241932956432e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44918704, + "step": 77425 + }, + { + "epoch": 11.532618409294013, + "grad_norm": 0.003931982442736626, + "learning_rate": 2.2680006513837436e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44921456, + "step": 77430 + }, + { + "epoch": 11.533363121834972, + "grad_norm": 0.004703074228018522, + "learning_rate": 2.2676771133912355e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44924592, + "step": 77435 + }, + { + "epoch": 11.53410783437593, + "grad_norm": 0.025487273931503296, + "learning_rate": 2.2673535793235877e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44927344, + "step": 77440 + }, + { + "epoch": 11.53485254691689, + "grad_norm": 0.005228559020906687, + "learning_rate": 2.2670300491862646e-05, + "loss": 0.0135, + "num_input_tokens_seen": 44930640, + "step": 77445 + }, + { + "epoch": 11.53559725945785, + "grad_norm": 0.002614476950839162, + "learning_rate": 2.2667065229847323e-05, + "loss": 0.0441, + "num_input_tokens_seen": 44933328, + "step": 77450 + }, + { + "epoch": 11.536341971998809, + "grad_norm": 0.012518074363470078, + "learning_rate": 2.266383000724456e-05, + "loss": 0.1336, + "num_input_tokens_seen": 44936560, + "step": 77455 + }, + { + "epoch": 11.537086684539767, + "grad_norm": 0.015136105939745903, + "learning_rate": 2.2660594824109008e-05, + "loss": 0.2582, + "num_input_tokens_seen": 44939248, + "step": 77460 + }, + { + "epoch": 11.537831397080726, + "grad_norm": 0.02030244842171669, + "learning_rate": 2.2657359680495335e-05, + "loss": 0.1691, + "num_input_tokens_seen": 44941808, + "step": 77465 + }, + { + "epoch": 11.538576109621687, + "grad_norm": 0.06462313234806061, + "learning_rate": 2.2654124576458182e-05, + "loss": 0.1038, + "num_input_tokens_seen": 44944400, + "step": 77470 + }, + { + "epoch": 11.539320822162646, + "grad_norm": 0.12681539356708527, + "learning_rate": 2.265088951205222e-05, + "loss": 0.0249, + "num_input_tokens_seen": 44947472, + "step": 77475 + }, + { + "epoch": 11.540065534703604, + "grad_norm": 0.007146382238715887, + "learning_rate": 2.2647654487332086e-05, + "loss": 0.0069, + "num_input_tokens_seen": 44950544, + "step": 77480 + }, + { + "epoch": 11.540810247244563, + "grad_norm": 0.022139713168144226, + "learning_rate": 2.2644419502352444e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44953200, + "step": 77485 + }, + { + "epoch": 11.541554959785524, + "grad_norm": 102.4327163696289, + "learning_rate": 2.264118455716794e-05, + "loss": 0.0775, + "num_input_tokens_seen": 44956176, + "step": 77490 + }, + { + "epoch": 11.542299672326482, + "grad_norm": 0.08433603495359421, + "learning_rate": 2.2637949651833218e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44959440, + "step": 77495 + }, + { + "epoch": 11.543044384867441, + "grad_norm": 0.09444192796945572, + "learning_rate": 2.2634714786402942e-05, + "loss": 0.0247, + "num_input_tokens_seen": 44962384, + "step": 77500 + }, + { + "epoch": 11.5437890974084, + "grad_norm": 0.9371747970581055, + "learning_rate": 2.2631479960931747e-05, + "loss": 0.0715, + "num_input_tokens_seen": 44965360, + "step": 77505 + }, + { + "epoch": 11.54453380994936, + "grad_norm": 0.0021711259614676237, + "learning_rate": 2.26282451754743e-05, + "loss": 0.0001, + "num_input_tokens_seen": 44968336, + "step": 77510 + }, + { + "epoch": 11.54527852249032, + "grad_norm": 0.007169189862906933, + "learning_rate": 2.262501043008524e-05, + "loss": 0.0002, + "num_input_tokens_seen": 44971216, + "step": 77515 + }, + { + "epoch": 11.546023235031278, + "grad_norm": 0.08439982682466507, + "learning_rate": 2.2621775724819218e-05, + "loss": 0.1349, + "num_input_tokens_seen": 44974128, + "step": 77520 + }, + { + "epoch": 11.546767947572237, + "grad_norm": 0.021886907517910004, + "learning_rate": 2.2618541059730862e-05, + "loss": 0.1722, + "num_input_tokens_seen": 44976784, + "step": 77525 + }, + { + "epoch": 11.547512660113195, + "grad_norm": 82.56429290771484, + "learning_rate": 2.2615306434874853e-05, + "loss": 0.1161, + "num_input_tokens_seen": 44980080, + "step": 77530 + }, + { + "epoch": 11.548257372654156, + "grad_norm": 169.84524536132812, + "learning_rate": 2.2612071850305812e-05, + "loss": 0.3254, + "num_input_tokens_seen": 44983024, + "step": 77535 + }, + { + "epoch": 11.549002085195115, + "grad_norm": 0.03994949907064438, + "learning_rate": 2.2608837306078385e-05, + "loss": 0.0009, + "num_input_tokens_seen": 44986096, + "step": 77540 + }, + { + "epoch": 11.549746797736073, + "grad_norm": 0.005016779992729425, + "learning_rate": 2.2605602802247227e-05, + "loss": 0.0006, + "num_input_tokens_seen": 44989008, + "step": 77545 + }, + { + "epoch": 11.550491510277032, + "grad_norm": 0.22645074129104614, + "learning_rate": 2.2602368338866974e-05, + "loss": 0.0004, + "num_input_tokens_seen": 44991568, + "step": 77550 + }, + { + "epoch": 11.551236222817993, + "grad_norm": 0.10757795721292496, + "learning_rate": 2.2599133915992273e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44994576, + "step": 77555 + }, + { + "epoch": 11.551980935358952, + "grad_norm": 0.003552385838702321, + "learning_rate": 2.2595899533677756e-05, + "loss": 0.0003, + "num_input_tokens_seen": 44997488, + "step": 77560 + }, + { + "epoch": 11.55272564789991, + "grad_norm": 7.177794933319092, + "learning_rate": 2.2592665191978085e-05, + "loss": 0.0011, + "num_input_tokens_seen": 45000304, + "step": 77565 + }, + { + "epoch": 11.553470360440869, + "grad_norm": 2.433473825454712, + "learning_rate": 2.2589430890947885e-05, + "loss": 0.1053, + "num_input_tokens_seen": 45002992, + "step": 77570 + }, + { + "epoch": 11.55421507298183, + "grad_norm": 0.0003045848570764065, + "learning_rate": 2.2586196630641792e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45005872, + "step": 77575 + }, + { + "epoch": 11.554959785522788, + "grad_norm": 0.0030315639451146126, + "learning_rate": 2.2582962411114464e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45008464, + "step": 77580 + }, + { + "epoch": 11.555704498063747, + "grad_norm": 0.00490829162299633, + "learning_rate": 2.2579728232420525e-05, + "loss": 0.1534, + "num_input_tokens_seen": 45011408, + "step": 77585 + }, + { + "epoch": 11.556449210604706, + "grad_norm": 0.008754203096032143, + "learning_rate": 2.2576494094614624e-05, + "loss": 0.04, + "num_input_tokens_seen": 45014576, + "step": 77590 + }, + { + "epoch": 11.557193923145666, + "grad_norm": 0.003951366059482098, + "learning_rate": 2.257325999775138e-05, + "loss": 0.1254, + "num_input_tokens_seen": 45017264, + "step": 77595 + }, + { + "epoch": 11.557938635686625, + "grad_norm": 0.000539912492968142, + "learning_rate": 2.257002594188545e-05, + "loss": 0.0028, + "num_input_tokens_seen": 45020176, + "step": 77600 + }, + { + "epoch": 11.558683348227584, + "grad_norm": 0.052956290543079376, + "learning_rate": 2.2566791927071453e-05, + "loss": 0.1256, + "num_input_tokens_seen": 45023088, + "step": 77605 + }, + { + "epoch": 11.559428060768543, + "grad_norm": 0.0024905146565288305, + "learning_rate": 2.2563557953364043e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45026160, + "step": 77610 + }, + { + "epoch": 11.560172773309503, + "grad_norm": 0.027378255501389503, + "learning_rate": 2.256032402081785e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45028912, + "step": 77615 + }, + { + "epoch": 11.560917485850462, + "grad_norm": 11.596074104309082, + "learning_rate": 2.2557090129487493e-05, + "loss": 0.2344, + "num_input_tokens_seen": 45031600, + "step": 77620 + }, + { + "epoch": 11.56166219839142, + "grad_norm": 11.166167259216309, + "learning_rate": 2.2553856279427625e-05, + "loss": 0.1991, + "num_input_tokens_seen": 45034992, + "step": 77625 + }, + { + "epoch": 11.56240691093238, + "grad_norm": 19.696840286254883, + "learning_rate": 2.2550622470692852e-05, + "loss": 0.0098, + "num_input_tokens_seen": 45038224, + "step": 77630 + }, + { + "epoch": 11.56315162347334, + "grad_norm": 0.008820266462862492, + "learning_rate": 2.2547388703337837e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45041264, + "step": 77635 + }, + { + "epoch": 11.563896336014299, + "grad_norm": 0.010604485869407654, + "learning_rate": 2.2544154977417187e-05, + "loss": 0.2036, + "num_input_tokens_seen": 45044048, + "step": 77640 + }, + { + "epoch": 11.564641048555258, + "grad_norm": 0.16712944209575653, + "learning_rate": 2.2540921292985553e-05, + "loss": 0.0005, + "num_input_tokens_seen": 45047024, + "step": 77645 + }, + { + "epoch": 11.565385761096216, + "grad_norm": 1.6268619298934937, + "learning_rate": 2.2537687650097554e-05, + "loss": 0.2886, + "num_input_tokens_seen": 45050064, + "step": 77650 + }, + { + "epoch": 11.566130473637177, + "grad_norm": 0.002469574101269245, + "learning_rate": 2.2534454048807814e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45052976, + "step": 77655 + }, + { + "epoch": 11.566875186178136, + "grad_norm": 1.3862754106521606, + "learning_rate": 2.2531220489170977e-05, + "loss": 0.0009, + "num_input_tokens_seen": 45055504, + "step": 77660 + }, + { + "epoch": 11.567619898719094, + "grad_norm": 0.0030925683677196503, + "learning_rate": 2.2527986971241642e-05, + "loss": 0.13, + "num_input_tokens_seen": 45058288, + "step": 77665 + }, + { + "epoch": 11.568364611260053, + "grad_norm": 5.134763717651367, + "learning_rate": 2.252475349507447e-05, + "loss": 0.1804, + "num_input_tokens_seen": 45061008, + "step": 77670 + }, + { + "epoch": 11.569109323801012, + "grad_norm": 30.985633850097656, + "learning_rate": 2.2521520060724062e-05, + "loss": 0.0521, + "num_input_tokens_seen": 45063792, + "step": 77675 + }, + { + "epoch": 11.569854036341972, + "grad_norm": 0.12533146142959595, + "learning_rate": 2.251828666824506e-05, + "loss": 0.0006, + "num_input_tokens_seen": 45066736, + "step": 77680 + }, + { + "epoch": 11.570598748882931, + "grad_norm": 0.006946980021893978, + "learning_rate": 2.2515053317692082e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45069840, + "step": 77685 + }, + { + "epoch": 11.57134346142389, + "grad_norm": 0.0021636560559272766, + "learning_rate": 2.2511820009119755e-05, + "loss": 0.2948, + "num_input_tokens_seen": 45072688, + "step": 77690 + }, + { + "epoch": 11.57208817396485, + "grad_norm": 0.08451499044895172, + "learning_rate": 2.25085867425827e-05, + "loss": 0.0032, + "num_input_tokens_seen": 45075440, + "step": 77695 + }, + { + "epoch": 11.57283288650581, + "grad_norm": 0.04371413588523865, + "learning_rate": 2.2505353518135534e-05, + "loss": 0.0021, + "num_input_tokens_seen": 45078192, + "step": 77700 + }, + { + "epoch": 11.573577599046768, + "grad_norm": 0.4898952543735504, + "learning_rate": 2.250212033583289e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45080720, + "step": 77705 + }, + { + "epoch": 11.574322311587727, + "grad_norm": 0.04236092418432236, + "learning_rate": 2.2498887195729375e-05, + "loss": 0.0006, + "num_input_tokens_seen": 45083568, + "step": 77710 + }, + { + "epoch": 11.575067024128685, + "grad_norm": 0.08591322600841522, + "learning_rate": 2.2495654097879627e-05, + "loss": 0.0007, + "num_input_tokens_seen": 45086480, + "step": 77715 + }, + { + "epoch": 11.575811736669646, + "grad_norm": 0.0785682275891304, + "learning_rate": 2.2492421042338257e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45089392, + "step": 77720 + }, + { + "epoch": 11.576556449210605, + "grad_norm": 0.020364878699183464, + "learning_rate": 2.2489188029159887e-05, + "loss": 0.0007, + "num_input_tokens_seen": 45092272, + "step": 77725 + }, + { + "epoch": 11.577301161751564, + "grad_norm": 0.01554103847593069, + "learning_rate": 2.2485955058399135e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45095248, + "step": 77730 + }, + { + "epoch": 11.578045874292522, + "grad_norm": 0.0897517129778862, + "learning_rate": 2.2482722130110608e-05, + "loss": 0.0029, + "num_input_tokens_seen": 45098064, + "step": 77735 + }, + { + "epoch": 11.578790586833483, + "grad_norm": 0.05693129077553749, + "learning_rate": 2.2479489244348938e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45101104, + "step": 77740 + }, + { + "epoch": 11.579535299374442, + "grad_norm": 36.727928161621094, + "learning_rate": 2.2476256401168736e-05, + "loss": 0.246, + "num_input_tokens_seen": 45104272, + "step": 77745 + }, + { + "epoch": 11.5802800119154, + "grad_norm": 0.003149133175611496, + "learning_rate": 2.247302360062461e-05, + "loss": 0.0005, + "num_input_tokens_seen": 45107056, + "step": 77750 + }, + { + "epoch": 11.581024724456359, + "grad_norm": 0.6461102366447449, + "learning_rate": 2.246979084277119e-05, + "loss": 0.0012, + "num_input_tokens_seen": 45109872, + "step": 77755 + }, + { + "epoch": 11.58176943699732, + "grad_norm": 0.0703468918800354, + "learning_rate": 2.2466558127663086e-05, + "loss": 0.0948, + "num_input_tokens_seen": 45112752, + "step": 77760 + }, + { + "epoch": 11.582514149538278, + "grad_norm": 206.4024658203125, + "learning_rate": 2.246332545535489e-05, + "loss": 0.1598, + "num_input_tokens_seen": 45115440, + "step": 77765 + }, + { + "epoch": 11.583258862079237, + "grad_norm": 0.013846314512193203, + "learning_rate": 2.246009282590125e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45118288, + "step": 77770 + }, + { + "epoch": 11.584003574620196, + "grad_norm": 17.330968856811523, + "learning_rate": 2.2456860239356755e-05, + "loss": 0.1784, + "num_input_tokens_seen": 45121008, + "step": 77775 + }, + { + "epoch": 11.584748287161156, + "grad_norm": 0.0009498507133685052, + "learning_rate": 2.245362769577602e-05, + "loss": 0.0229, + "num_input_tokens_seen": 45123760, + "step": 77780 + }, + { + "epoch": 11.585492999702115, + "grad_norm": 0.0006641821819357574, + "learning_rate": 2.245039519521366e-05, + "loss": 0.0036, + "num_input_tokens_seen": 45126800, + "step": 77785 + }, + { + "epoch": 11.586237712243074, + "grad_norm": 0.14353665709495544, + "learning_rate": 2.2447162737724274e-05, + "loss": 0.2658, + "num_input_tokens_seen": 45129712, + "step": 77790 + }, + { + "epoch": 11.586982424784033, + "grad_norm": 0.00031991361174732447, + "learning_rate": 2.2443930323362487e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45132720, + "step": 77795 + }, + { + "epoch": 11.587727137324993, + "grad_norm": 86.67530822753906, + "learning_rate": 2.244069795218289e-05, + "loss": 0.0507, + "num_input_tokens_seen": 45135408, + "step": 77800 + }, + { + "epoch": 11.588471849865952, + "grad_norm": 0.04201774671673775, + "learning_rate": 2.243746562424011e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45138000, + "step": 77805 + }, + { + "epoch": 11.58921656240691, + "grad_norm": 0.012649801559746265, + "learning_rate": 2.2434233339588746e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45141040, + "step": 77810 + }, + { + "epoch": 11.58996127494787, + "grad_norm": 0.016455503180623055, + "learning_rate": 2.2431001098283393e-05, + "loss": 0.0418, + "num_input_tokens_seen": 45143792, + "step": 77815 + }, + { + "epoch": 11.59070598748883, + "grad_norm": 0.03623479604721069, + "learning_rate": 2.2427768900378674e-05, + "loss": 0.0294, + "num_input_tokens_seen": 45146672, + "step": 77820 + }, + { + "epoch": 11.591450700029789, + "grad_norm": 9.481009483337402, + "learning_rate": 2.2424536745929174e-05, + "loss": 0.0023, + "num_input_tokens_seen": 45149328, + "step": 77825 + }, + { + "epoch": 11.592195412570748, + "grad_norm": 2.1295464038848877, + "learning_rate": 2.2421304634989517e-05, + "loss": 0.1495, + "num_input_tokens_seen": 45152144, + "step": 77830 + }, + { + "epoch": 11.592940125111706, + "grad_norm": 0.004466530866920948, + "learning_rate": 2.2418072567614286e-05, + "loss": 0.0136, + "num_input_tokens_seen": 45154928, + "step": 77835 + }, + { + "epoch": 11.593684837652667, + "grad_norm": 0.12145301699638367, + "learning_rate": 2.241484054385811e-05, + "loss": 0.0007, + "num_input_tokens_seen": 45157744, + "step": 77840 + }, + { + "epoch": 11.594429550193626, + "grad_norm": 0.0135838333517313, + "learning_rate": 2.2411608563775564e-05, + "loss": 0.1452, + "num_input_tokens_seen": 45160848, + "step": 77845 + }, + { + "epoch": 11.595174262734584, + "grad_norm": 0.05115600302815437, + "learning_rate": 2.240837662742127e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45163728, + "step": 77850 + }, + { + "epoch": 11.595918975275543, + "grad_norm": 0.0476691909134388, + "learning_rate": 2.240514473484982e-05, + "loss": 0.2632, + "num_input_tokens_seen": 45166672, + "step": 77855 + }, + { + "epoch": 11.596663687816502, + "grad_norm": 0.05265723168849945, + "learning_rate": 2.24019128861158e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45169744, + "step": 77860 + }, + { + "epoch": 11.597408400357462, + "grad_norm": 0.008587592281401157, + "learning_rate": 2.2398681081273832e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45172368, + "step": 77865 + }, + { + "epoch": 11.598153112898421, + "grad_norm": 0.039985671639442444, + "learning_rate": 2.239544932037849e-05, + "loss": 0.2058, + "num_input_tokens_seen": 45175472, + "step": 77870 + }, + { + "epoch": 11.59889782543938, + "grad_norm": 0.0038560277316719294, + "learning_rate": 2.2392217603484397e-05, + "loss": 0.2016, + "num_input_tokens_seen": 45178480, + "step": 77875 + }, + { + "epoch": 11.59964253798034, + "grad_norm": 0.003931943792849779, + "learning_rate": 2.2388985930646135e-05, + "loss": 0.025, + "num_input_tokens_seen": 45181328, + "step": 77880 + }, + { + "epoch": 11.6003872505213, + "grad_norm": 0.006443815305829048, + "learning_rate": 2.2385754301918303e-05, + "loss": 0.1197, + "num_input_tokens_seen": 45184496, + "step": 77885 + }, + { + "epoch": 11.601131963062258, + "grad_norm": 0.723632276058197, + "learning_rate": 2.2382522717355498e-05, + "loss": 0.105, + "num_input_tokens_seen": 45187248, + "step": 77890 + }, + { + "epoch": 11.601876675603217, + "grad_norm": 15.489468574523926, + "learning_rate": 2.2379291177012295e-05, + "loss": 0.089, + "num_input_tokens_seen": 45190128, + "step": 77895 + }, + { + "epoch": 11.602621388144176, + "grad_norm": 0.007373988628387451, + "learning_rate": 2.2376059680943324e-05, + "loss": 0.3602, + "num_input_tokens_seen": 45193104, + "step": 77900 + }, + { + "epoch": 11.603366100685136, + "grad_norm": 8.271793365478516, + "learning_rate": 2.237282822920314e-05, + "loss": 0.0019, + "num_input_tokens_seen": 45196112, + "step": 77905 + }, + { + "epoch": 11.604110813226095, + "grad_norm": 0.0018643415533006191, + "learning_rate": 2.2369596821846367e-05, + "loss": 0.1131, + "num_input_tokens_seen": 45198992, + "step": 77910 + }, + { + "epoch": 11.604855525767054, + "grad_norm": 0.006026910152286291, + "learning_rate": 2.2366365458927574e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45202352, + "step": 77915 + }, + { + "epoch": 11.605600238308012, + "grad_norm": 0.6234559416770935, + "learning_rate": 2.236313414050137e-05, + "loss": 0.2596, + "num_input_tokens_seen": 45205424, + "step": 77920 + }, + { + "epoch": 11.606344950848973, + "grad_norm": 0.7913302183151245, + "learning_rate": 2.2359902866622317e-05, + "loss": 0.0092, + "num_input_tokens_seen": 45208240, + "step": 77925 + }, + { + "epoch": 11.607089663389932, + "grad_norm": 0.0021225139498710632, + "learning_rate": 2.2356671637345038e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45211088, + "step": 77930 + }, + { + "epoch": 11.60783437593089, + "grad_norm": 0.07044409960508347, + "learning_rate": 2.2353440452724102e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45214224, + "step": 77935 + }, + { + "epoch": 11.60857908847185, + "grad_norm": 0.007600768469274044, + "learning_rate": 2.235020931281409e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45217296, + "step": 77940 + }, + { + "epoch": 11.60932380101281, + "grad_norm": 0.00040644194814376533, + "learning_rate": 2.2346978217669613e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45220304, + "step": 77945 + }, + { + "epoch": 11.610068513553768, + "grad_norm": 0.03254257142543793, + "learning_rate": 2.2343747167345233e-05, + "loss": 0.0107, + "num_input_tokens_seen": 45223184, + "step": 77950 + }, + { + "epoch": 11.610813226094727, + "grad_norm": 0.010520408861339092, + "learning_rate": 2.2340516161895553e-05, + "loss": 0.0472, + "num_input_tokens_seen": 45226480, + "step": 77955 + }, + { + "epoch": 11.611557938635686, + "grad_norm": 0.41417795419692993, + "learning_rate": 2.2337285201375137e-05, + "loss": 0.1054, + "num_input_tokens_seen": 45229360, + "step": 77960 + }, + { + "epoch": 11.612302651176647, + "grad_norm": 0.8441044688224792, + "learning_rate": 2.233405428583859e-05, + "loss": 0.0025, + "num_input_tokens_seen": 45231920, + "step": 77965 + }, + { + "epoch": 11.613047363717605, + "grad_norm": 0.015673715621232986, + "learning_rate": 2.233082341534049e-05, + "loss": 0.0851, + "num_input_tokens_seen": 45234768, + "step": 77970 + }, + { + "epoch": 11.613792076258564, + "grad_norm": 2.111879825592041, + "learning_rate": 2.2327592589935403e-05, + "loss": 0.0021, + "num_input_tokens_seen": 45237712, + "step": 77975 + }, + { + "epoch": 11.614536788799523, + "grad_norm": 0.05018904060125351, + "learning_rate": 2.2324361809677933e-05, + "loss": 0.0005, + "num_input_tokens_seen": 45240560, + "step": 77980 + }, + { + "epoch": 11.615281501340483, + "grad_norm": 0.4559842348098755, + "learning_rate": 2.2321131074622647e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45243536, + "step": 77985 + }, + { + "epoch": 11.616026213881442, + "grad_norm": 0.02877824194729328, + "learning_rate": 2.2317900384824132e-05, + "loss": 0.1565, + "num_input_tokens_seen": 45246224, + "step": 77990 + }, + { + "epoch": 11.6167709264224, + "grad_norm": 44.59413146972656, + "learning_rate": 2.2314669740336957e-05, + "loss": 0.0064, + "num_input_tokens_seen": 45249360, + "step": 77995 + }, + { + "epoch": 11.61751563896336, + "grad_norm": 16.798616409301758, + "learning_rate": 2.2311439141215715e-05, + "loss": 0.1231, + "num_input_tokens_seen": 45252368, + "step": 78000 + }, + { + "epoch": 11.61826035150432, + "grad_norm": 0.03852533549070358, + "learning_rate": 2.2308208587514967e-05, + "loss": 0.0143, + "num_input_tokens_seen": 45255248, + "step": 78005 + }, + { + "epoch": 11.619005064045279, + "grad_norm": 13.854122161865234, + "learning_rate": 2.230497807928931e-05, + "loss": 0.2243, + "num_input_tokens_seen": 45258160, + "step": 78010 + }, + { + "epoch": 11.619749776586238, + "grad_norm": 5.886069297790527, + "learning_rate": 2.2301747616593306e-05, + "loss": 0.0752, + "num_input_tokens_seen": 45261008, + "step": 78015 + }, + { + "epoch": 11.620494489127196, + "grad_norm": 0.012844855897128582, + "learning_rate": 2.2298517199481534e-05, + "loss": 0.1655, + "num_input_tokens_seen": 45264144, + "step": 78020 + }, + { + "epoch": 11.621239201668157, + "grad_norm": 0.004201974719762802, + "learning_rate": 2.2295286828008572e-05, + "loss": 0.005, + "num_input_tokens_seen": 45266864, + "step": 78025 + }, + { + "epoch": 11.621983914209116, + "grad_norm": 0.010491969995200634, + "learning_rate": 2.2292056502228975e-05, + "loss": 0.0107, + "num_input_tokens_seen": 45269680, + "step": 78030 + }, + { + "epoch": 11.622728626750074, + "grad_norm": 0.20106929540634155, + "learning_rate": 2.2288826222197346e-05, + "loss": 0.0295, + "num_input_tokens_seen": 45272944, + "step": 78035 + }, + { + "epoch": 11.623473339291033, + "grad_norm": 2.7335925102233887, + "learning_rate": 2.228559598796823e-05, + "loss": 0.0009, + "num_input_tokens_seen": 45276208, + "step": 78040 + }, + { + "epoch": 11.624218051831992, + "grad_norm": 0.01886676996946335, + "learning_rate": 2.2282365799596222e-05, + "loss": 0.0008, + "num_input_tokens_seen": 45279120, + "step": 78045 + }, + { + "epoch": 11.624962764372953, + "grad_norm": 18.610729217529297, + "learning_rate": 2.2279135657135876e-05, + "loss": 0.1679, + "num_input_tokens_seen": 45281968, + "step": 78050 + }, + { + "epoch": 11.625707476913911, + "grad_norm": 22.303991317749023, + "learning_rate": 2.2275905560641775e-05, + "loss": 0.1994, + "num_input_tokens_seen": 45285072, + "step": 78055 + }, + { + "epoch": 11.62645218945487, + "grad_norm": 3.711195945739746, + "learning_rate": 2.2272675510168482e-05, + "loss": 0.0703, + "num_input_tokens_seen": 45288304, + "step": 78060 + }, + { + "epoch": 11.627196901995829, + "grad_norm": 0.051099758595228195, + "learning_rate": 2.226944550577055e-05, + "loss": 0.0005, + "num_input_tokens_seen": 45291120, + "step": 78065 + }, + { + "epoch": 11.62794161453679, + "grad_norm": 0.013987711630761623, + "learning_rate": 2.2266215547502573e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45293776, + "step": 78070 + }, + { + "epoch": 11.628686327077748, + "grad_norm": 0.5710164308547974, + "learning_rate": 2.22629856354191e-05, + "loss": 0.18, + "num_input_tokens_seen": 45296624, + "step": 78075 + }, + { + "epoch": 11.629431039618707, + "grad_norm": 0.0017921162070706487, + "learning_rate": 2.22597557695747e-05, + "loss": 0.0008, + "num_input_tokens_seen": 45299376, + "step": 78080 + }, + { + "epoch": 11.630175752159666, + "grad_norm": 3.334632396697998, + "learning_rate": 2.225652595002395e-05, + "loss": 0.0006, + "num_input_tokens_seen": 45302416, + "step": 78085 + }, + { + "epoch": 11.630920464700626, + "grad_norm": 0.0009628038969822228, + "learning_rate": 2.2253296176821402e-05, + "loss": 0.0008, + "num_input_tokens_seen": 45305392, + "step": 78090 + }, + { + "epoch": 11.631665177241585, + "grad_norm": 0.03604698181152344, + "learning_rate": 2.2250066450021628e-05, + "loss": 0.0031, + "num_input_tokens_seen": 45308432, + "step": 78095 + }, + { + "epoch": 11.632409889782544, + "grad_norm": 5.335107326507568, + "learning_rate": 2.2246836769679175e-05, + "loss": 0.2465, + "num_input_tokens_seen": 45311408, + "step": 78100 + }, + { + "epoch": 11.633154602323502, + "grad_norm": 0.005955581087619066, + "learning_rate": 2.2243607135848625e-05, + "loss": 0.1694, + "num_input_tokens_seen": 45314192, + "step": 78105 + }, + { + "epoch": 11.633899314864463, + "grad_norm": 0.0035452682059258223, + "learning_rate": 2.2240377548584532e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45317168, + "step": 78110 + }, + { + "epoch": 11.634644027405422, + "grad_norm": 0.7719079852104187, + "learning_rate": 2.2237148007941455e-05, + "loss": 0.0029, + "num_input_tokens_seen": 45319856, + "step": 78115 + }, + { + "epoch": 11.63538873994638, + "grad_norm": 0.004728150554001331, + "learning_rate": 2.2233918513973944e-05, + "loss": 0.1284, + "num_input_tokens_seen": 45322576, + "step": 78120 + }, + { + "epoch": 11.63613345248734, + "grad_norm": 0.0029414084274321795, + "learning_rate": 2.223068906673658e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45325264, + "step": 78125 + }, + { + "epoch": 11.6368781650283, + "grad_norm": 0.008958986029028893, + "learning_rate": 2.22274596662839e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45328048, + "step": 78130 + }, + { + "epoch": 11.637622877569259, + "grad_norm": 0.003952785395085812, + "learning_rate": 2.222423031267048e-05, + "loss": 0.0228, + "num_input_tokens_seen": 45331344, + "step": 78135 + }, + { + "epoch": 11.638367590110217, + "grad_norm": 0.012179592624306679, + "learning_rate": 2.222100100595087e-05, + "loss": 0.0081, + "num_input_tokens_seen": 45334256, + "step": 78140 + }, + { + "epoch": 11.639112302651176, + "grad_norm": 0.008096622303128242, + "learning_rate": 2.221777174617962e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45337072, + "step": 78145 + }, + { + "epoch": 11.639857015192137, + "grad_norm": 0.001747169648297131, + "learning_rate": 2.221454253341129e-05, + "loss": 0.1385, + "num_input_tokens_seen": 45339696, + "step": 78150 + }, + { + "epoch": 11.640601727733095, + "grad_norm": 0.0038306121714413166, + "learning_rate": 2.2211313367700422e-05, + "loss": 0.2723, + "num_input_tokens_seen": 45342544, + "step": 78155 + }, + { + "epoch": 11.641346440274054, + "grad_norm": 0.00887396652251482, + "learning_rate": 2.2208084249101593e-05, + "loss": 0.016, + "num_input_tokens_seen": 45345360, + "step": 78160 + }, + { + "epoch": 11.642091152815013, + "grad_norm": 0.004473034758120775, + "learning_rate": 2.220485517766933e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45348016, + "step": 78165 + }, + { + "epoch": 11.642835865355973, + "grad_norm": 0.10025876015424728, + "learning_rate": 2.220162615345821e-05, + "loss": 0.1501, + "num_input_tokens_seen": 45350640, + "step": 78170 + }, + { + "epoch": 11.643580577896932, + "grad_norm": 0.005983190611004829, + "learning_rate": 2.2198397176522773e-05, + "loss": 0.2345, + "num_input_tokens_seen": 45353232, + "step": 78175 + }, + { + "epoch": 11.64432529043789, + "grad_norm": 58.97355651855469, + "learning_rate": 2.2195168246917564e-05, + "loss": 0.1845, + "num_input_tokens_seen": 45356304, + "step": 78180 + }, + { + "epoch": 11.64507000297885, + "grad_norm": 0.0020181071013212204, + "learning_rate": 2.219193936469714e-05, + "loss": 0.2202, + "num_input_tokens_seen": 45359120, + "step": 78185 + }, + { + "epoch": 11.64581471551981, + "grad_norm": 0.030081575736403465, + "learning_rate": 2.2188710529916033e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45361968, + "step": 78190 + }, + { + "epoch": 11.646559428060769, + "grad_norm": 0.00327571714296937, + "learning_rate": 2.218548174262882e-05, + "loss": 0.307, + "num_input_tokens_seen": 45364880, + "step": 78195 + }, + { + "epoch": 11.647304140601728, + "grad_norm": 0.002129188273102045, + "learning_rate": 2.218225300289002e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45367760, + "step": 78200 + }, + { + "epoch": 11.648048853142686, + "grad_norm": 9.825057983398438, + "learning_rate": 2.21790243107542e-05, + "loss": 0.2887, + "num_input_tokens_seen": 45370672, + "step": 78205 + }, + { + "epoch": 11.648793565683647, + "grad_norm": 0.11305545270442963, + "learning_rate": 2.2175795666275894e-05, + "loss": 0.0011, + "num_input_tokens_seen": 45373584, + "step": 78210 + }, + { + "epoch": 11.649538278224606, + "grad_norm": 0.008215252310037613, + "learning_rate": 2.2172567069509656e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45376400, + "step": 78215 + }, + { + "epoch": 11.650282990765565, + "grad_norm": 0.002471687737852335, + "learning_rate": 2.2169338520510025e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45379504, + "step": 78220 + }, + { + "epoch": 11.651027703306523, + "grad_norm": 34.70931625366211, + "learning_rate": 2.2166110019331526e-05, + "loss": 0.1317, + "num_input_tokens_seen": 45382544, + "step": 78225 + }, + { + "epoch": 11.651772415847482, + "grad_norm": 8.966224670410156, + "learning_rate": 2.2162881566028736e-05, + "loss": 0.0846, + "num_input_tokens_seen": 45385392, + "step": 78230 + }, + { + "epoch": 11.652517128388443, + "grad_norm": 0.01070273295044899, + "learning_rate": 2.2159653160656162e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45388304, + "step": 78235 + }, + { + "epoch": 11.653261840929401, + "grad_norm": 0.09643827378749847, + "learning_rate": 2.2156424803268374e-05, + "loss": 0.0008, + "num_input_tokens_seen": 45391312, + "step": 78240 + }, + { + "epoch": 11.65400655347036, + "grad_norm": 0.01594260148704052, + "learning_rate": 2.2153196493919896e-05, + "loss": 0.0013, + "num_input_tokens_seen": 45394224, + "step": 78245 + }, + { + "epoch": 11.654751266011319, + "grad_norm": 0.0009510329691693187, + "learning_rate": 2.214996823266527e-05, + "loss": 0.0164, + "num_input_tokens_seen": 45397328, + "step": 78250 + }, + { + "epoch": 11.65549597855228, + "grad_norm": 4.622618198394775, + "learning_rate": 2.2146740019559036e-05, + "loss": 0.3228, + "num_input_tokens_seen": 45400336, + "step": 78255 + }, + { + "epoch": 11.656240691093238, + "grad_norm": 287.42718505859375, + "learning_rate": 2.214351185465572e-05, + "loss": 0.2024, + "num_input_tokens_seen": 45403248, + "step": 78260 + }, + { + "epoch": 11.656985403634197, + "grad_norm": 0.015720732510089874, + "learning_rate": 2.214028373800988e-05, + "loss": 0.1072, + "num_input_tokens_seen": 45405936, + "step": 78265 + }, + { + "epoch": 11.657730116175156, + "grad_norm": 0.01724608801305294, + "learning_rate": 2.2137055669676027e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45408688, + "step": 78270 + }, + { + "epoch": 11.658474828716116, + "grad_norm": 0.016524964943528175, + "learning_rate": 2.213382764970872e-05, + "loss": 0.0804, + "num_input_tokens_seen": 45411472, + "step": 78275 + }, + { + "epoch": 11.659219541257075, + "grad_norm": 4.741809368133545, + "learning_rate": 2.2130599678162474e-05, + "loss": 0.0018, + "num_input_tokens_seen": 45414256, + "step": 78280 + }, + { + "epoch": 11.659964253798034, + "grad_norm": 0.014217250980436802, + "learning_rate": 2.212737175509184e-05, + "loss": 0.0147, + "num_input_tokens_seen": 45417072, + "step": 78285 + }, + { + "epoch": 11.660708966338992, + "grad_norm": 212.77401733398438, + "learning_rate": 2.2124143880551327e-05, + "loss": 0.066, + "num_input_tokens_seen": 45419952, + "step": 78290 + }, + { + "epoch": 11.661453678879953, + "grad_norm": 0.14488595724105835, + "learning_rate": 2.2120916054595492e-05, + "loss": 0.0007, + "num_input_tokens_seen": 45422864, + "step": 78295 + }, + { + "epoch": 11.662198391420912, + "grad_norm": 0.040472958236932755, + "learning_rate": 2.211768827727885e-05, + "loss": 0.0022, + "num_input_tokens_seen": 45425808, + "step": 78300 + }, + { + "epoch": 11.66294310396187, + "grad_norm": 0.10747608542442322, + "learning_rate": 2.211446054865593e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45428720, + "step": 78305 + }, + { + "epoch": 11.66368781650283, + "grad_norm": 0.010456605814397335, + "learning_rate": 2.2111232868781277e-05, + "loss": 0.1533, + "num_input_tokens_seen": 45431728, + "step": 78310 + }, + { + "epoch": 11.66443252904379, + "grad_norm": 0.07861433178186417, + "learning_rate": 2.21080052377094e-05, + "loss": 0.0007, + "num_input_tokens_seen": 45435088, + "step": 78315 + }, + { + "epoch": 11.665177241584749, + "grad_norm": 9.372856140136719, + "learning_rate": 2.210477765549484e-05, + "loss": 0.0166, + "num_input_tokens_seen": 45438416, + "step": 78320 + }, + { + "epoch": 11.665921954125707, + "grad_norm": 0.004066196735948324, + "learning_rate": 2.210155012219211e-05, + "loss": 0.0102, + "num_input_tokens_seen": 45441648, + "step": 78325 + }, + { + "epoch": 11.666666666666666, + "grad_norm": 0.00577979302033782, + "learning_rate": 2.2098322637855757e-05, + "loss": 0.0215, + "num_input_tokens_seen": 45444528, + "step": 78330 + }, + { + "epoch": 11.667411379207627, + "grad_norm": 0.01808742992579937, + "learning_rate": 2.2095095202540293e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45447280, + "step": 78335 + }, + { + "epoch": 11.668156091748585, + "grad_norm": 0.013937463983893394, + "learning_rate": 2.209186781630023e-05, + "loss": 0.0036, + "num_input_tokens_seen": 45450288, + "step": 78340 + }, + { + "epoch": 11.668900804289544, + "grad_norm": 0.0007918372866697609, + "learning_rate": 2.2088640479190116e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45453008, + "step": 78345 + }, + { + "epoch": 11.669645516830503, + "grad_norm": 0.0018764396663755178, + "learning_rate": 2.208541319126446e-05, + "loss": 0.0653, + "num_input_tokens_seen": 45455536, + "step": 78350 + }, + { + "epoch": 11.670390229371463, + "grad_norm": 0.006895305588841438, + "learning_rate": 2.2082185952577788e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45458352, + "step": 78355 + }, + { + "epoch": 11.671134941912422, + "grad_norm": 0.1803836226463318, + "learning_rate": 2.207895876318461e-05, + "loss": 0.0167, + "num_input_tokens_seen": 45461072, + "step": 78360 + }, + { + "epoch": 11.671879654453381, + "grad_norm": 0.06039834022521973, + "learning_rate": 2.2075731623139463e-05, + "loss": 0.2472, + "num_input_tokens_seen": 45464272, + "step": 78365 + }, + { + "epoch": 11.67262436699434, + "grad_norm": 0.1709490567445755, + "learning_rate": 2.207250453249685e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45467248, + "step": 78370 + }, + { + "epoch": 11.673369079535298, + "grad_norm": 0.006586907897144556, + "learning_rate": 2.2069277491311306e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45470192, + "step": 78375 + }, + { + "epoch": 11.674113792076259, + "grad_norm": 0.06296126544475555, + "learning_rate": 2.2066050499637344e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45472912, + "step": 78380 + }, + { + "epoch": 11.674858504617218, + "grad_norm": 0.004378170240670443, + "learning_rate": 2.2062823557529467e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45476176, + "step": 78385 + }, + { + "epoch": 11.675603217158177, + "grad_norm": 2.6803343296051025, + "learning_rate": 2.2059596665042213e-05, + "loss": 0.2547, + "num_input_tokens_seen": 45479216, + "step": 78390 + }, + { + "epoch": 11.676347929699137, + "grad_norm": 23.92318344116211, + "learning_rate": 2.2056369822230067e-05, + "loss": 0.0133, + "num_input_tokens_seen": 45482256, + "step": 78395 + }, + { + "epoch": 11.677092642240096, + "grad_norm": 342.1924133300781, + "learning_rate": 2.2053143029147574e-05, + "loss": 0.059, + "num_input_tokens_seen": 45485232, + "step": 78400 + }, + { + "epoch": 11.677837354781055, + "grad_norm": 0.02458564005792141, + "learning_rate": 2.2049916285849233e-05, + "loss": 0.0023, + "num_input_tokens_seen": 45487984, + "step": 78405 + }, + { + "epoch": 11.678582067322013, + "grad_norm": 0.028369035571813583, + "learning_rate": 2.204668959238955e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45490672, + "step": 78410 + }, + { + "epoch": 11.679326779862972, + "grad_norm": 0.002328941598534584, + "learning_rate": 2.2043462948823057e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45493744, + "step": 78415 + }, + { + "epoch": 11.680071492403933, + "grad_norm": 0.003003334626555443, + "learning_rate": 2.2040236355204244e-05, + "loss": 0.0056, + "num_input_tokens_seen": 45496432, + "step": 78420 + }, + { + "epoch": 11.680816204944891, + "grad_norm": 0.14807258546352386, + "learning_rate": 2.2037009811587638e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45499216, + "step": 78425 + }, + { + "epoch": 11.68156091748585, + "grad_norm": 0.00013282238796819001, + "learning_rate": 2.2033783318027725e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45502160, + "step": 78430 + }, + { + "epoch": 11.682305630026809, + "grad_norm": 53.280513763427734, + "learning_rate": 2.203055687457904e-05, + "loss": 0.2018, + "num_input_tokens_seen": 45505296, + "step": 78435 + }, + { + "epoch": 11.68305034256777, + "grad_norm": 0.001145301852375269, + "learning_rate": 2.2027330481296074e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45508112, + "step": 78440 + }, + { + "epoch": 11.683795055108728, + "grad_norm": 0.013926198706030846, + "learning_rate": 2.2024104138233343e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45510672, + "step": 78445 + }, + { + "epoch": 11.684539767649687, + "grad_norm": 0.002209024503827095, + "learning_rate": 2.2020877845445338e-05, + "loss": 0.0064, + "num_input_tokens_seen": 45513520, + "step": 78450 + }, + { + "epoch": 11.685284480190646, + "grad_norm": 43.55665969848633, + "learning_rate": 2.2017651602986584e-05, + "loss": 0.1492, + "num_input_tokens_seen": 45516272, + "step": 78455 + }, + { + "epoch": 11.686029192731606, + "grad_norm": 0.005598341580480337, + "learning_rate": 2.2014425410911575e-05, + "loss": 0.0065, + "num_input_tokens_seen": 45519088, + "step": 78460 + }, + { + "epoch": 11.686773905272565, + "grad_norm": 0.007896595634520054, + "learning_rate": 2.2011199269274804e-05, + "loss": 0.0005, + "num_input_tokens_seen": 45522000, + "step": 78465 + }, + { + "epoch": 11.687518617813524, + "grad_norm": 0.013345245271921158, + "learning_rate": 2.2007973178130795e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45525200, + "step": 78470 + }, + { + "epoch": 11.688263330354483, + "grad_norm": 0.00298802787438035, + "learning_rate": 2.2004747137534032e-05, + "loss": 0.0047, + "num_input_tokens_seen": 45527824, + "step": 78475 + }, + { + "epoch": 11.689008042895443, + "grad_norm": 0.02584105171263218, + "learning_rate": 2.2001521147539028e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45530544, + "step": 78480 + }, + { + "epoch": 11.689752755436402, + "grad_norm": 0.011929242871701717, + "learning_rate": 2.1998295208200263e-05, + "loss": 0.0454, + "num_input_tokens_seen": 45533360, + "step": 78485 + }, + { + "epoch": 11.69049746797736, + "grad_norm": 0.03383792191743851, + "learning_rate": 2.1995069319572264e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45536112, + "step": 78490 + }, + { + "epoch": 11.69124218051832, + "grad_norm": 0.0011100003030151129, + "learning_rate": 2.1991843481709513e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45538800, + "step": 78495 + }, + { + "epoch": 11.69198689305928, + "grad_norm": 0.014664755202829838, + "learning_rate": 2.19886176946665e-05, + "loss": 0.0, + "num_input_tokens_seen": 45541744, + "step": 78500 + }, + { + "epoch": 11.692731605600239, + "grad_norm": 0.002655621385201812, + "learning_rate": 2.1985391958497743e-05, + "loss": 0.1569, + "num_input_tokens_seen": 45544336, + "step": 78505 + }, + { + "epoch": 11.693476318141197, + "grad_norm": 10.496335983276367, + "learning_rate": 2.1982166273257716e-05, + "loss": 0.2008, + "num_input_tokens_seen": 45547056, + "step": 78510 + }, + { + "epoch": 11.694221030682156, + "grad_norm": 52.48786163330078, + "learning_rate": 2.1978940639000927e-05, + "loss": 0.0675, + "num_input_tokens_seen": 45549968, + "step": 78515 + }, + { + "epoch": 11.694965743223117, + "grad_norm": 34.67756652832031, + "learning_rate": 2.1975715055781858e-05, + "loss": 0.0528, + "num_input_tokens_seen": 45553232, + "step": 78520 + }, + { + "epoch": 11.695710455764075, + "grad_norm": 0.11811698973178864, + "learning_rate": 2.1972489523655016e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45556240, + "step": 78525 + }, + { + "epoch": 11.696455168305034, + "grad_norm": 0.0016192685579881072, + "learning_rate": 2.1969264042674877e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45559216, + "step": 78530 + }, + { + "epoch": 11.697199880845993, + "grad_norm": 0.001071005710400641, + "learning_rate": 2.1966038612895958e-05, + "loss": 0.1215, + "num_input_tokens_seen": 45562128, + "step": 78535 + }, + { + "epoch": 11.697944593386953, + "grad_norm": 0.025256045162677765, + "learning_rate": 2.1962813234372727e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45565200, + "step": 78540 + }, + { + "epoch": 11.698689305927912, + "grad_norm": 0.026795489713549614, + "learning_rate": 2.1959587907159673e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45568016, + "step": 78545 + }, + { + "epoch": 11.699434018468871, + "grad_norm": 0.005925777833908796, + "learning_rate": 2.19563626313113e-05, + "loss": 0.0711, + "num_input_tokens_seen": 45571120, + "step": 78550 + }, + { + "epoch": 11.70017873100983, + "grad_norm": 0.27270206809043884, + "learning_rate": 2.1953137406882078e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45574096, + "step": 78555 + }, + { + "epoch": 11.700923443550789, + "grad_norm": 0.00019801878079306334, + "learning_rate": 2.194991223392651e-05, + "loss": 0.0024, + "num_input_tokens_seen": 45577136, + "step": 78560 + }, + { + "epoch": 11.701668156091749, + "grad_norm": 42.20408630371094, + "learning_rate": 2.1946687112499066e-05, + "loss": 0.2257, + "num_input_tokens_seen": 45579920, + "step": 78565 + }, + { + "epoch": 11.702412868632708, + "grad_norm": 0.0033823878038674593, + "learning_rate": 2.194346204265425e-05, + "loss": 0.2105, + "num_input_tokens_seen": 45583472, + "step": 78570 + }, + { + "epoch": 11.703157581173667, + "grad_norm": 0.39818698167800903, + "learning_rate": 2.1940237024446535e-05, + "loss": 0.0871, + "num_input_tokens_seen": 45586192, + "step": 78575 + }, + { + "epoch": 11.703902293714627, + "grad_norm": 0.0008587394841015339, + "learning_rate": 2.19370120579304e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45589328, + "step": 78580 + }, + { + "epoch": 11.704647006255586, + "grad_norm": 0.01660236530005932, + "learning_rate": 2.1933787143160343e-05, + "loss": 0.0011, + "num_input_tokens_seen": 45592464, + "step": 78585 + }, + { + "epoch": 11.705391718796545, + "grad_norm": 23.011821746826172, + "learning_rate": 2.193056228019082e-05, + "loss": 0.0302, + "num_input_tokens_seen": 45595568, + "step": 78590 + }, + { + "epoch": 11.706136431337503, + "grad_norm": 0.005133288912475109, + "learning_rate": 2.1927337469076343e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45598512, + "step": 78595 + }, + { + "epoch": 11.706881143878462, + "grad_norm": 216.31466674804688, + "learning_rate": 2.1924112709871362e-05, + "loss": 0.159, + "num_input_tokens_seen": 45601680, + "step": 78600 + }, + { + "epoch": 11.707625856419423, + "grad_norm": 0.05088479071855545, + "learning_rate": 2.1920888002630382e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45604528, + "step": 78605 + }, + { + "epoch": 11.708370568960381, + "grad_norm": 0.0048484960570931435, + "learning_rate": 2.1917663347407867e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45607408, + "step": 78610 + }, + { + "epoch": 11.70911528150134, + "grad_norm": 0.0017953995848074555, + "learning_rate": 2.1914438744258298e-05, + "loss": 0.0363, + "num_input_tokens_seen": 45610320, + "step": 78615 + }, + { + "epoch": 11.709859994042299, + "grad_norm": 0.0622478723526001, + "learning_rate": 2.1911214193236153e-05, + "loss": 0.0107, + "num_input_tokens_seen": 45613072, + "step": 78620 + }, + { + "epoch": 11.71060470658326, + "grad_norm": 0.0014795465394854546, + "learning_rate": 2.1907989694395893e-05, + "loss": 0.0532, + "num_input_tokens_seen": 45615600, + "step": 78625 + }, + { + "epoch": 11.711349419124218, + "grad_norm": 14.415977478027344, + "learning_rate": 2.1904765247792016e-05, + "loss": 0.297, + "num_input_tokens_seen": 45618480, + "step": 78630 + }, + { + "epoch": 11.712094131665177, + "grad_norm": 0.0012802404817193747, + "learning_rate": 2.1901540853478976e-05, + "loss": 0.0884, + "num_input_tokens_seen": 45621392, + "step": 78635 + }, + { + "epoch": 11.712838844206136, + "grad_norm": 0.02702292427420616, + "learning_rate": 2.1898316511511264e-05, + "loss": 0.2196, + "num_input_tokens_seen": 45624144, + "step": 78640 + }, + { + "epoch": 11.713583556747096, + "grad_norm": 0.025038382038474083, + "learning_rate": 2.1895092221943335e-05, + "loss": 0.0102, + "num_input_tokens_seen": 45627280, + "step": 78645 + }, + { + "epoch": 11.714328269288055, + "grad_norm": 0.016655299812555313, + "learning_rate": 2.1891867984829672e-05, + "loss": 0.1854, + "num_input_tokens_seen": 45630288, + "step": 78650 + }, + { + "epoch": 11.715072981829014, + "grad_norm": 156.50650024414062, + "learning_rate": 2.1888643800224728e-05, + "loss": 0.1995, + "num_input_tokens_seen": 45633104, + "step": 78655 + }, + { + "epoch": 11.715817694369973, + "grad_norm": 0.002735569141805172, + "learning_rate": 2.1885419668183e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45636176, + "step": 78660 + }, + { + "epoch": 11.716562406910933, + "grad_norm": 0.0018802781123667955, + "learning_rate": 2.188219558875894e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45639120, + "step": 78665 + }, + { + "epoch": 11.717307119451892, + "grad_norm": 213.7245330810547, + "learning_rate": 2.1878971562007007e-05, + "loss": 0.0888, + "num_input_tokens_seen": 45642000, + "step": 78670 + }, + { + "epoch": 11.71805183199285, + "grad_norm": 85.9752197265625, + "learning_rate": 2.1875747587981686e-05, + "loss": 0.1783, + "num_input_tokens_seen": 45644752, + "step": 78675 + }, + { + "epoch": 11.71879654453381, + "grad_norm": 0.00535440631210804, + "learning_rate": 2.1872523666737428e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45647696, + "step": 78680 + }, + { + "epoch": 11.71954125707477, + "grad_norm": 0.0013754963874816895, + "learning_rate": 2.186929979832871e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45650896, + "step": 78685 + }, + { + "epoch": 11.720285969615729, + "grad_norm": 0.02030964009463787, + "learning_rate": 2.186607598280998e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45654192, + "step": 78690 + }, + { + "epoch": 11.721030682156687, + "grad_norm": 0.000209367586649023, + "learning_rate": 2.186285222023572e-05, + "loss": 0.0824, + "num_input_tokens_seen": 45656880, + "step": 78695 + }, + { + "epoch": 11.721775394697646, + "grad_norm": 0.004902437329292297, + "learning_rate": 2.185962851066039e-05, + "loss": 0.0027, + "num_input_tokens_seen": 45660272, + "step": 78700 + }, + { + "epoch": 11.722520107238607, + "grad_norm": 1.4316810369491577, + "learning_rate": 2.1856404854138426e-05, + "loss": 0.1711, + "num_input_tokens_seen": 45663088, + "step": 78705 + }, + { + "epoch": 11.723264819779565, + "grad_norm": 8.644400596618652, + "learning_rate": 2.1853181250724318e-05, + "loss": 0.0024, + "num_input_tokens_seen": 45666064, + "step": 78710 + }, + { + "epoch": 11.724009532320524, + "grad_norm": 0.011248343624174595, + "learning_rate": 2.1849957700472515e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45669008, + "step": 78715 + }, + { + "epoch": 11.724754244861483, + "grad_norm": 0.0008935515652410686, + "learning_rate": 2.1846734203437478e-05, + "loss": 0.2384, + "num_input_tokens_seen": 45671728, + "step": 78720 + }, + { + "epoch": 11.725498957402444, + "grad_norm": 0.028231162577867508, + "learning_rate": 2.1843510759673648e-05, + "loss": 0.0059, + "num_input_tokens_seen": 45674864, + "step": 78725 + }, + { + "epoch": 11.726243669943402, + "grad_norm": 0.002320609986782074, + "learning_rate": 2.184028736923551e-05, + "loss": 0.0478, + "num_input_tokens_seen": 45677712, + "step": 78730 + }, + { + "epoch": 11.726988382484361, + "grad_norm": 0.0015787174925208092, + "learning_rate": 2.1837064032177497e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45680272, + "step": 78735 + }, + { + "epoch": 11.72773309502532, + "grad_norm": 0.002298436826094985, + "learning_rate": 2.1833840748554075e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45683248, + "step": 78740 + }, + { + "epoch": 11.728477807566279, + "grad_norm": 2.9828763008117676, + "learning_rate": 2.18306175184197e-05, + "loss": 0.0115, + "num_input_tokens_seen": 45686192, + "step": 78745 + }, + { + "epoch": 11.729222520107239, + "grad_norm": 0.00608684029430151, + "learning_rate": 2.1827394341828817e-05, + "loss": 0.0005, + "num_input_tokens_seen": 45689360, + "step": 78750 + }, + { + "epoch": 11.729967232648198, + "grad_norm": 0.0031687153968960047, + "learning_rate": 2.1824171218835886e-05, + "loss": 0.1425, + "num_input_tokens_seen": 45692272, + "step": 78755 + }, + { + "epoch": 11.730711945189157, + "grad_norm": 80.89385986328125, + "learning_rate": 2.1820948149495343e-05, + "loss": 0.2544, + "num_input_tokens_seen": 45695280, + "step": 78760 + }, + { + "epoch": 11.731456657730115, + "grad_norm": 0.02892742119729519, + "learning_rate": 2.181772513386166e-05, + "loss": 0.0176, + "num_input_tokens_seen": 45697968, + "step": 78765 + }, + { + "epoch": 11.732201370271076, + "grad_norm": 0.12920363247394562, + "learning_rate": 2.1814502171989276e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45700944, + "step": 78770 + }, + { + "epoch": 11.732946082812035, + "grad_norm": 0.21392332017421722, + "learning_rate": 2.1811279263932642e-05, + "loss": 0.0173, + "num_input_tokens_seen": 45703888, + "step": 78775 + }, + { + "epoch": 11.733690795352993, + "grad_norm": 82.1504135131836, + "learning_rate": 2.1808056409746196e-05, + "loss": 0.1845, + "num_input_tokens_seen": 45706672, + "step": 78780 + }, + { + "epoch": 11.734435507893952, + "grad_norm": 0.0024088427890092134, + "learning_rate": 2.18048336094844e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45709744, + "step": 78785 + }, + { + "epoch": 11.735180220434913, + "grad_norm": 0.004167560022324324, + "learning_rate": 2.180161086320169e-05, + "loss": 0.2065, + "num_input_tokens_seen": 45712432, + "step": 78790 + }, + { + "epoch": 11.735924932975871, + "grad_norm": 0.21314473450183868, + "learning_rate": 2.1798388170952508e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45715248, + "step": 78795 + }, + { + "epoch": 11.73666964551683, + "grad_norm": 0.0008814418106339872, + "learning_rate": 2.1795165532791315e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45718288, + "step": 78800 + }, + { + "epoch": 11.737414358057789, + "grad_norm": 0.16176310181617737, + "learning_rate": 2.1791942948772533e-05, + "loss": 0.2064, + "num_input_tokens_seen": 45721136, + "step": 78805 + }, + { + "epoch": 11.73815907059875, + "grad_norm": 0.0027575737331062555, + "learning_rate": 2.1788720418950626e-05, + "loss": 0.2438, + "num_input_tokens_seen": 45724144, + "step": 78810 + }, + { + "epoch": 11.738903783139708, + "grad_norm": 0.003456340404227376, + "learning_rate": 2.178549794338001e-05, + "loss": 0.1873, + "num_input_tokens_seen": 45726960, + "step": 78815 + }, + { + "epoch": 11.739648495680667, + "grad_norm": 0.05488637834787369, + "learning_rate": 2.178227552211515e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45729936, + "step": 78820 + }, + { + "epoch": 11.740393208221626, + "grad_norm": 0.0005463984562084079, + "learning_rate": 2.1779053155210474e-05, + "loss": 0.0008, + "num_input_tokens_seen": 45732752, + "step": 78825 + }, + { + "epoch": 11.741137920762586, + "grad_norm": 0.002108963904902339, + "learning_rate": 2.177583084272041e-05, + "loss": 0.2157, + "num_input_tokens_seen": 45735792, + "step": 78830 + }, + { + "epoch": 11.741882633303545, + "grad_norm": 0.0005194743280299008, + "learning_rate": 2.177260858469942e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45738608, + "step": 78835 + }, + { + "epoch": 11.742627345844504, + "grad_norm": 0.5683776140213013, + "learning_rate": 2.176938638120192e-05, + "loss": 0.0079, + "num_input_tokens_seen": 45741584, + "step": 78840 + }, + { + "epoch": 11.743372058385463, + "grad_norm": 0.006610546726733446, + "learning_rate": 2.176616423228236e-05, + "loss": 0.0006, + "num_input_tokens_seen": 45744592, + "step": 78845 + }, + { + "epoch": 11.744116770926423, + "grad_norm": 0.0513729490339756, + "learning_rate": 2.1762942137995158e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45747504, + "step": 78850 + }, + { + "epoch": 11.744861483467382, + "grad_norm": 0.041741013526916504, + "learning_rate": 2.175972009839477e-05, + "loss": 0.2626, + "num_input_tokens_seen": 45750288, + "step": 78855 + }, + { + "epoch": 11.74560619600834, + "grad_norm": 0.017267534509301186, + "learning_rate": 2.1756498113535617e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45753264, + "step": 78860 + }, + { + "epoch": 11.7463509085493, + "grad_norm": 0.22478146851062775, + "learning_rate": 2.1753276183472122e-05, + "loss": 0.0006, + "num_input_tokens_seen": 45756144, + "step": 78865 + }, + { + "epoch": 11.74709562109026, + "grad_norm": 0.012398806400597095, + "learning_rate": 2.1750054308258737e-05, + "loss": 0.0166, + "num_input_tokens_seen": 45758960, + "step": 78870 + }, + { + "epoch": 11.747840333631219, + "grad_norm": 0.008264010772109032, + "learning_rate": 2.1746832487949874e-05, + "loss": 0.035, + "num_input_tokens_seen": 45761680, + "step": 78875 + }, + { + "epoch": 11.748585046172177, + "grad_norm": 0.00825907289981842, + "learning_rate": 2.174361072259998e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45764944, + "step": 78880 + }, + { + "epoch": 11.749329758713136, + "grad_norm": 0.0816170796751976, + "learning_rate": 2.1740389012263454e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45768048, + "step": 78885 + }, + { + "epoch": 11.750074471254095, + "grad_norm": 149.03916931152344, + "learning_rate": 2.173716735699476e-05, + "loss": 0.1984, + "num_input_tokens_seen": 45770992, + "step": 78890 + }, + { + "epoch": 11.750819183795056, + "grad_norm": 0.1071411743760109, + "learning_rate": 2.173394575684829e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45773936, + "step": 78895 + }, + { + "epoch": 11.751563896336014, + "grad_norm": 0.10174593329429626, + "learning_rate": 2.1730724211878506e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45776592, + "step": 78900 + }, + { + "epoch": 11.752308608876973, + "grad_norm": 0.008096232078969479, + "learning_rate": 2.172750272213981e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45779376, + "step": 78905 + }, + { + "epoch": 11.753053321417934, + "grad_norm": 7.960326194763184, + "learning_rate": 2.1724281287686622e-05, + "loss": 0.0011, + "num_input_tokens_seen": 45782288, + "step": 78910 + }, + { + "epoch": 11.753798033958892, + "grad_norm": 0.0024306816048920155, + "learning_rate": 2.1721059908573383e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45785296, + "step": 78915 + }, + { + "epoch": 11.754542746499851, + "grad_norm": 0.023345956578850746, + "learning_rate": 2.171783858485449e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45787952, + "step": 78920 + }, + { + "epoch": 11.75528745904081, + "grad_norm": 24.54650115966797, + "learning_rate": 2.171461731658439e-05, + "loss": 0.1989, + "num_input_tokens_seen": 45790704, + "step": 78925 + }, + { + "epoch": 11.756032171581769, + "grad_norm": 3.923501491546631, + "learning_rate": 2.1711396103817477e-05, + "loss": 0.0148, + "num_input_tokens_seen": 45793552, + "step": 78930 + }, + { + "epoch": 11.75677688412273, + "grad_norm": 31.20705795288086, + "learning_rate": 2.17081749466082e-05, + "loss": 0.0018, + "num_input_tokens_seen": 45796528, + "step": 78935 + }, + { + "epoch": 11.757521596663688, + "grad_norm": 0.0011457490036264062, + "learning_rate": 2.170495384501096e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45799536, + "step": 78940 + }, + { + "epoch": 11.758266309204647, + "grad_norm": 2.588684320449829, + "learning_rate": 2.1701732799080173e-05, + "loss": 0.1667, + "num_input_tokens_seen": 45802544, + "step": 78945 + }, + { + "epoch": 11.759011021745605, + "grad_norm": 0.007686696480959654, + "learning_rate": 2.169851180887026e-05, + "loss": 0.1345, + "num_input_tokens_seen": 45805520, + "step": 78950 + }, + { + "epoch": 11.759755734286566, + "grad_norm": 43.0899772644043, + "learning_rate": 2.1695290874435623e-05, + "loss": 0.2293, + "num_input_tokens_seen": 45808304, + "step": 78955 + }, + { + "epoch": 11.760500446827525, + "grad_norm": 0.031772319227457047, + "learning_rate": 2.16920699958307e-05, + "loss": 0.1503, + "num_input_tokens_seen": 45811120, + "step": 78960 + }, + { + "epoch": 11.761245159368483, + "grad_norm": 0.0026154331862926483, + "learning_rate": 2.168884917310988e-05, + "loss": 0.1813, + "num_input_tokens_seen": 45814096, + "step": 78965 + }, + { + "epoch": 11.761989871909442, + "grad_norm": 0.0023503091651946306, + "learning_rate": 2.16856284063276e-05, + "loss": 0.1546, + "num_input_tokens_seen": 45817008, + "step": 78970 + }, + { + "epoch": 11.762734584450403, + "grad_norm": 0.04851313680410385, + "learning_rate": 2.1682407695538255e-05, + "loss": 0.0197, + "num_input_tokens_seen": 45820080, + "step": 78975 + }, + { + "epoch": 11.763479296991362, + "grad_norm": 0.0031465552747249603, + "learning_rate": 2.1679187040796266e-05, + "loss": 0.155, + "num_input_tokens_seen": 45822896, + "step": 78980 + }, + { + "epoch": 11.76422400953232, + "grad_norm": 0.009293891489505768, + "learning_rate": 2.1675966442156038e-05, + "loss": 0.1755, + "num_input_tokens_seen": 45825904, + "step": 78985 + }, + { + "epoch": 11.764968722073279, + "grad_norm": 0.029859991744160652, + "learning_rate": 2.1672745899671965e-05, + "loss": 0.0494, + "num_input_tokens_seen": 45829072, + "step": 78990 + }, + { + "epoch": 11.76571343461424, + "grad_norm": 0.058552999049425125, + "learning_rate": 2.1669525413398477e-05, + "loss": 0.1468, + "num_input_tokens_seen": 45831920, + "step": 78995 + }, + { + "epoch": 11.766458147155198, + "grad_norm": 0.004650586750358343, + "learning_rate": 2.166630498338997e-05, + "loss": 0.1874, + "num_input_tokens_seen": 45835152, + "step": 79000 + }, + { + "epoch": 11.767202859696157, + "grad_norm": 0.004961838945746422, + "learning_rate": 2.1663084609700853e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45837904, + "step": 79005 + }, + { + "epoch": 11.767947572237116, + "grad_norm": 0.023326734080910683, + "learning_rate": 2.1659864292385528e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45840912, + "step": 79010 + }, + { + "epoch": 11.768692284778076, + "grad_norm": 0.06471745669841766, + "learning_rate": 2.1656644031498407e-05, + "loss": 0.0006, + "num_input_tokens_seen": 45843792, + "step": 79015 + }, + { + "epoch": 11.769436997319035, + "grad_norm": 0.9843964576721191, + "learning_rate": 2.1653423827093888e-05, + "loss": 0.0023, + "num_input_tokens_seen": 45846960, + "step": 79020 + }, + { + "epoch": 11.770181709859994, + "grad_norm": 67.57152557373047, + "learning_rate": 2.1650203679226362e-05, + "loss": 0.2539, + "num_input_tokens_seen": 45849776, + "step": 79025 + }, + { + "epoch": 11.770926422400953, + "grad_norm": 0.04214276373386383, + "learning_rate": 2.164698358795025e-05, + "loss": 0.0066, + "num_input_tokens_seen": 45852848, + "step": 79030 + }, + { + "epoch": 11.771671134941913, + "grad_norm": 0.10239556431770325, + "learning_rate": 2.164376355331993e-05, + "loss": 0.1199, + "num_input_tokens_seen": 45855920, + "step": 79035 + }, + { + "epoch": 11.772415847482872, + "grad_norm": 9.037190437316895, + "learning_rate": 2.1640543575389828e-05, + "loss": 0.0022, + "num_input_tokens_seen": 45858800, + "step": 79040 + }, + { + "epoch": 11.77316056002383, + "grad_norm": 0.21061895787715912, + "learning_rate": 2.163732365421432e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45861744, + "step": 79045 + }, + { + "epoch": 11.77390527256479, + "grad_norm": 23.307918548583984, + "learning_rate": 2.1634103789847813e-05, + "loss": 0.0611, + "num_input_tokens_seen": 45864528, + "step": 79050 + }, + { + "epoch": 11.77464998510575, + "grad_norm": 0.15471608936786652, + "learning_rate": 2.1630883982344695e-05, + "loss": 0.0023, + "num_input_tokens_seen": 45867280, + "step": 79055 + }, + { + "epoch": 11.775394697646709, + "grad_norm": 0.030099613592028618, + "learning_rate": 2.1627664231759383e-05, + "loss": 0.1409, + "num_input_tokens_seen": 45870032, + "step": 79060 + }, + { + "epoch": 11.776139410187668, + "grad_norm": 0.0066647520288825035, + "learning_rate": 2.1624444538146248e-05, + "loss": 0.1051, + "num_input_tokens_seen": 45872912, + "step": 79065 + }, + { + "epoch": 11.776884122728626, + "grad_norm": 0.008440149948000908, + "learning_rate": 2.1621224901559685e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45875504, + "step": 79070 + }, + { + "epoch": 11.777628835269585, + "grad_norm": 1.6462981700897217, + "learning_rate": 2.1618005322054103e-05, + "loss": 0.0502, + "num_input_tokens_seen": 45878256, + "step": 79075 + }, + { + "epoch": 11.778373547810546, + "grad_norm": 0.0214946698397398, + "learning_rate": 2.1614785799683877e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45881136, + "step": 79080 + }, + { + "epoch": 11.779118260351504, + "grad_norm": 0.002109231660142541, + "learning_rate": 2.1611566334503413e-05, + "loss": 0.0019, + "num_input_tokens_seen": 45884368, + "step": 79085 + }, + { + "epoch": 11.779862972892463, + "grad_norm": 0.0017909635789692402, + "learning_rate": 2.160834692656708e-05, + "loss": 0.0059, + "num_input_tokens_seen": 45887280, + "step": 79090 + }, + { + "epoch": 11.780607685433424, + "grad_norm": 0.028334667906165123, + "learning_rate": 2.160512757592929e-05, + "loss": 0.0527, + "num_input_tokens_seen": 45890000, + "step": 79095 + }, + { + "epoch": 11.781352397974382, + "grad_norm": 0.006375045981258154, + "learning_rate": 2.1601908282644418e-05, + "loss": 0.0205, + "num_input_tokens_seen": 45892720, + "step": 79100 + }, + { + "epoch": 11.782097110515341, + "grad_norm": 0.0036557854618877172, + "learning_rate": 2.1598689046766848e-05, + "loss": 0.0007, + "num_input_tokens_seen": 45895728, + "step": 79105 + }, + { + "epoch": 11.7828418230563, + "grad_norm": 0.026400882750749588, + "learning_rate": 2.1595469868350966e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45898576, + "step": 79110 + }, + { + "epoch": 11.783586535597259, + "grad_norm": 0.13839952647686005, + "learning_rate": 2.1592250747451166e-05, + "loss": 0.0006, + "num_input_tokens_seen": 45901648, + "step": 79115 + }, + { + "epoch": 11.78433124813822, + "grad_norm": 0.010505471378564835, + "learning_rate": 2.1589031684121828e-05, + "loss": 0.0004, + "num_input_tokens_seen": 45904496, + "step": 79120 + }, + { + "epoch": 11.785075960679178, + "grad_norm": 0.004188333638012409, + "learning_rate": 2.1585812678417323e-05, + "loss": 0.0005, + "num_input_tokens_seen": 45907440, + "step": 79125 + }, + { + "epoch": 11.785820673220137, + "grad_norm": 0.012005571275949478, + "learning_rate": 2.1582593730392055e-05, + "loss": 0.0158, + "num_input_tokens_seen": 45910160, + "step": 79130 + }, + { + "epoch": 11.786565385761095, + "grad_norm": 0.06878058612346649, + "learning_rate": 2.1579374840100383e-05, + "loss": 0.0427, + "num_input_tokens_seen": 45913168, + "step": 79135 + }, + { + "epoch": 11.787310098302056, + "grad_norm": 0.006665393244475126, + "learning_rate": 2.1576156007596705e-05, + "loss": 0.0005, + "num_input_tokens_seen": 45916016, + "step": 79140 + }, + { + "epoch": 11.788054810843015, + "grad_norm": 0.0006993901915848255, + "learning_rate": 2.1572937232935385e-05, + "loss": 0.0012, + "num_input_tokens_seen": 45918576, + "step": 79145 + }, + { + "epoch": 11.788799523383974, + "grad_norm": 0.0011929761385545135, + "learning_rate": 2.1569718516170806e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45921616, + "step": 79150 + }, + { + "epoch": 11.789544235924932, + "grad_norm": 0.0012927009956911206, + "learning_rate": 2.1566499857357352e-05, + "loss": 0.0481, + "num_input_tokens_seen": 45924400, + "step": 79155 + }, + { + "epoch": 11.790288948465893, + "grad_norm": 0.00032415700843557715, + "learning_rate": 2.1563281256549385e-05, + "loss": 0.1472, + "num_input_tokens_seen": 45927440, + "step": 79160 + }, + { + "epoch": 11.791033661006852, + "grad_norm": 0.0005150816286914051, + "learning_rate": 2.15600627138013e-05, + "loss": 0.2399, + "num_input_tokens_seen": 45930448, + "step": 79165 + }, + { + "epoch": 11.79177837354781, + "grad_norm": 0.030419865623116493, + "learning_rate": 2.155684422916745e-05, + "loss": 0.0882, + "num_input_tokens_seen": 45933392, + "step": 79170 + }, + { + "epoch": 11.792523086088769, + "grad_norm": 0.027313046157360077, + "learning_rate": 2.1553625802702226e-05, + "loss": 0.0002, + "num_input_tokens_seen": 45936144, + "step": 79175 + }, + { + "epoch": 11.79326779862973, + "grad_norm": 0.001569981686770916, + "learning_rate": 2.155040743445999e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45938896, + "step": 79180 + }, + { + "epoch": 11.794012511170688, + "grad_norm": 0.023945527151226997, + "learning_rate": 2.1547189124495103e-05, + "loss": 0.1721, + "num_input_tokens_seen": 45941872, + "step": 79185 + }, + { + "epoch": 11.794757223711647, + "grad_norm": 0.0007301964215002954, + "learning_rate": 2.1543970872861957e-05, + "loss": 0.0244, + "num_input_tokens_seen": 45945104, + "step": 79190 + }, + { + "epoch": 11.795501936252606, + "grad_norm": 0.021496212109923363, + "learning_rate": 2.15407526796149e-05, + "loss": 0.0717, + "num_input_tokens_seen": 45948240, + "step": 79195 + }, + { + "epoch": 11.796246648793566, + "grad_norm": 0.058681681752204895, + "learning_rate": 2.153753454480832e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45951408, + "step": 79200 + }, + { + "epoch": 11.796991361334525, + "grad_norm": 0.000721084070391953, + "learning_rate": 2.1534316468496575e-05, + "loss": 0.2065, + "num_input_tokens_seen": 45954320, + "step": 79205 + }, + { + "epoch": 11.797736073875484, + "grad_norm": 0.14156167209148407, + "learning_rate": 2.153109845073403e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45957136, + "step": 79210 + }, + { + "epoch": 11.798480786416443, + "grad_norm": 0.007784585002809763, + "learning_rate": 2.1527880491575042e-05, + "loss": 0.0479, + "num_input_tokens_seen": 45959792, + "step": 79215 + }, + { + "epoch": 11.799225498957403, + "grad_norm": 0.0013659982942044735, + "learning_rate": 2.1524662591073997e-05, + "loss": 0.1994, + "num_input_tokens_seen": 45962416, + "step": 79220 + }, + { + "epoch": 11.799970211498362, + "grad_norm": 0.06771261245012283, + "learning_rate": 2.1521444749285244e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45965616, + "step": 79225 + }, + { + "epoch": 11.80071492403932, + "grad_norm": 0.027359716594219208, + "learning_rate": 2.1518226966263136e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45968304, + "step": 79230 + }, + { + "epoch": 11.80145963658028, + "grad_norm": 52.45176696777344, + "learning_rate": 2.1515009242062055e-05, + "loss": 0.1534, + "num_input_tokens_seen": 45970928, + "step": 79235 + }, + { + "epoch": 11.80220434912124, + "grad_norm": 0.06739038974046707, + "learning_rate": 2.1511791576736346e-05, + "loss": 0.0866, + "num_input_tokens_seen": 45973904, + "step": 79240 + }, + { + "epoch": 11.802949061662199, + "grad_norm": 0.02486412040889263, + "learning_rate": 2.1508573970340377e-05, + "loss": 0.0003, + "num_input_tokens_seen": 45976560, + "step": 79245 + }, + { + "epoch": 11.803693774203158, + "grad_norm": 15.654631614685059, + "learning_rate": 2.1505356422928493e-05, + "loss": 0.1657, + "num_input_tokens_seen": 45979312, + "step": 79250 + }, + { + "epoch": 11.804438486744116, + "grad_norm": 0.00857124850153923, + "learning_rate": 2.1502138934555072e-05, + "loss": 0.0506, + "num_input_tokens_seen": 45982224, + "step": 79255 + }, + { + "epoch": 11.805183199285075, + "grad_norm": 0.8085479140281677, + "learning_rate": 2.1498921505274444e-05, + "loss": 0.0006, + "num_input_tokens_seen": 45984944, + "step": 79260 + }, + { + "epoch": 11.805927911826036, + "grad_norm": 0.8365883231163025, + "learning_rate": 2.1495704135140992e-05, + "loss": 0.4739, + "num_input_tokens_seen": 45988144, + "step": 79265 + }, + { + "epoch": 11.806672624366994, + "grad_norm": 0.001727319206111133, + "learning_rate": 2.1492486824209058e-05, + "loss": 0.0001, + "num_input_tokens_seen": 45990672, + "step": 79270 + }, + { + "epoch": 11.807417336907953, + "grad_norm": 0.004557053558528423, + "learning_rate": 2.1489269572532987e-05, + "loss": 0.1752, + "num_input_tokens_seen": 45993392, + "step": 79275 + }, + { + "epoch": 11.808162049448912, + "grad_norm": 0.008065281435847282, + "learning_rate": 2.1486052380167146e-05, + "loss": 0.0842, + "num_input_tokens_seen": 45996368, + "step": 79280 + }, + { + "epoch": 11.808906761989872, + "grad_norm": 0.04037093371152878, + "learning_rate": 2.1482835247165867e-05, + "loss": 0.0049, + "num_input_tokens_seen": 45999536, + "step": 79285 + }, + { + "epoch": 11.809651474530831, + "grad_norm": 0.014838209375739098, + "learning_rate": 2.1479618173583522e-05, + "loss": 0.0545, + "num_input_tokens_seen": 46002288, + "step": 79290 + }, + { + "epoch": 11.81039618707179, + "grad_norm": 0.0015201501082628965, + "learning_rate": 2.147640115947444e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46005136, + "step": 79295 + }, + { + "epoch": 11.811140899612749, + "grad_norm": 0.006843833718448877, + "learning_rate": 2.147318420489299e-05, + "loss": 0.0019, + "num_input_tokens_seen": 46008080, + "step": 79300 + }, + { + "epoch": 11.81188561215371, + "grad_norm": 0.49085134267807007, + "learning_rate": 2.1469967309893508e-05, + "loss": 0.0952, + "num_input_tokens_seen": 46011088, + "step": 79305 + }, + { + "epoch": 11.812630324694668, + "grad_norm": 0.017784763127565384, + "learning_rate": 2.1466750474530333e-05, + "loss": 0.011, + "num_input_tokens_seen": 46013616, + "step": 79310 + }, + { + "epoch": 11.813375037235627, + "grad_norm": 0.02844288945198059, + "learning_rate": 2.1463533698857827e-05, + "loss": 0.1879, + "num_input_tokens_seen": 46016592, + "step": 79315 + }, + { + "epoch": 11.814119749776586, + "grad_norm": 0.030737262219190598, + "learning_rate": 2.1460316982930313e-05, + "loss": 0.0035, + "num_input_tokens_seen": 46019344, + "step": 79320 + }, + { + "epoch": 11.814864462317546, + "grad_norm": 0.3919678032398224, + "learning_rate": 2.1457100326802155e-05, + "loss": 0.0058, + "num_input_tokens_seen": 46022672, + "step": 79325 + }, + { + "epoch": 11.815609174858505, + "grad_norm": 0.0029712191317230463, + "learning_rate": 2.1453883730527677e-05, + "loss": 0.0006, + "num_input_tokens_seen": 46025520, + "step": 79330 + }, + { + "epoch": 11.816353887399464, + "grad_norm": 74.20623779296875, + "learning_rate": 2.145066719416124e-05, + "loss": 0.0991, + "num_input_tokens_seen": 46028592, + "step": 79335 + }, + { + "epoch": 11.817098599940422, + "grad_norm": 0.03987545892596245, + "learning_rate": 2.1447450717757167e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46031824, + "step": 79340 + }, + { + "epoch": 11.817843312481383, + "grad_norm": 0.006069393362849951, + "learning_rate": 2.144423430136981e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46034480, + "step": 79345 + }, + { + "epoch": 11.818588025022342, + "grad_norm": 0.00246352213434875, + "learning_rate": 2.1441017945053497e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46037424, + "step": 79350 + }, + { + "epoch": 11.8193327375633, + "grad_norm": 0.02123318798840046, + "learning_rate": 2.143780164886256e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46040432, + "step": 79355 + }, + { + "epoch": 11.82007745010426, + "grad_norm": 1.4829963445663452, + "learning_rate": 2.143458541285136e-05, + "loss": 0.1796, + "num_input_tokens_seen": 46043888, + "step": 79360 + }, + { + "epoch": 11.82082216264522, + "grad_norm": 0.009119496680796146, + "learning_rate": 2.1431369237074196e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46046704, + "step": 79365 + }, + { + "epoch": 11.821566875186178, + "grad_norm": 0.0031038157176226377, + "learning_rate": 2.1428153121585438e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46049808, + "step": 79370 + }, + { + "epoch": 11.822311587727137, + "grad_norm": 0.007016804069280624, + "learning_rate": 2.1424937066439398e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46052656, + "step": 79375 + }, + { + "epoch": 11.823056300268096, + "grad_norm": 0.017904816195368767, + "learning_rate": 2.1421721071690415e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46055568, + "step": 79380 + }, + { + "epoch": 11.823801012809056, + "grad_norm": 0.00906533282250166, + "learning_rate": 2.141850513739282e-05, + "loss": 0.194, + "num_input_tokens_seen": 46058288, + "step": 79385 + }, + { + "epoch": 11.824545725350015, + "grad_norm": 71.31290435791016, + "learning_rate": 2.1415289263600927e-05, + "loss": 0.1181, + "num_input_tokens_seen": 46061552, + "step": 79390 + }, + { + "epoch": 11.825290437890974, + "grad_norm": 0.3019821047782898, + "learning_rate": 2.1412073450369092e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46064432, + "step": 79395 + }, + { + "epoch": 11.826035150431933, + "grad_norm": 9.40173625946045, + "learning_rate": 2.1408857697751617e-05, + "loss": 0.0027, + "num_input_tokens_seen": 46067184, + "step": 79400 + }, + { + "epoch": 11.826779862972892, + "grad_norm": 0.005281712859869003, + "learning_rate": 2.1405642005802852e-05, + "loss": 0.0725, + "num_input_tokens_seen": 46069968, + "step": 79405 + }, + { + "epoch": 11.827524575513852, + "grad_norm": 69.61919403076172, + "learning_rate": 2.1402426374577107e-05, + "loss": 0.2669, + "num_input_tokens_seen": 46072880, + "step": 79410 + }, + { + "epoch": 11.82826928805481, + "grad_norm": 0.016256587579846382, + "learning_rate": 2.139921080412872e-05, + "loss": 0.0014, + "num_input_tokens_seen": 46075632, + "step": 79415 + }, + { + "epoch": 11.82901400059577, + "grad_norm": 0.01011858880519867, + "learning_rate": 2.1395995294511993e-05, + "loss": 0.1413, + "num_input_tokens_seen": 46078544, + "step": 79420 + }, + { + "epoch": 11.82975871313673, + "grad_norm": 19.001859664916992, + "learning_rate": 2.1392779845781275e-05, + "loss": 0.069, + "num_input_tokens_seen": 46081456, + "step": 79425 + }, + { + "epoch": 11.830503425677689, + "grad_norm": 0.14915449917316437, + "learning_rate": 2.1389564457990875e-05, + "loss": 0.0005, + "num_input_tokens_seen": 46084528, + "step": 79430 + }, + { + "epoch": 11.831248138218648, + "grad_norm": 0.01957515813410282, + "learning_rate": 2.1386349131195103e-05, + "loss": 0.0005, + "num_input_tokens_seen": 46087280, + "step": 79435 + }, + { + "epoch": 11.831992850759606, + "grad_norm": 0.015228983014822006, + "learning_rate": 2.13831338654483e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46090256, + "step": 79440 + }, + { + "epoch": 11.832737563300565, + "grad_norm": 0.004666037857532501, + "learning_rate": 2.1379918660804766e-05, + "loss": 0.0343, + "num_input_tokens_seen": 46093072, + "step": 79445 + }, + { + "epoch": 11.833482275841526, + "grad_norm": 0.0018573739798739552, + "learning_rate": 2.1376703517318837e-05, + "loss": 0.0457, + "num_input_tokens_seen": 46095984, + "step": 79450 + }, + { + "epoch": 11.834226988382484, + "grad_norm": 0.015448222868144512, + "learning_rate": 2.1373488435044804e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46098928, + "step": 79455 + }, + { + "epoch": 11.834971700923443, + "grad_norm": 0.010110647417604923, + "learning_rate": 2.1370273414037013e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46102032, + "step": 79460 + }, + { + "epoch": 11.835716413464402, + "grad_norm": 0.0033147798385471106, + "learning_rate": 2.1367058454349763e-05, + "loss": 0.1943, + "num_input_tokens_seen": 46104688, + "step": 79465 + }, + { + "epoch": 11.836461126005362, + "grad_norm": 0.020954059436917305, + "learning_rate": 2.1363843556037365e-05, + "loss": 0.0004, + "num_input_tokens_seen": 46107664, + "step": 79470 + }, + { + "epoch": 11.837205838546321, + "grad_norm": 0.009180930443108082, + "learning_rate": 2.136062871915413e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46110480, + "step": 79475 + }, + { + "epoch": 11.83795055108728, + "grad_norm": 0.11458660662174225, + "learning_rate": 2.1357413943754374e-05, + "loss": 0.0199, + "num_input_tokens_seen": 46113296, + "step": 79480 + }, + { + "epoch": 11.838695263628239, + "grad_norm": 0.6342438459396362, + "learning_rate": 2.1354199229892416e-05, + "loss": 0.001, + "num_input_tokens_seen": 46116272, + "step": 79485 + }, + { + "epoch": 11.8394399761692, + "grad_norm": 0.005624328274279833, + "learning_rate": 2.1350984577622547e-05, + "loss": 0.0058, + "num_input_tokens_seen": 46119120, + "step": 79490 + }, + { + "epoch": 11.840184688710158, + "grad_norm": 0.007127092685550451, + "learning_rate": 2.1347769986999088e-05, + "loss": 0.0004, + "num_input_tokens_seen": 46121872, + "step": 79495 + }, + { + "epoch": 11.840929401251117, + "grad_norm": 0.031228765845298767, + "learning_rate": 2.1344555458076345e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46124720, + "step": 79500 + }, + { + "epoch": 11.841674113792076, + "grad_norm": 0.004475032445043325, + "learning_rate": 2.1341340990908627e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46127856, + "step": 79505 + }, + { + "epoch": 11.842418826333036, + "grad_norm": 4.06627893447876, + "learning_rate": 2.133812658555023e-05, + "loss": 0.0139, + "num_input_tokens_seen": 46130800, + "step": 79510 + }, + { + "epoch": 11.843163538873995, + "grad_norm": 0.0005611151573248208, + "learning_rate": 2.1334912242055454e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46133584, + "step": 79515 + }, + { + "epoch": 11.843908251414954, + "grad_norm": 0.010267787612974644, + "learning_rate": 2.1331697960478624e-05, + "loss": 0.0088, + "num_input_tokens_seen": 46136304, + "step": 79520 + }, + { + "epoch": 11.844652963955912, + "grad_norm": 0.0006743501289747655, + "learning_rate": 2.1328483740874014e-05, + "loss": 0.0071, + "num_input_tokens_seen": 46139376, + "step": 79525 + }, + { + "epoch": 11.845397676496873, + "grad_norm": 0.0015836793463677168, + "learning_rate": 2.1325269583295953e-05, + "loss": 0.0595, + "num_input_tokens_seen": 46142224, + "step": 79530 + }, + { + "epoch": 11.846142389037832, + "grad_norm": 0.00024854668299667537, + "learning_rate": 2.132205548779872e-05, + "loss": 0.1012, + "num_input_tokens_seen": 46145008, + "step": 79535 + }, + { + "epoch": 11.84688710157879, + "grad_norm": 0.001377431908622384, + "learning_rate": 2.131884145443663e-05, + "loss": 0.0011, + "num_input_tokens_seen": 46147888, + "step": 79540 + }, + { + "epoch": 11.84763181411975, + "grad_norm": 0.0006255063926801085, + "learning_rate": 2.131562748326397e-05, + "loss": 0.031, + "num_input_tokens_seen": 46150832, + "step": 79545 + }, + { + "epoch": 11.84837652666071, + "grad_norm": 0.0017758454196155071, + "learning_rate": 2.131241357433503e-05, + "loss": 0.0432, + "num_input_tokens_seen": 46153712, + "step": 79550 + }, + { + "epoch": 11.849121239201668, + "grad_norm": 0.022900592535734177, + "learning_rate": 2.1309199727704125e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46156400, + "step": 79555 + }, + { + "epoch": 11.849865951742627, + "grad_norm": 3.84194016456604, + "learning_rate": 2.130598594342553e-05, + "loss": 0.0709, + "num_input_tokens_seen": 46159280, + "step": 79560 + }, + { + "epoch": 11.850610664283586, + "grad_norm": 0.0030353786423802376, + "learning_rate": 2.130277222155355e-05, + "loss": 0.0005, + "num_input_tokens_seen": 46162384, + "step": 79565 + }, + { + "epoch": 11.851355376824547, + "grad_norm": 0.0058110677637159824, + "learning_rate": 2.129955856214248e-05, + "loss": 0.0, + "num_input_tokens_seen": 46165392, + "step": 79570 + }, + { + "epoch": 11.852100089365505, + "grad_norm": 0.0010896628955379128, + "learning_rate": 2.129634496524661e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46168368, + "step": 79575 + }, + { + "epoch": 11.852844801906464, + "grad_norm": 0.0006290427991189063, + "learning_rate": 2.1293131430920215e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46171248, + "step": 79580 + }, + { + "epoch": 11.853589514447423, + "grad_norm": 0.0029624546878039837, + "learning_rate": 2.128991795921761e-05, + "loss": 0.1502, + "num_input_tokens_seen": 46174128, + "step": 79585 + }, + { + "epoch": 11.854334226988382, + "grad_norm": 13.316695213317871, + "learning_rate": 2.128670455019307e-05, + "loss": 0.178, + "num_input_tokens_seen": 46177008, + "step": 79590 + }, + { + "epoch": 11.855078939529342, + "grad_norm": 8.995450973510742, + "learning_rate": 2.128349120390087e-05, + "loss": 0.0988, + "num_input_tokens_seen": 46179856, + "step": 79595 + }, + { + "epoch": 11.8558236520703, + "grad_norm": 0.00029384702793322504, + "learning_rate": 2.1280277920395322e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46183024, + "step": 79600 + }, + { + "epoch": 11.85656836461126, + "grad_norm": 0.00035066629061475396, + "learning_rate": 2.1277064699730694e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46186320, + "step": 79605 + }, + { + "epoch": 11.85731307715222, + "grad_norm": 0.013554721139371395, + "learning_rate": 2.1273851541961274e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46189296, + "step": 79610 + }, + { + "epoch": 11.858057789693179, + "grad_norm": 0.10716163367033005, + "learning_rate": 2.1270638447141337e-05, + "loss": 0.0009, + "num_input_tokens_seen": 46192432, + "step": 79615 + }, + { + "epoch": 11.858802502234138, + "grad_norm": 0.03112964518368244, + "learning_rate": 2.1267425415325185e-05, + "loss": 0.0191, + "num_input_tokens_seen": 46195248, + "step": 79620 + }, + { + "epoch": 11.859547214775096, + "grad_norm": 0.0008070872281678021, + "learning_rate": 2.1264212446567084e-05, + "loss": 0.2578, + "num_input_tokens_seen": 46198224, + "step": 79625 + }, + { + "epoch": 11.860291927316055, + "grad_norm": 6.17420591879636e-05, + "learning_rate": 2.1260999540921307e-05, + "loss": 0.2112, + "num_input_tokens_seen": 46201040, + "step": 79630 + }, + { + "epoch": 11.861036639857016, + "grad_norm": 0.03641033545136452, + "learning_rate": 2.1257786698442155e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46203952, + "step": 79635 + }, + { + "epoch": 11.861781352397974, + "grad_norm": 0.0015337140066549182, + "learning_rate": 2.125457391918389e-05, + "loss": 0.0547, + "num_input_tokens_seen": 46207024, + "step": 79640 + }, + { + "epoch": 11.862526064938933, + "grad_norm": 0.009490487165749073, + "learning_rate": 2.1251361203200793e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46210096, + "step": 79645 + }, + { + "epoch": 11.863270777479892, + "grad_norm": 0.021598856896162033, + "learning_rate": 2.124814855054713e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46213008, + "step": 79650 + }, + { + "epoch": 11.864015490020853, + "grad_norm": 0.0009468134376220405, + "learning_rate": 2.1244935961277197e-05, + "loss": 0.0008, + "num_input_tokens_seen": 46215792, + "step": 79655 + }, + { + "epoch": 11.864760202561811, + "grad_norm": 0.012723890133202076, + "learning_rate": 2.124172343544524e-05, + "loss": 0.0402, + "num_input_tokens_seen": 46218800, + "step": 79660 + }, + { + "epoch": 11.86550491510277, + "grad_norm": 0.02584611065685749, + "learning_rate": 2.123851097310556e-05, + "loss": 0.001, + "num_input_tokens_seen": 46221680, + "step": 79665 + }, + { + "epoch": 11.866249627643729, + "grad_norm": 0.8636004328727722, + "learning_rate": 2.1235298574312405e-05, + "loss": 0.1691, + "num_input_tokens_seen": 46224656, + "step": 79670 + }, + { + "epoch": 11.86699434018469, + "grad_norm": 0.00032989378087222576, + "learning_rate": 2.123208623912006e-05, + "loss": 0.0036, + "num_input_tokens_seen": 46227728, + "step": 79675 + }, + { + "epoch": 11.867739052725648, + "grad_norm": 0.0050870939157903194, + "learning_rate": 2.1228873967582787e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46230704, + "step": 79680 + }, + { + "epoch": 11.868483765266607, + "grad_norm": 0.015814611688256264, + "learning_rate": 2.1225661759754848e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46233712, + "step": 79685 + }, + { + "epoch": 11.869228477807566, + "grad_norm": 0.0015409993939101696, + "learning_rate": 2.1222449615690525e-05, + "loss": 0.0471, + "num_input_tokens_seen": 46236816, + "step": 79690 + }, + { + "epoch": 11.869973190348526, + "grad_norm": 0.03478246554732323, + "learning_rate": 2.121923753544407e-05, + "loss": 0.0004, + "num_input_tokens_seen": 46239824, + "step": 79695 + }, + { + "epoch": 11.870717902889485, + "grad_norm": 0.011056140996515751, + "learning_rate": 2.1216025519069766e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46242672, + "step": 79700 + }, + { + "epoch": 11.871462615430444, + "grad_norm": 0.00016435715951956809, + "learning_rate": 2.121281356662186e-05, + "loss": 0.2377, + "num_input_tokens_seen": 46245360, + "step": 79705 + }, + { + "epoch": 11.872207327971402, + "grad_norm": 0.005223105661571026, + "learning_rate": 2.1209601678154615e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46248208, + "step": 79710 + }, + { + "epoch": 11.872952040512363, + "grad_norm": 9.808415052248165e-05, + "learning_rate": 2.1206389853722306e-05, + "loss": 0.0491, + "num_input_tokens_seen": 46250928, + "step": 79715 + }, + { + "epoch": 11.873696753053322, + "grad_norm": 0.0007565726991742849, + "learning_rate": 2.1203178093379172e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46254000, + "step": 79720 + }, + { + "epoch": 11.87444146559428, + "grad_norm": 0.00322482967749238, + "learning_rate": 2.1199966397179492e-05, + "loss": 0.0, + "num_input_tokens_seen": 46257168, + "step": 79725 + }, + { + "epoch": 11.87518617813524, + "grad_norm": 135.0012664794922, + "learning_rate": 2.1196754765177514e-05, + "loss": 0.1134, + "num_input_tokens_seen": 46260112, + "step": 79730 + }, + { + "epoch": 11.8759308906762, + "grad_norm": 0.15334586799144745, + "learning_rate": 2.1193543197427507e-05, + "loss": 0.0071, + "num_input_tokens_seen": 46263120, + "step": 79735 + }, + { + "epoch": 11.876675603217159, + "grad_norm": 0.11599563807249069, + "learning_rate": 2.119033169398371e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46266128, + "step": 79740 + }, + { + "epoch": 11.877420315758117, + "grad_norm": 0.00024971531820483506, + "learning_rate": 2.1187120254900397e-05, + "loss": 0.0452, + "num_input_tokens_seen": 46268816, + "step": 79745 + }, + { + "epoch": 11.878165028299076, + "grad_norm": 0.24345634877681732, + "learning_rate": 2.118390888023181e-05, + "loss": 0.0006, + "num_input_tokens_seen": 46271568, + "step": 79750 + }, + { + "epoch": 11.878909740840037, + "grad_norm": 0.02980806864798069, + "learning_rate": 2.1180697570032195e-05, + "loss": 0.0, + "num_input_tokens_seen": 46274288, + "step": 79755 + }, + { + "epoch": 11.879654453380995, + "grad_norm": 7.287118205567822e-05, + "learning_rate": 2.117748632435582e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46276880, + "step": 79760 + }, + { + "epoch": 11.880399165921954, + "grad_norm": 19.2037353515625, + "learning_rate": 2.1174275143256927e-05, + "loss": 0.0101, + "num_input_tokens_seen": 46279632, + "step": 79765 + }, + { + "epoch": 11.881143878462913, + "grad_norm": 0.0004318252031225711, + "learning_rate": 2.1171064026789768e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46282608, + "step": 79770 + }, + { + "epoch": 11.881888591003872, + "grad_norm": 0.0011857975041493773, + "learning_rate": 2.1167852975008587e-05, + "loss": 0.1407, + "num_input_tokens_seen": 46285296, + "step": 79775 + }, + { + "epoch": 11.882633303544832, + "grad_norm": 5.420663001132198e-05, + "learning_rate": 2.1164641987967638e-05, + "loss": 0.0, + "num_input_tokens_seen": 46288048, + "step": 79780 + }, + { + "epoch": 11.883378016085791, + "grad_norm": 0.05342639237642288, + "learning_rate": 2.116143106572117e-05, + "loss": 0.0035, + "num_input_tokens_seen": 46290960, + "step": 79785 + }, + { + "epoch": 11.88412272862675, + "grad_norm": 0.001969965174794197, + "learning_rate": 2.115822020832341e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46293872, + "step": 79790 + }, + { + "epoch": 11.884867441167708, + "grad_norm": 0.002977985655888915, + "learning_rate": 2.1155009415828628e-05, + "loss": 0.2656, + "num_input_tokens_seen": 46296624, + "step": 79795 + }, + { + "epoch": 11.885612153708669, + "grad_norm": 0.002955279080197215, + "learning_rate": 2.1151798688291046e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46299984, + "step": 79800 + }, + { + "epoch": 11.886356866249628, + "grad_norm": 0.0009776043007150292, + "learning_rate": 2.1148588025764916e-05, + "loss": 0.0014, + "num_input_tokens_seen": 46302928, + "step": 79805 + }, + { + "epoch": 11.887101578790586, + "grad_norm": 0.001572376349940896, + "learning_rate": 2.1145377428304476e-05, + "loss": 0.0532, + "num_input_tokens_seen": 46305840, + "step": 79810 + }, + { + "epoch": 11.887846291331545, + "grad_norm": 0.000499738089274615, + "learning_rate": 2.1142166895963973e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46308432, + "step": 79815 + }, + { + "epoch": 11.888591003872506, + "grad_norm": 69.90351104736328, + "learning_rate": 2.1138956428797624e-05, + "loss": 0.0618, + "num_input_tokens_seen": 46311344, + "step": 79820 + }, + { + "epoch": 11.889335716413465, + "grad_norm": 0.025488782674074173, + "learning_rate": 2.1135746026859697e-05, + "loss": 0.2647, + "num_input_tokens_seen": 46314032, + "step": 79825 + }, + { + "epoch": 11.890080428954423, + "grad_norm": 0.010814071632921696, + "learning_rate": 2.1132535690204415e-05, + "loss": 0.0998, + "num_input_tokens_seen": 46316976, + "step": 79830 + }, + { + "epoch": 11.890825141495382, + "grad_norm": 0.002449653809890151, + "learning_rate": 2.1129325418886e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46319792, + "step": 79835 + }, + { + "epoch": 11.891569854036343, + "grad_norm": 0.000244631344685331, + "learning_rate": 2.1126115212958708e-05, + "loss": 0.2352, + "num_input_tokens_seen": 46322704, + "step": 79840 + }, + { + "epoch": 11.892314566577301, + "grad_norm": 11.876937866210938, + "learning_rate": 2.112290507247675e-05, + "loss": 0.1044, + "num_input_tokens_seen": 46325520, + "step": 79845 + }, + { + "epoch": 11.89305927911826, + "grad_norm": 0.01033048052340746, + "learning_rate": 2.1119694997494382e-05, + "loss": 0.0009, + "num_input_tokens_seen": 46328176, + "step": 79850 + }, + { + "epoch": 11.893803991659219, + "grad_norm": 0.0028246541041880846, + "learning_rate": 2.1116484988065813e-05, + "loss": 0.1572, + "num_input_tokens_seen": 46331344, + "step": 79855 + }, + { + "epoch": 11.89454870420018, + "grad_norm": 0.01882852427661419, + "learning_rate": 2.1113275044245293e-05, + "loss": 0.001, + "num_input_tokens_seen": 46334096, + "step": 79860 + }, + { + "epoch": 11.895293416741138, + "grad_norm": 0.002320573665201664, + "learning_rate": 2.1110065166087037e-05, + "loss": 0.0793, + "num_input_tokens_seen": 46336816, + "step": 79865 + }, + { + "epoch": 11.896038129282097, + "grad_norm": 0.009947153739631176, + "learning_rate": 2.110685535364528e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46339696, + "step": 79870 + }, + { + "epoch": 11.896782841823056, + "grad_norm": 0.01651112362742424, + "learning_rate": 2.1103645606974244e-05, + "loss": 0.0, + "num_input_tokens_seen": 46342480, + "step": 79875 + }, + { + "epoch": 11.897527554364016, + "grad_norm": 0.15780951082706451, + "learning_rate": 2.1100435926128146e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46345200, + "step": 79880 + }, + { + "epoch": 11.898272266904975, + "grad_norm": 0.028127456083893776, + "learning_rate": 2.1097226311161232e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46347856, + "step": 79885 + }, + { + "epoch": 11.899016979445934, + "grad_norm": 0.008107667788863182, + "learning_rate": 2.1094016762127698e-05, + "loss": 0.0465, + "num_input_tokens_seen": 46350704, + "step": 79890 + }, + { + "epoch": 11.899761691986892, + "grad_norm": 20.16825294494629, + "learning_rate": 2.10908072790818e-05, + "loss": 0.1265, + "num_input_tokens_seen": 46353648, + "step": 79895 + }, + { + "epoch": 11.900506404527853, + "grad_norm": 0.015834083780646324, + "learning_rate": 2.1087597862077726e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46356336, + "step": 79900 + }, + { + "epoch": 11.901251117068812, + "grad_norm": 3.7391743659973145, + "learning_rate": 2.1084388511169718e-05, + "loss": 0.0152, + "num_input_tokens_seen": 46360528, + "step": 79905 + }, + { + "epoch": 11.90199582960977, + "grad_norm": 0.003853294299915433, + "learning_rate": 2.1081179226411985e-05, + "loss": 0.0, + "num_input_tokens_seen": 46363824, + "step": 79910 + }, + { + "epoch": 11.90274054215073, + "grad_norm": 0.012496787123382092, + "learning_rate": 2.107797000785874e-05, + "loss": 0.0032, + "num_input_tokens_seen": 46366704, + "step": 79915 + }, + { + "epoch": 11.90348525469169, + "grad_norm": 0.000992841087281704, + "learning_rate": 2.107476085556421e-05, + "loss": 0.1228, + "num_input_tokens_seen": 46369712, + "step": 79920 + }, + { + "epoch": 11.904229967232649, + "grad_norm": 0.010500956326723099, + "learning_rate": 2.10715517695826e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46372592, + "step": 79925 + }, + { + "epoch": 11.904974679773607, + "grad_norm": 5.564972877502441, + "learning_rate": 2.106834274996814e-05, + "loss": 0.0119, + "num_input_tokens_seen": 46375504, + "step": 79930 + }, + { + "epoch": 11.905719392314566, + "grad_norm": 0.06378959119319916, + "learning_rate": 2.1065133796775026e-05, + "loss": 0.0005, + "num_input_tokens_seen": 46378320, + "step": 79935 + }, + { + "epoch": 11.906464104855527, + "grad_norm": 3.5756454467773438, + "learning_rate": 2.1061924910057485e-05, + "loss": 0.006, + "num_input_tokens_seen": 46381296, + "step": 79940 + }, + { + "epoch": 11.907208817396485, + "grad_norm": 0.04424016550183296, + "learning_rate": 2.1058716089869707e-05, + "loss": 0.008, + "num_input_tokens_seen": 46384048, + "step": 79945 + }, + { + "epoch": 11.907953529937444, + "grad_norm": 0.004567022901028395, + "learning_rate": 2.1055507336265925e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46387152, + "step": 79950 + }, + { + "epoch": 11.908698242478403, + "grad_norm": 0.0015647667460143566, + "learning_rate": 2.105229864930034e-05, + "loss": 0.0597, + "num_input_tokens_seen": 46389936, + "step": 79955 + }, + { + "epoch": 11.909442955019362, + "grad_norm": 0.0017045275308191776, + "learning_rate": 2.1049090029027146e-05, + "loss": 0.0, + "num_input_tokens_seen": 46392752, + "step": 79960 + }, + { + "epoch": 11.910187667560322, + "grad_norm": 0.0013554244069382548, + "learning_rate": 2.104588147550057e-05, + "loss": 0.0, + "num_input_tokens_seen": 46395824, + "step": 79965 + }, + { + "epoch": 11.910932380101281, + "grad_norm": 0.0005835825577378273, + "learning_rate": 2.1042672988774805e-05, + "loss": 0.0, + "num_input_tokens_seen": 46399024, + "step": 79970 + }, + { + "epoch": 11.91167709264224, + "grad_norm": 0.004780121613293886, + "learning_rate": 2.103946456890406e-05, + "loss": 0.1098, + "num_input_tokens_seen": 46401904, + "step": 79975 + }, + { + "epoch": 11.912421805183198, + "grad_norm": 1.0336182117462158, + "learning_rate": 2.1036256215942526e-05, + "loss": 0.0007, + "num_input_tokens_seen": 46404688, + "step": 79980 + }, + { + "epoch": 11.913166517724159, + "grad_norm": 0.003390203695744276, + "learning_rate": 2.1033047929944427e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46407312, + "step": 79985 + }, + { + "epoch": 11.913911230265118, + "grad_norm": 0.03140663355588913, + "learning_rate": 2.102983971096395e-05, + "loss": 0.1036, + "num_input_tokens_seen": 46410352, + "step": 79990 + }, + { + "epoch": 11.914655942806077, + "grad_norm": 0.015044515952467918, + "learning_rate": 2.1026631559055285e-05, + "loss": 0.1067, + "num_input_tokens_seen": 46413296, + "step": 79995 + }, + { + "epoch": 11.915400655347035, + "grad_norm": 0.03123338334262371, + "learning_rate": 2.1023423474272652e-05, + "loss": 0.0027, + "num_input_tokens_seen": 46416176, + "step": 80000 + }, + { + "epoch": 11.916145367887996, + "grad_norm": 199.97854614257812, + "learning_rate": 2.1020215456670234e-05, + "loss": 0.1971, + "num_input_tokens_seen": 46418992, + "step": 80005 + }, + { + "epoch": 11.916890080428955, + "grad_norm": 0.0008614790858700871, + "learning_rate": 2.1017007506302233e-05, + "loss": 0.0, + "num_input_tokens_seen": 46421712, + "step": 80010 + }, + { + "epoch": 11.917634792969913, + "grad_norm": 0.0020595744717866182, + "learning_rate": 2.1013799623222833e-05, + "loss": 0.0142, + "num_input_tokens_seen": 46424368, + "step": 80015 + }, + { + "epoch": 11.918379505510872, + "grad_norm": 0.00018101542082149535, + "learning_rate": 2.1010591807486253e-05, + "loss": 0.0, + "num_input_tokens_seen": 46427088, + "step": 80020 + }, + { + "epoch": 11.919124218051833, + "grad_norm": 0.11357564479112625, + "learning_rate": 2.100738405914665e-05, + "loss": 0.1004, + "num_input_tokens_seen": 46429904, + "step": 80025 + }, + { + "epoch": 11.919868930592791, + "grad_norm": 0.018164003267884254, + "learning_rate": 2.1004176378258252e-05, + "loss": 0.0, + "num_input_tokens_seen": 46432528, + "step": 80030 + }, + { + "epoch": 11.92061364313375, + "grad_norm": 7.688459396362305, + "learning_rate": 2.100096876487523e-05, + "loss": 0.0009, + "num_input_tokens_seen": 46435568, + "step": 80035 + }, + { + "epoch": 11.921358355674709, + "grad_norm": 0.0001965398114407435, + "learning_rate": 2.0997761219051777e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46438512, + "step": 80040 + }, + { + "epoch": 11.92210306821567, + "grad_norm": 0.013390966691076756, + "learning_rate": 2.099455374084208e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46441456, + "step": 80045 + }, + { + "epoch": 11.922847780756628, + "grad_norm": 0.0005880999378859997, + "learning_rate": 2.0991346330300314e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46444432, + "step": 80050 + }, + { + "epoch": 11.923592493297587, + "grad_norm": 0.002181432442739606, + "learning_rate": 2.0988138987480694e-05, + "loss": 0.0004, + "num_input_tokens_seen": 46447216, + "step": 80055 + }, + { + "epoch": 11.924337205838546, + "grad_norm": 4.930593967437744, + "learning_rate": 2.0984931712437377e-05, + "loss": 0.0353, + "num_input_tokens_seen": 46449872, + "step": 80060 + }, + { + "epoch": 11.925081918379506, + "grad_norm": 10.301668167114258, + "learning_rate": 2.0981724505224563e-05, + "loss": 0.1938, + "num_input_tokens_seen": 46452976, + "step": 80065 + }, + { + "epoch": 11.925826630920465, + "grad_norm": 0.0014239710289984941, + "learning_rate": 2.0978517365896433e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46455856, + "step": 80070 + }, + { + "epoch": 11.926571343461424, + "grad_norm": 5.5057869758456945e-05, + "learning_rate": 2.0975310294507162e-05, + "loss": 0.0006, + "num_input_tokens_seen": 46458576, + "step": 80075 + }, + { + "epoch": 11.927316056002383, + "grad_norm": 0.03761054947972298, + "learning_rate": 2.0972103291110933e-05, + "loss": 0.0044, + "num_input_tokens_seen": 46461488, + "step": 80080 + }, + { + "epoch": 11.928060768543343, + "grad_norm": 430.056884765625, + "learning_rate": 2.096889635576192e-05, + "loss": 0.0764, + "num_input_tokens_seen": 46464208, + "step": 80085 + }, + { + "epoch": 11.928805481084302, + "grad_norm": 0.006587509531527758, + "learning_rate": 2.0965689488514314e-05, + "loss": 0.0008, + "num_input_tokens_seen": 46466992, + "step": 80090 + }, + { + "epoch": 11.92955019362526, + "grad_norm": 136.814453125, + "learning_rate": 2.0962482689422276e-05, + "loss": 0.5721, + "num_input_tokens_seen": 46470096, + "step": 80095 + }, + { + "epoch": 11.93029490616622, + "grad_norm": 3.722897529602051, + "learning_rate": 2.0959275958539996e-05, + "loss": 0.1109, + "num_input_tokens_seen": 46472880, + "step": 80100 + }, + { + "epoch": 11.931039618707178, + "grad_norm": 0.0003088362864218652, + "learning_rate": 2.095606929592164e-05, + "loss": 0.0218, + "num_input_tokens_seen": 46475600, + "step": 80105 + }, + { + "epoch": 11.931784331248139, + "grad_norm": 0.004639448132365942, + "learning_rate": 2.0952862701621385e-05, + "loss": 0.1314, + "num_input_tokens_seen": 46478512, + "step": 80110 + }, + { + "epoch": 11.932529043789097, + "grad_norm": 0.0002715588780120015, + "learning_rate": 2.09496561756934e-05, + "loss": 0.0, + "num_input_tokens_seen": 46481520, + "step": 80115 + }, + { + "epoch": 11.933273756330056, + "grad_norm": 0.0006133955321274698, + "learning_rate": 2.094644971819185e-05, + "loss": 0.2594, + "num_input_tokens_seen": 46484208, + "step": 80120 + }, + { + "epoch": 11.934018468871017, + "grad_norm": 0.0004512887680903077, + "learning_rate": 2.0943243329170922e-05, + "loss": 0.0, + "num_input_tokens_seen": 46487248, + "step": 80125 + }, + { + "epoch": 11.934763181411975, + "grad_norm": 0.0017935234354808927, + "learning_rate": 2.0940037008684772e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46490192, + "step": 80130 + }, + { + "epoch": 11.935507893952934, + "grad_norm": 0.0005081784911453724, + "learning_rate": 2.0936830756787568e-05, + "loss": 0.0, + "num_input_tokens_seen": 46492816, + "step": 80135 + }, + { + "epoch": 11.936252606493893, + "grad_norm": 0.005808461457490921, + "learning_rate": 2.0933624573533477e-05, + "loss": 0.008, + "num_input_tokens_seen": 46495856, + "step": 80140 + }, + { + "epoch": 11.936997319034852, + "grad_norm": 0.0026400841306895018, + "learning_rate": 2.0930418458976676e-05, + "loss": 0.0913, + "num_input_tokens_seen": 46498864, + "step": 80145 + }, + { + "epoch": 11.937742031575812, + "grad_norm": 0.0006116272998042405, + "learning_rate": 2.0927212413171316e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46501520, + "step": 80150 + }, + { + "epoch": 11.938486744116771, + "grad_norm": 0.00028594082687050104, + "learning_rate": 2.092400643617155e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46504432, + "step": 80155 + }, + { + "epoch": 11.93923145665773, + "grad_norm": 0.0018173670396208763, + "learning_rate": 2.092080052803157e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46507120, + "step": 80160 + }, + { + "epoch": 11.939976169198689, + "grad_norm": 0.010444139130413532, + "learning_rate": 2.0917594688805507e-05, + "loss": 0.0037, + "num_input_tokens_seen": 46509872, + "step": 80165 + }, + { + "epoch": 11.940720881739649, + "grad_norm": 0.002534173196181655, + "learning_rate": 2.091438891854754e-05, + "loss": 0.1281, + "num_input_tokens_seen": 46512624, + "step": 80170 + }, + { + "epoch": 11.941465594280608, + "grad_norm": 0.01571774110198021, + "learning_rate": 2.091118321731181e-05, + "loss": 0.0, + "num_input_tokens_seen": 46515952, + "step": 80175 + }, + { + "epoch": 11.942210306821567, + "grad_norm": 0.008537075482308865, + "learning_rate": 2.0907977585152495e-05, + "loss": 0.2433, + "num_input_tokens_seen": 46519120, + "step": 80180 + }, + { + "epoch": 11.942955019362525, + "grad_norm": 0.0007599641103297472, + "learning_rate": 2.0904772022123725e-05, + "loss": 0.0, + "num_input_tokens_seen": 46521904, + "step": 80185 + }, + { + "epoch": 11.943699731903486, + "grad_norm": 0.008675706572830677, + "learning_rate": 2.0901566528279687e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46524848, + "step": 80190 + }, + { + "epoch": 11.944444444444445, + "grad_norm": 0.0016294653760269284, + "learning_rate": 2.089836110367451e-05, + "loss": 0.0, + "num_input_tokens_seen": 46527984, + "step": 80195 + }, + { + "epoch": 11.945189156985403, + "grad_norm": 0.2957630455493927, + "learning_rate": 2.0895155748362353e-05, + "loss": 0.0079, + "num_input_tokens_seen": 46530704, + "step": 80200 + }, + { + "epoch": 11.945933869526362, + "grad_norm": 0.03811873123049736, + "learning_rate": 2.0891950462397372e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46533712, + "step": 80205 + }, + { + "epoch": 11.946678582067323, + "grad_norm": 0.003346341894939542, + "learning_rate": 2.0888745245833703e-05, + "loss": 0.1261, + "num_input_tokens_seen": 46536528, + "step": 80210 + }, + { + "epoch": 11.947423294608281, + "grad_norm": 0.8642683625221252, + "learning_rate": 2.0885540098725513e-05, + "loss": 0.0453, + "num_input_tokens_seen": 46539408, + "step": 80215 + }, + { + "epoch": 11.94816800714924, + "grad_norm": 0.014162523671984673, + "learning_rate": 2.088233502112693e-05, + "loss": 0.172, + "num_input_tokens_seen": 46542288, + "step": 80220 + }, + { + "epoch": 11.948912719690199, + "grad_norm": 0.0009373055072501302, + "learning_rate": 2.0879130013092124e-05, + "loss": 0.1718, + "num_input_tokens_seen": 46545104, + "step": 80225 + }, + { + "epoch": 11.94965743223116, + "grad_norm": 0.0054537770338356495, + "learning_rate": 2.087592507467523e-05, + "loss": 0.0018, + "num_input_tokens_seen": 46548400, + "step": 80230 + }, + { + "epoch": 11.950402144772118, + "grad_norm": 0.004705659579485655, + "learning_rate": 2.087272020593038e-05, + "loss": 0.2001, + "num_input_tokens_seen": 46551216, + "step": 80235 + }, + { + "epoch": 11.951146857313077, + "grad_norm": 0.00598375778645277, + "learning_rate": 2.086951540691174e-05, + "loss": 0.0145, + "num_input_tokens_seen": 46554480, + "step": 80240 + }, + { + "epoch": 11.951891569854036, + "grad_norm": 0.011426836252212524, + "learning_rate": 2.086631067767342e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46557200, + "step": 80245 + }, + { + "epoch": 11.952636282394996, + "grad_norm": 0.0024368902668356895, + "learning_rate": 2.0863106018269596e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46560304, + "step": 80250 + }, + { + "epoch": 11.953380994935955, + "grad_norm": 0.001750342664308846, + "learning_rate": 2.085990142875438e-05, + "loss": 0.2407, + "num_input_tokens_seen": 46563152, + "step": 80255 + }, + { + "epoch": 11.954125707476914, + "grad_norm": 0.019186099991202354, + "learning_rate": 2.0856696909181932e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46565904, + "step": 80260 + }, + { + "epoch": 11.954870420017873, + "grad_norm": 0.0026616386603564024, + "learning_rate": 2.0853492459606373e-05, + "loss": 0.0352, + "num_input_tokens_seen": 46568784, + "step": 80265 + }, + { + "epoch": 11.955615132558833, + "grad_norm": 7.941535949707031, + "learning_rate": 2.085028808008185e-05, + "loss": 0.0486, + "num_input_tokens_seen": 46571568, + "step": 80270 + }, + { + "epoch": 11.956359845099792, + "grad_norm": 0.0017264584312215447, + "learning_rate": 2.084708377066249e-05, + "loss": 0.1067, + "num_input_tokens_seen": 46574480, + "step": 80275 + }, + { + "epoch": 11.95710455764075, + "grad_norm": 0.0016851266846060753, + "learning_rate": 2.084387953140242e-05, + "loss": 0.0279, + "num_input_tokens_seen": 46577264, + "step": 80280 + }, + { + "epoch": 11.95784927018171, + "grad_norm": 0.03383021801710129, + "learning_rate": 2.0840675362355792e-05, + "loss": 0.0011, + "num_input_tokens_seen": 46580080, + "step": 80285 + }, + { + "epoch": 11.958593982722668, + "grad_norm": 0.03074258379638195, + "learning_rate": 2.0837471263576716e-05, + "loss": 0.0063, + "num_input_tokens_seen": 46582864, + "step": 80290 + }, + { + "epoch": 11.959338695263629, + "grad_norm": 0.0015858032274991274, + "learning_rate": 2.0834267235119342e-05, + "loss": 0.0027, + "num_input_tokens_seen": 46585616, + "step": 80295 + }, + { + "epoch": 11.960083407804587, + "grad_norm": 41.79151153564453, + "learning_rate": 2.083106327703779e-05, + "loss": 0.0152, + "num_input_tokens_seen": 46588624, + "step": 80300 + }, + { + "epoch": 11.960828120345546, + "grad_norm": 0.11773834377527237, + "learning_rate": 2.0827859389386184e-05, + "loss": 0.0005, + "num_input_tokens_seen": 46591472, + "step": 80305 + }, + { + "epoch": 11.961572832886507, + "grad_norm": 0.00013287848560139537, + "learning_rate": 2.0824655572218655e-05, + "loss": 0.0698, + "num_input_tokens_seen": 46594352, + "step": 80310 + }, + { + "epoch": 11.962317545427466, + "grad_norm": 1.1498463153839111, + "learning_rate": 2.0821451825589315e-05, + "loss": 0.0011, + "num_input_tokens_seen": 46597040, + "step": 80315 + }, + { + "epoch": 11.963062257968424, + "grad_norm": 30.4056396484375, + "learning_rate": 2.0818248149552315e-05, + "loss": 0.2942, + "num_input_tokens_seen": 46599728, + "step": 80320 + }, + { + "epoch": 11.963806970509383, + "grad_norm": 17.63993263244629, + "learning_rate": 2.0815044544161748e-05, + "loss": 0.0802, + "num_input_tokens_seen": 46602640, + "step": 80325 + }, + { + "epoch": 11.964551683050342, + "grad_norm": 62.42561340332031, + "learning_rate": 2.081184100947176e-05, + "loss": 0.0088, + "num_input_tokens_seen": 46605520, + "step": 80330 + }, + { + "epoch": 11.965296395591302, + "grad_norm": 9.049198150634766, + "learning_rate": 2.080863754553646e-05, + "loss": 0.3525, + "num_input_tokens_seen": 46608464, + "step": 80335 + }, + { + "epoch": 11.966041108132261, + "grad_norm": 0.006572726182639599, + "learning_rate": 2.080543415240997e-05, + "loss": 0.004, + "num_input_tokens_seen": 46611472, + "step": 80340 + }, + { + "epoch": 11.96678582067322, + "grad_norm": 4.241944789886475, + "learning_rate": 2.0802230830146398e-05, + "loss": 0.0346, + "num_input_tokens_seen": 46614608, + "step": 80345 + }, + { + "epoch": 11.967530533214179, + "grad_norm": 0.034917183220386505, + "learning_rate": 2.0799027578799882e-05, + "loss": 0.0008, + "num_input_tokens_seen": 46617584, + "step": 80350 + }, + { + "epoch": 11.96827524575514, + "grad_norm": 0.01542076375335455, + "learning_rate": 2.0795824398424523e-05, + "loss": 0.1392, + "num_input_tokens_seen": 46620496, + "step": 80355 + }, + { + "epoch": 11.969019958296098, + "grad_norm": 0.030427025631070137, + "learning_rate": 2.079262128907443e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46623120, + "step": 80360 + }, + { + "epoch": 11.969764670837057, + "grad_norm": 0.09185274690389633, + "learning_rate": 2.0789418250803732e-05, + "loss": 0.0009, + "num_input_tokens_seen": 46626256, + "step": 80365 + }, + { + "epoch": 11.970509383378015, + "grad_norm": 0.006813287269324064, + "learning_rate": 2.078621528366653e-05, + "loss": 0.1516, + "num_input_tokens_seen": 46629200, + "step": 80370 + }, + { + "epoch": 11.971254095918976, + "grad_norm": 0.005888029932975769, + "learning_rate": 2.078301238771694e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46631952, + "step": 80375 + }, + { + "epoch": 11.971998808459935, + "grad_norm": 0.5875582098960876, + "learning_rate": 2.0779809563009063e-05, + "loss": 0.0576, + "num_input_tokens_seen": 46634704, + "step": 80380 + }, + { + "epoch": 11.972743521000893, + "grad_norm": 0.0028016739524900913, + "learning_rate": 2.0776606809597022e-05, + "loss": 0.0011, + "num_input_tokens_seen": 46637648, + "step": 80385 + }, + { + "epoch": 11.973488233541852, + "grad_norm": 0.0064588082022964954, + "learning_rate": 2.077340412753492e-05, + "loss": 0.1689, + "num_input_tokens_seen": 46640656, + "step": 80390 + }, + { + "epoch": 11.974232946082813, + "grad_norm": 2.7574033737182617, + "learning_rate": 2.077020151687684e-05, + "loss": 0.0039, + "num_input_tokens_seen": 46643536, + "step": 80395 + }, + { + "epoch": 11.974977658623772, + "grad_norm": 0.0067623029462993145, + "learning_rate": 2.0766998977676928e-05, + "loss": 0.0951, + "num_input_tokens_seen": 46646288, + "step": 80400 + }, + { + "epoch": 11.97572237116473, + "grad_norm": 0.6141113638877869, + "learning_rate": 2.0763796509989252e-05, + "loss": 0.0004, + "num_input_tokens_seen": 46649072, + "step": 80405 + }, + { + "epoch": 11.976467083705689, + "grad_norm": 0.00200195936486125, + "learning_rate": 2.076059411386794e-05, + "loss": 0.0009, + "num_input_tokens_seen": 46652336, + "step": 80410 + }, + { + "epoch": 11.97721179624665, + "grad_norm": 0.011300824582576752, + "learning_rate": 2.075739178936707e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46655184, + "step": 80415 + }, + { + "epoch": 11.977956508787608, + "grad_norm": 0.0755162313580513, + "learning_rate": 2.0754189536540764e-05, + "loss": 0.0078, + "num_input_tokens_seen": 46658128, + "step": 80420 + }, + { + "epoch": 11.978701221328567, + "grad_norm": 0.03368563577532768, + "learning_rate": 2.07509873554431e-05, + "loss": 0.144, + "num_input_tokens_seen": 46660816, + "step": 80425 + }, + { + "epoch": 11.979445933869526, + "grad_norm": 0.08841775357723236, + "learning_rate": 2.074778524612819e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46663824, + "step": 80430 + }, + { + "epoch": 11.980190646410486, + "grad_norm": 0.00805534515529871, + "learning_rate": 2.0744583208650136e-05, + "loss": 0.0237, + "num_input_tokens_seen": 46666704, + "step": 80435 + }, + { + "epoch": 11.980935358951445, + "grad_norm": 0.014413119293749332, + "learning_rate": 2.0741381243063015e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46669520, + "step": 80440 + }, + { + "epoch": 11.981680071492404, + "grad_norm": 0.0007184948772192001, + "learning_rate": 2.0738179349420935e-05, + "loss": 0.0566, + "num_input_tokens_seen": 46672240, + "step": 80445 + }, + { + "epoch": 11.982424784033363, + "grad_norm": 0.09811940044164658, + "learning_rate": 2.0734977527777974e-05, + "loss": 0.096, + "num_input_tokens_seen": 46675184, + "step": 80450 + }, + { + "epoch": 11.983169496574323, + "grad_norm": 0.002605035901069641, + "learning_rate": 2.0731775778188242e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46679088, + "step": 80455 + }, + { + "epoch": 11.983914209115282, + "grad_norm": 9.903298377990723, + "learning_rate": 2.0728574100705813e-05, + "loss": 0.0438, + "num_input_tokens_seen": 46681680, + "step": 80460 + }, + { + "epoch": 11.98465892165624, + "grad_norm": 0.0034621278755366802, + "learning_rate": 2.0725372495384786e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46684336, + "step": 80465 + }, + { + "epoch": 11.9854036341972, + "grad_norm": 0.020170435309410095, + "learning_rate": 2.0722170962279248e-05, + "loss": 0.0009, + "num_input_tokens_seen": 46686800, + "step": 80470 + }, + { + "epoch": 11.986148346738158, + "grad_norm": 0.001371934893541038, + "learning_rate": 2.0718969501443286e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46689744, + "step": 80475 + }, + { + "epoch": 11.986893059279119, + "grad_norm": 0.036858126521110535, + "learning_rate": 2.0715768112930984e-05, + "loss": 0.1471, + "num_input_tokens_seen": 46692624, + "step": 80480 + }, + { + "epoch": 11.987637771820078, + "grad_norm": 0.011078078299760818, + "learning_rate": 2.071256679679641e-05, + "loss": 0.008, + "num_input_tokens_seen": 46695280, + "step": 80485 + }, + { + "epoch": 11.988382484361036, + "grad_norm": 0.01107528991997242, + "learning_rate": 2.070936555309368e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46698192, + "step": 80490 + }, + { + "epoch": 11.989127196901995, + "grad_norm": 0.012832156382501125, + "learning_rate": 2.0706164381876852e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46701136, + "step": 80495 + }, + { + "epoch": 11.989871909442956, + "grad_norm": 0.00321055855602026, + "learning_rate": 2.0702963283200018e-05, + "loss": 0.0, + "num_input_tokens_seen": 46704144, + "step": 80500 + }, + { + "epoch": 11.990616621983914, + "grad_norm": 6.539295196533203, + "learning_rate": 2.0699762257117235e-05, + "loss": 0.0665, + "num_input_tokens_seen": 46707152, + "step": 80505 + }, + { + "epoch": 11.991361334524873, + "grad_norm": 0.001888418453745544, + "learning_rate": 2.0696561303682617e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46709936, + "step": 80510 + }, + { + "epoch": 11.992106047065832, + "grad_norm": 0.0013953684829175472, + "learning_rate": 2.0693360422950217e-05, + "loss": 0.0957, + "num_input_tokens_seen": 46712912, + "step": 80515 + }, + { + "epoch": 11.992850759606792, + "grad_norm": 0.01256493292748928, + "learning_rate": 2.0690159614974107e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46715856, + "step": 80520 + }, + { + "epoch": 11.993595472147751, + "grad_norm": 0.013332363218069077, + "learning_rate": 2.068695887980838e-05, + "loss": 0.2846, + "num_input_tokens_seen": 46718480, + "step": 80525 + }, + { + "epoch": 11.99434018468871, + "grad_norm": 0.011794657446444035, + "learning_rate": 2.0683758217507092e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46721296, + "step": 80530 + }, + { + "epoch": 11.995084897229669, + "grad_norm": 0.07440438121557236, + "learning_rate": 2.068055762812433e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46724016, + "step": 80535 + }, + { + "epoch": 11.99582960977063, + "grad_norm": 0.014736227691173553, + "learning_rate": 2.0677357111714147e-05, + "loss": 0.0979, + "num_input_tokens_seen": 46726864, + "step": 80540 + }, + { + "epoch": 11.996574322311588, + "grad_norm": 0.015194644220173359, + "learning_rate": 2.067415666833063e-05, + "loss": 0.0037, + "num_input_tokens_seen": 46729584, + "step": 80545 + }, + { + "epoch": 11.997319034852547, + "grad_norm": 0.007734041661024094, + "learning_rate": 2.0670956298027833e-05, + "loss": 0.0034, + "num_input_tokens_seen": 46732400, + "step": 80550 + }, + { + "epoch": 11.998063747393505, + "grad_norm": 0.006130393594503403, + "learning_rate": 2.0667756000859835e-05, + "loss": 0.2096, + "num_input_tokens_seen": 46734896, + "step": 80555 + }, + { + "epoch": 11.998808459934466, + "grad_norm": 0.008435572497546673, + "learning_rate": 2.06645557768807e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46737904, + "step": 80560 + }, + { + "epoch": 11.999553172475425, + "grad_norm": 1.873286485671997, + "learning_rate": 2.0661355626144483e-05, + "loss": 0.0371, + "num_input_tokens_seen": 46740624, + "step": 80565 + }, + { + "epoch": 12.0, + "eval_loss": 1.8436585664749146, + "eval_runtime": 51.5705, + "eval_samples_per_second": 57.863, + "eval_steps_per_second": 14.466, + "num_input_tokens_seen": 46741816, + "step": 80568 + }, + { + "epoch": 12.000297885016384, + "grad_norm": 0.016401266679167747, + "learning_rate": 2.0658155548705258e-05, + "loss": 0.2221, + "num_input_tokens_seen": 46743320, + "step": 80570 + }, + { + "epoch": 12.001042597557342, + "grad_norm": 0.008675933815538883, + "learning_rate": 2.065495554461707e-05, + "loss": 0.0007, + "num_input_tokens_seen": 46745944, + "step": 80575 + }, + { + "epoch": 12.001787310098303, + "grad_norm": 0.0029756943695247173, + "learning_rate": 2.0651755613934005e-05, + "loss": 0.1792, + "num_input_tokens_seen": 46748792, + "step": 80580 + }, + { + "epoch": 12.002532022639262, + "grad_norm": 0.09469664096832275, + "learning_rate": 2.0648555756710098e-05, + "loss": 0.0004, + "num_input_tokens_seen": 46752088, + "step": 80585 + }, + { + "epoch": 12.00327673518022, + "grad_norm": 3.73609182133805e-05, + "learning_rate": 2.064535597299943e-05, + "loss": 0.082, + "num_input_tokens_seen": 46755288, + "step": 80590 + }, + { + "epoch": 12.004021447721179, + "grad_norm": 30.598388671875, + "learning_rate": 2.0642156262856045e-05, + "loss": 0.1487, + "num_input_tokens_seen": 46758296, + "step": 80595 + }, + { + "epoch": 12.00476616026214, + "grad_norm": 0.24725249409675598, + "learning_rate": 2.0638956626333993e-05, + "loss": 0.0006, + "num_input_tokens_seen": 46761048, + "step": 80600 + }, + { + "epoch": 12.005510872803098, + "grad_norm": 0.00602373480796814, + "learning_rate": 2.0635757063487348e-05, + "loss": 0.0005, + "num_input_tokens_seen": 46764024, + "step": 80605 + }, + { + "epoch": 12.006255585344057, + "grad_norm": 15.108871459960938, + "learning_rate": 2.0632557574370137e-05, + "loss": 0.0038, + "num_input_tokens_seen": 46766776, + "step": 80610 + }, + { + "epoch": 12.007000297885016, + "grad_norm": 0.012554696761071682, + "learning_rate": 2.0629358159036437e-05, + "loss": 0.0008, + "num_input_tokens_seen": 46769400, + "step": 80615 + }, + { + "epoch": 12.007745010425976, + "grad_norm": 0.008221089839935303, + "learning_rate": 2.0626158817540284e-05, + "loss": 0.0004, + "num_input_tokens_seen": 46772280, + "step": 80620 + }, + { + "epoch": 12.008489722966935, + "grad_norm": 0.013495899736881256, + "learning_rate": 2.0622959549935738e-05, + "loss": 0.0006, + "num_input_tokens_seen": 46775384, + "step": 80625 + }, + { + "epoch": 12.009234435507894, + "grad_norm": 0.00725532928481698, + "learning_rate": 2.061976035627684e-05, + "loss": 0.0, + "num_input_tokens_seen": 46778264, + "step": 80630 + }, + { + "epoch": 12.009979148048853, + "grad_norm": 0.0014817103510722518, + "learning_rate": 2.061656123661764e-05, + "loss": 0.0, + "num_input_tokens_seen": 46781112, + "step": 80635 + }, + { + "epoch": 12.010723860589811, + "grad_norm": 0.03314521908760071, + "learning_rate": 2.0613362191012185e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46784024, + "step": 80640 + }, + { + "epoch": 12.011468573130772, + "grad_norm": 0.0020408015698194504, + "learning_rate": 2.0610163219514504e-05, + "loss": 0.0, + "num_input_tokens_seen": 46787032, + "step": 80645 + }, + { + "epoch": 12.01221328567173, + "grad_norm": 0.00652332603931427, + "learning_rate": 2.0606964322178667e-05, + "loss": 0.1377, + "num_input_tokens_seen": 46790072, + "step": 80650 + }, + { + "epoch": 12.01295799821269, + "grad_norm": 0.005186873953789473, + "learning_rate": 2.0603765499058695e-05, + "loss": 0.0, + "num_input_tokens_seen": 46792856, + "step": 80655 + }, + { + "epoch": 12.013702710753648, + "grad_norm": 0.03790440782904625, + "learning_rate": 2.0600566750208642e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46795672, + "step": 80660 + }, + { + "epoch": 12.014447423294609, + "grad_norm": 1.3512037992477417, + "learning_rate": 2.0597368075682542e-05, + "loss": 0.0011, + "num_input_tokens_seen": 46798584, + "step": 80665 + }, + { + "epoch": 12.015192135835568, + "grad_norm": 24.904762268066406, + "learning_rate": 2.0594169475534436e-05, + "loss": 0.0175, + "num_input_tokens_seen": 46801304, + "step": 80670 + }, + { + "epoch": 12.015936848376526, + "grad_norm": 0.00444587180390954, + "learning_rate": 2.0590970949818357e-05, + "loss": 0.0396, + "num_input_tokens_seen": 46804056, + "step": 80675 + }, + { + "epoch": 12.016681560917485, + "grad_norm": 417.30615234375, + "learning_rate": 2.0587772498588336e-05, + "loss": 0.0824, + "num_input_tokens_seen": 46806744, + "step": 80680 + }, + { + "epoch": 12.017426273458446, + "grad_norm": 0.0022950523998588324, + "learning_rate": 2.0584574121898424e-05, + "loss": 0.021, + "num_input_tokens_seen": 46809720, + "step": 80685 + }, + { + "epoch": 12.018170985999404, + "grad_norm": 0.017078084871172905, + "learning_rate": 2.0581375819802635e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46812568, + "step": 80690 + }, + { + "epoch": 12.018915698540363, + "grad_norm": 0.07768075168132782, + "learning_rate": 2.057817759235502e-05, + "loss": 0.0004, + "num_input_tokens_seen": 46815160, + "step": 80695 + }, + { + "epoch": 12.019660411081322, + "grad_norm": 0.33672067523002625, + "learning_rate": 2.0574979439609593e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46817848, + "step": 80700 + }, + { + "epoch": 12.020405123622282, + "grad_norm": 0.10594814270734787, + "learning_rate": 2.0571781361620398e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46820632, + "step": 80705 + }, + { + "epoch": 12.021149836163241, + "grad_norm": 0.014276907779276371, + "learning_rate": 2.0568583358441445e-05, + "loss": 0.0, + "num_input_tokens_seen": 46823640, + "step": 80710 + }, + { + "epoch": 12.0218945487042, + "grad_norm": 0.13482436537742615, + "learning_rate": 2.0565385430126783e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46826360, + "step": 80715 + }, + { + "epoch": 12.022639261245159, + "grad_norm": 0.000286584923742339, + "learning_rate": 2.0562187576730428e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46829176, + "step": 80720 + }, + { + "epoch": 12.02338397378612, + "grad_norm": 0.002168114995583892, + "learning_rate": 2.0558989798306395e-05, + "loss": 0.0, + "num_input_tokens_seen": 46832536, + "step": 80725 + }, + { + "epoch": 12.024128686327078, + "grad_norm": 0.00437611248344183, + "learning_rate": 2.0555792094908722e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46835480, + "step": 80730 + }, + { + "epoch": 12.024873398868037, + "grad_norm": 0.0002942899882327765, + "learning_rate": 2.055259446659142e-05, + "loss": 0.0, + "num_input_tokens_seen": 46838712, + "step": 80735 + }, + { + "epoch": 12.025618111408996, + "grad_norm": 0.0005875374772585928, + "learning_rate": 2.0549396913408522e-05, + "loss": 0.0011, + "num_input_tokens_seen": 46841624, + "step": 80740 + }, + { + "epoch": 12.026362823949956, + "grad_norm": 0.001088409568183124, + "learning_rate": 2.0546199435414028e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46844600, + "step": 80745 + }, + { + "epoch": 12.027107536490915, + "grad_norm": 0.24983741343021393, + "learning_rate": 2.054300203266198e-05, + "loss": 0.0016, + "num_input_tokens_seen": 46847256, + "step": 80750 + }, + { + "epoch": 12.027852249031874, + "grad_norm": 0.0001930349535541609, + "learning_rate": 2.0539804705206378e-05, + "loss": 0.1532, + "num_input_tokens_seen": 46849816, + "step": 80755 + }, + { + "epoch": 12.028596961572832, + "grad_norm": 0.0025530073326081038, + "learning_rate": 2.0536607453101236e-05, + "loss": 0.0001, + "num_input_tokens_seen": 46853112, + "step": 80760 + }, + { + "epoch": 12.029341674113793, + "grad_norm": 60.93095016479492, + "learning_rate": 2.0533410276400582e-05, + "loss": 0.1893, + "num_input_tokens_seen": 46855800, + "step": 80765 + }, + { + "epoch": 12.030086386654752, + "grad_norm": 2.1034958362579346, + "learning_rate": 2.053021317515842e-05, + "loss": 0.0119, + "num_input_tokens_seen": 46858552, + "step": 80770 + }, + { + "epoch": 12.03083109919571, + "grad_norm": 109.85137176513672, + "learning_rate": 2.0527016149428767e-05, + "loss": 0.0365, + "num_input_tokens_seen": 46861688, + "step": 80775 + }, + { + "epoch": 12.03157581173667, + "grad_norm": 0.00037842747406102717, + "learning_rate": 2.052381919926562e-05, + "loss": 0.0, + "num_input_tokens_seen": 46864280, + "step": 80780 + }, + { + "epoch": 12.03232052427763, + "grad_norm": 2.9801109121763147e-05, + "learning_rate": 2.052062232472301e-05, + "loss": 0.0, + "num_input_tokens_seen": 46866840, + "step": 80785 + }, + { + "epoch": 12.033065236818588, + "grad_norm": 0.0017466999124735594, + "learning_rate": 2.0517425525854926e-05, + "loss": 0.0, + "num_input_tokens_seen": 46869464, + "step": 80790 + }, + { + "epoch": 12.033809949359547, + "grad_norm": 78.34420013427734, + "learning_rate": 2.051422880271538e-05, + "loss": 0.0154, + "num_input_tokens_seen": 46872408, + "step": 80795 + }, + { + "epoch": 12.034554661900506, + "grad_norm": 0.0032356795854866505, + "learning_rate": 2.051103215535839e-05, + "loss": 0.0, + "num_input_tokens_seen": 46875256, + "step": 80800 + }, + { + "epoch": 12.035299374441466, + "grad_norm": 0.001223643310368061, + "learning_rate": 2.0507835583837943e-05, + "loss": 0.0478, + "num_input_tokens_seen": 46878296, + "step": 80805 + }, + { + "epoch": 12.036044086982425, + "grad_norm": 0.0029441416263580322, + "learning_rate": 2.050463908820805e-05, + "loss": 0.0, + "num_input_tokens_seen": 46881112, + "step": 80810 + }, + { + "epoch": 12.036788799523384, + "grad_norm": 471.1427001953125, + "learning_rate": 2.0501442668522703e-05, + "loss": 0.1675, + "num_input_tokens_seen": 46883928, + "step": 80815 + }, + { + "epoch": 12.037533512064343, + "grad_norm": 0.0008538945112377405, + "learning_rate": 2.0498246324835918e-05, + "loss": 0.0, + "num_input_tokens_seen": 46886744, + "step": 80820 + }, + { + "epoch": 12.038278224605302, + "grad_norm": 0.000415499962400645, + "learning_rate": 2.0495050057201683e-05, + "loss": 0.0, + "num_input_tokens_seen": 46889848, + "step": 80825 + }, + { + "epoch": 12.039022937146262, + "grad_norm": 0.0006843662122264504, + "learning_rate": 2.0491853865674002e-05, + "loss": 0.0, + "num_input_tokens_seen": 46892888, + "step": 80830 + }, + { + "epoch": 12.03976764968722, + "grad_norm": 0.00033192368573509157, + "learning_rate": 2.0488657750306865e-05, + "loss": 0.0, + "num_input_tokens_seen": 46895928, + "step": 80835 + }, + { + "epoch": 12.04051236222818, + "grad_norm": 0.4393695592880249, + "learning_rate": 2.0485461711154265e-05, + "loss": 0.0009, + "num_input_tokens_seen": 46898840, + "step": 80840 + }, + { + "epoch": 12.041257074769138, + "grad_norm": 0.0007643544813618064, + "learning_rate": 2.0482265748270213e-05, + "loss": 0.0, + "num_input_tokens_seen": 46901368, + "step": 80845 + }, + { + "epoch": 12.042001787310099, + "grad_norm": 0.0001961257221410051, + "learning_rate": 2.0479069861708674e-05, + "loss": 0.0, + "num_input_tokens_seen": 46904312, + "step": 80850 + }, + { + "epoch": 12.042746499851058, + "grad_norm": 0.0029161495622247458, + "learning_rate": 2.047587405152367e-05, + "loss": 0.0489, + "num_input_tokens_seen": 46907064, + "step": 80855 + }, + { + "epoch": 12.043491212392016, + "grad_norm": 63.09646987915039, + "learning_rate": 2.0472678317769168e-05, + "loss": 0.1377, + "num_input_tokens_seen": 46910008, + "step": 80860 + }, + { + "epoch": 12.044235924932975, + "grad_norm": 0.00049935607239604, + "learning_rate": 2.0469482660499167e-05, + "loss": 0.0, + "num_input_tokens_seen": 46912824, + "step": 80865 + }, + { + "epoch": 12.044980637473936, + "grad_norm": 1.0433294773101807, + "learning_rate": 2.0466287079767646e-05, + "loss": 0.0014, + "num_input_tokens_seen": 46915736, + "step": 80870 + }, + { + "epoch": 12.045725350014894, + "grad_norm": 0.0040723360143601894, + "learning_rate": 2.0463091575628608e-05, + "loss": 0.0, + "num_input_tokens_seen": 46918904, + "step": 80875 + }, + { + "epoch": 12.046470062555853, + "grad_norm": 0.0021719499491155148, + "learning_rate": 2.045989614813602e-05, + "loss": 0.0453, + "num_input_tokens_seen": 46921816, + "step": 80880 + }, + { + "epoch": 12.047214775096812, + "grad_norm": 0.0006548056844621897, + "learning_rate": 2.0456700797343867e-05, + "loss": 0.2741, + "num_input_tokens_seen": 46924504, + "step": 80885 + }, + { + "epoch": 12.047959487637772, + "grad_norm": 0.001513988128863275, + "learning_rate": 2.0453505523306147e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46927416, + "step": 80890 + }, + { + "epoch": 12.048704200178731, + "grad_norm": 0.0004609464667737484, + "learning_rate": 2.0450310326076823e-05, + "loss": 0.0, + "num_input_tokens_seen": 46930360, + "step": 80895 + }, + { + "epoch": 12.04944891271969, + "grad_norm": 0.00031575249158777297, + "learning_rate": 2.0447115205709887e-05, + "loss": 0.0264, + "num_input_tokens_seen": 46933496, + "step": 80900 + }, + { + "epoch": 12.050193625260649, + "grad_norm": 0.0052542961202561855, + "learning_rate": 2.0443920162259306e-05, + "loss": 0.0, + "num_input_tokens_seen": 46936568, + "step": 80905 + }, + { + "epoch": 12.05093833780161, + "grad_norm": 0.0008018077351152897, + "learning_rate": 2.044072519577907e-05, + "loss": 0.0, + "num_input_tokens_seen": 46939480, + "step": 80910 + }, + { + "epoch": 12.051683050342568, + "grad_norm": 0.0001828983804443851, + "learning_rate": 2.043753030632315e-05, + "loss": 0.0, + "num_input_tokens_seen": 46942168, + "step": 80915 + }, + { + "epoch": 12.052427762883527, + "grad_norm": 0.0033937315456569195, + "learning_rate": 2.0434335493945506e-05, + "loss": 0.0, + "num_input_tokens_seen": 46944952, + "step": 80920 + }, + { + "epoch": 12.053172475424486, + "grad_norm": 0.00024291164299938828, + "learning_rate": 2.043114075870013e-05, + "loss": 0.0194, + "num_input_tokens_seen": 46947640, + "step": 80925 + }, + { + "epoch": 12.053917187965446, + "grad_norm": 0.0006710469024255872, + "learning_rate": 2.042794610064099e-05, + "loss": 0.0, + "num_input_tokens_seen": 46950264, + "step": 80930 + }, + { + "epoch": 12.054661900506405, + "grad_norm": 0.0006917245918884873, + "learning_rate": 2.0424751519822054e-05, + "loss": 0.0, + "num_input_tokens_seen": 46952984, + "step": 80935 + }, + { + "epoch": 12.055406613047364, + "grad_norm": 7.571143942186609e-05, + "learning_rate": 2.0421557016297283e-05, + "loss": 0.0, + "num_input_tokens_seen": 46956120, + "step": 80940 + }, + { + "epoch": 12.056151325588322, + "grad_norm": 0.008455880917608738, + "learning_rate": 2.041836259012066e-05, + "loss": 0.0, + "num_input_tokens_seen": 46958840, + "step": 80945 + }, + { + "epoch": 12.056896038129283, + "grad_norm": 4.391441822052002, + "learning_rate": 2.0415168241346138e-05, + "loss": 0.0003, + "num_input_tokens_seen": 46961496, + "step": 80950 + }, + { + "epoch": 12.057640750670242, + "grad_norm": 0.006141925696283579, + "learning_rate": 2.0411973970027698e-05, + "loss": 0.0, + "num_input_tokens_seen": 46964536, + "step": 80955 + }, + { + "epoch": 12.0583854632112, + "grad_norm": 2.6466086637810804e-05, + "learning_rate": 2.0408779776219295e-05, + "loss": 0.0, + "num_input_tokens_seen": 46967192, + "step": 80960 + }, + { + "epoch": 12.05913017575216, + "grad_norm": 119.84623718261719, + "learning_rate": 2.0405585659974885e-05, + "loss": 0.0975, + "num_input_tokens_seen": 46970136, + "step": 80965 + }, + { + "epoch": 12.05987488829312, + "grad_norm": 0.018343500792980194, + "learning_rate": 2.0402391621348444e-05, + "loss": 0.0, + "num_input_tokens_seen": 46972984, + "step": 80970 + }, + { + "epoch": 12.060619600834078, + "grad_norm": 47.959747314453125, + "learning_rate": 2.039919766039391e-05, + "loss": 0.023, + "num_input_tokens_seen": 46976088, + "step": 80975 + }, + { + "epoch": 12.061364313375037, + "grad_norm": 0.007652681786566973, + "learning_rate": 2.0396003777165266e-05, + "loss": 0.0, + "num_input_tokens_seen": 46978680, + "step": 80980 + }, + { + "epoch": 12.062109025915996, + "grad_norm": 0.0015946872299537063, + "learning_rate": 2.0392809971716448e-05, + "loss": 0.0, + "num_input_tokens_seen": 46981656, + "step": 80985 + }, + { + "epoch": 12.062853738456955, + "grad_norm": 0.00011009623995050788, + "learning_rate": 2.0389616244101437e-05, + "loss": 0.0, + "num_input_tokens_seen": 46984440, + "step": 80990 + }, + { + "epoch": 12.063598450997915, + "grad_norm": 0.015356472693383694, + "learning_rate": 2.038642259437417e-05, + "loss": 0.0002, + "num_input_tokens_seen": 46987064, + "step": 80995 + }, + { + "epoch": 12.064343163538874, + "grad_norm": 1.2105846405029297, + "learning_rate": 2.03832290225886e-05, + "loss": 0.0005, + "num_input_tokens_seen": 46989624, + "step": 81000 + }, + { + "epoch": 12.065087876079833, + "grad_norm": 0.0009298065560869873, + "learning_rate": 2.0380035528798692e-05, + "loss": 0.0005, + "num_input_tokens_seen": 46992504, + "step": 81005 + }, + { + "epoch": 12.065832588620792, + "grad_norm": 0.0011140280403196812, + "learning_rate": 2.0376842113058372e-05, + "loss": 0.0, + "num_input_tokens_seen": 46995608, + "step": 81010 + }, + { + "epoch": 12.066577301161752, + "grad_norm": 0.0024589351378381252, + "learning_rate": 2.037364877542162e-05, + "loss": 0.0, + "num_input_tokens_seen": 46998648, + "step": 81015 + }, + { + "epoch": 12.06732201370271, + "grad_norm": 0.0022919250186532736, + "learning_rate": 2.037045551594236e-05, + "loss": 0.0, + "num_input_tokens_seen": 47001688, + "step": 81020 + }, + { + "epoch": 12.06806672624367, + "grad_norm": 0.0005854542250744998, + "learning_rate": 2.0367262334674556e-05, + "loss": 0.0, + "num_input_tokens_seen": 47004536, + "step": 81025 + }, + { + "epoch": 12.068811438784628, + "grad_norm": 0.0003962008049711585, + "learning_rate": 2.0364069231672143e-05, + "loss": 0.0, + "num_input_tokens_seen": 47007160, + "step": 81030 + }, + { + "epoch": 12.069556151325589, + "grad_norm": 2.5002509573823772e-05, + "learning_rate": 2.0360876206989073e-05, + "loss": 0.0565, + "num_input_tokens_seen": 47009720, + "step": 81035 + }, + { + "epoch": 12.070300863866548, + "grad_norm": 8.858632645569742e-05, + "learning_rate": 2.0357683260679285e-05, + "loss": 0.1254, + "num_input_tokens_seen": 47012504, + "step": 81040 + }, + { + "epoch": 12.071045576407506, + "grad_norm": 0.0001405977236572653, + "learning_rate": 2.035449039279671e-05, + "loss": 0.0, + "num_input_tokens_seen": 47015576, + "step": 81045 + }, + { + "epoch": 12.071790288948465, + "grad_norm": 0.0003406915639061481, + "learning_rate": 2.035129760339531e-05, + "loss": 0.0, + "num_input_tokens_seen": 47018488, + "step": 81050 + }, + { + "epoch": 12.072535001489426, + "grad_norm": 0.0013311132788658142, + "learning_rate": 2.0348104892528998e-05, + "loss": 0.0, + "num_input_tokens_seen": 47021336, + "step": 81055 + }, + { + "epoch": 12.073279714030384, + "grad_norm": 0.003612447762861848, + "learning_rate": 2.0344912260251742e-05, + "loss": 0.0159, + "num_input_tokens_seen": 47024472, + "step": 81060 + }, + { + "epoch": 12.074024426571343, + "grad_norm": 0.0004088792484253645, + "learning_rate": 2.034171970661745e-05, + "loss": 0.0, + "num_input_tokens_seen": 47027480, + "step": 81065 + }, + { + "epoch": 12.074769139112302, + "grad_norm": 0.000392322224797681, + "learning_rate": 2.0338527231680078e-05, + "loss": 0.175, + "num_input_tokens_seen": 47030392, + "step": 81070 + }, + { + "epoch": 12.075513851653263, + "grad_norm": 0.03682922571897507, + "learning_rate": 2.033533483549354e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47033176, + "step": 81075 + }, + { + "epoch": 12.076258564194221, + "grad_norm": 0.0005544442101381719, + "learning_rate": 2.033214251811179e-05, + "loss": 0.0, + "num_input_tokens_seen": 47036088, + "step": 81080 + }, + { + "epoch": 12.07700327673518, + "grad_norm": 0.011089673265814781, + "learning_rate": 2.0328950279588748e-05, + "loss": 0.0004, + "num_input_tokens_seen": 47038872, + "step": 81085 + }, + { + "epoch": 12.077747989276139, + "grad_norm": 0.030072340741753578, + "learning_rate": 2.0325758119978334e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47041752, + "step": 81090 + }, + { + "epoch": 12.0784927018171, + "grad_norm": 0.0011609324719756842, + "learning_rate": 2.0322566039334497e-05, + "loss": 0.0008, + "num_input_tokens_seen": 47044440, + "step": 81095 + }, + { + "epoch": 12.079237414358058, + "grad_norm": 0.0011817271588370204, + "learning_rate": 2.0319374037711143e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47047224, + "step": 81100 + }, + { + "epoch": 12.079982126899017, + "grad_norm": 0.0030894239898771048, + "learning_rate": 2.0316182115162218e-05, + "loss": 0.0, + "num_input_tokens_seen": 47050136, + "step": 81105 + }, + { + "epoch": 12.080726839439976, + "grad_norm": 0.00021320150699466467, + "learning_rate": 2.031299027174162e-05, + "loss": 0.0007, + "num_input_tokens_seen": 47053208, + "step": 81110 + }, + { + "epoch": 12.081471551980936, + "grad_norm": 0.023173300549387932, + "learning_rate": 2.03097985075033e-05, + "loss": 0.1719, + "num_input_tokens_seen": 47056408, + "step": 81115 + }, + { + "epoch": 12.082216264521895, + "grad_norm": 4.254095983924344e-05, + "learning_rate": 2.030660682250117e-05, + "loss": 0.0, + "num_input_tokens_seen": 47059224, + "step": 81120 + }, + { + "epoch": 12.082960977062854, + "grad_norm": 96.04226684570312, + "learning_rate": 2.0303415216789135e-05, + "loss": 0.1438, + "num_input_tokens_seen": 47062232, + "step": 81125 + }, + { + "epoch": 12.083705689603812, + "grad_norm": 0.0013294497039169073, + "learning_rate": 2.0300223690421135e-05, + "loss": 0.0, + "num_input_tokens_seen": 47065048, + "step": 81130 + }, + { + "epoch": 12.084450402144773, + "grad_norm": 0.008624028414487839, + "learning_rate": 2.029703224345108e-05, + "loss": 0.0, + "num_input_tokens_seen": 47067640, + "step": 81135 + }, + { + "epoch": 12.085195114685732, + "grad_norm": 7.976515917107463e-06, + "learning_rate": 2.0293840875932886e-05, + "loss": 0.0, + "num_input_tokens_seen": 47070648, + "step": 81140 + }, + { + "epoch": 12.08593982722669, + "grad_norm": 8.513972716173157e-05, + "learning_rate": 2.029064958792046e-05, + "loss": 0.0, + "num_input_tokens_seen": 47073336, + "step": 81145 + }, + { + "epoch": 12.08668453976765, + "grad_norm": 36.64088439941406, + "learning_rate": 2.0287458379467728e-05, + "loss": 0.0028, + "num_input_tokens_seen": 47075960, + "step": 81150 + }, + { + "epoch": 12.08742925230861, + "grad_norm": 0.008148081600666046, + "learning_rate": 2.028426725062859e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47078776, + "step": 81155 + }, + { + "epoch": 12.088173964849569, + "grad_norm": 2.5616842322051525e-05, + "learning_rate": 2.0281076201456977e-05, + "loss": 0.0005, + "num_input_tokens_seen": 47081688, + "step": 81160 + }, + { + "epoch": 12.088918677390527, + "grad_norm": 82.3563232421875, + "learning_rate": 2.0277885232006776e-05, + "loss": 0.0378, + "num_input_tokens_seen": 47084472, + "step": 81165 + }, + { + "epoch": 12.089663389931486, + "grad_norm": 0.0012020573485642672, + "learning_rate": 2.0274694342331907e-05, + "loss": 0.0, + "num_input_tokens_seen": 47087288, + "step": 81170 + }, + { + "epoch": 12.090408102472445, + "grad_norm": 0.0016853375127539039, + "learning_rate": 2.027150353248628e-05, + "loss": 0.0004, + "num_input_tokens_seen": 47090104, + "step": 81175 + }, + { + "epoch": 12.091152815013405, + "grad_norm": 0.00806089211255312, + "learning_rate": 2.026831280252378e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47092920, + "step": 81180 + }, + { + "epoch": 12.091897527554364, + "grad_norm": 0.00010374587873229757, + "learning_rate": 2.026512215249834e-05, + "loss": 0.1473, + "num_input_tokens_seen": 47095736, + "step": 81185 + }, + { + "epoch": 12.092642240095323, + "grad_norm": 0.00013455758744385093, + "learning_rate": 2.0261931582463844e-05, + "loss": 0.0, + "num_input_tokens_seen": 47098424, + "step": 81190 + }, + { + "epoch": 12.093386952636282, + "grad_norm": 0.005429577548056841, + "learning_rate": 2.0258741092474204e-05, + "loss": 0.0, + "num_input_tokens_seen": 47101272, + "step": 81195 + }, + { + "epoch": 12.094131665177242, + "grad_norm": 0.0023107146844267845, + "learning_rate": 2.0255550682583313e-05, + "loss": 0.002, + "num_input_tokens_seen": 47104088, + "step": 81200 + }, + { + "epoch": 12.094876377718201, + "grad_norm": 8.533470827387646e-05, + "learning_rate": 2.025236035284506e-05, + "loss": 0.1128, + "num_input_tokens_seen": 47106904, + "step": 81205 + }, + { + "epoch": 12.09562109025916, + "grad_norm": 0.0009586786618456244, + "learning_rate": 2.0249170103313365e-05, + "loss": 0.1688, + "num_input_tokens_seen": 47109944, + "step": 81210 + }, + { + "epoch": 12.096365802800118, + "grad_norm": 0.001497541437856853, + "learning_rate": 2.0245979934042104e-05, + "loss": 0.0082, + "num_input_tokens_seen": 47113176, + "step": 81215 + }, + { + "epoch": 12.097110515341079, + "grad_norm": 0.0005114649538882077, + "learning_rate": 2.0242789845085187e-05, + "loss": 0.0, + "num_input_tokens_seen": 47116088, + "step": 81220 + }, + { + "epoch": 12.097855227882038, + "grad_norm": 0.0030999344307929277, + "learning_rate": 2.0239599836496497e-05, + "loss": 0.0, + "num_input_tokens_seen": 47119000, + "step": 81225 + }, + { + "epoch": 12.098599940422996, + "grad_norm": 28.735137939453125, + "learning_rate": 2.0236409908329933e-05, + "loss": 0.0018, + "num_input_tokens_seen": 47121720, + "step": 81230 + }, + { + "epoch": 12.099344652963955, + "grad_norm": 0.00040178652852773666, + "learning_rate": 2.0233220060639373e-05, + "loss": 0.2469, + "num_input_tokens_seen": 47124440, + "step": 81235 + }, + { + "epoch": 12.100089365504916, + "grad_norm": 0.14145299792289734, + "learning_rate": 2.023003029347873e-05, + "loss": 0.0004, + "num_input_tokens_seen": 47127480, + "step": 81240 + }, + { + "epoch": 12.100834078045875, + "grad_norm": 0.0008182511664927006, + "learning_rate": 2.0226840606901872e-05, + "loss": 0.1595, + "num_input_tokens_seen": 47130360, + "step": 81245 + }, + { + "epoch": 12.101578790586833, + "grad_norm": 0.0007832840783521533, + "learning_rate": 2.022365100096268e-05, + "loss": 0.0, + "num_input_tokens_seen": 47133592, + "step": 81250 + }, + { + "epoch": 12.102323503127792, + "grad_norm": 0.0015755625208839774, + "learning_rate": 2.0220461475715063e-05, + "loss": 0.0922, + "num_input_tokens_seen": 47136376, + "step": 81255 + }, + { + "epoch": 12.103068215668753, + "grad_norm": 224.05615234375, + "learning_rate": 2.0217272031212887e-05, + "loss": 0.1035, + "num_input_tokens_seen": 47139288, + "step": 81260 + }, + { + "epoch": 12.103812928209711, + "grad_norm": 0.002107193460687995, + "learning_rate": 2.021408266751004e-05, + "loss": 0.0, + "num_input_tokens_seen": 47142072, + "step": 81265 + }, + { + "epoch": 12.10455764075067, + "grad_norm": 64.9444580078125, + "learning_rate": 2.0210893384660396e-05, + "loss": 0.2632, + "num_input_tokens_seen": 47145304, + "step": 81270 + }, + { + "epoch": 12.105302353291629, + "grad_norm": 100.54737091064453, + "learning_rate": 2.0207704182717852e-05, + "loss": 0.3205, + "num_input_tokens_seen": 47148280, + "step": 81275 + }, + { + "epoch": 12.10604706583259, + "grad_norm": 0.0016006303485482931, + "learning_rate": 2.0204515061736275e-05, + "loss": 0.0, + "num_input_tokens_seen": 47151160, + "step": 81280 + }, + { + "epoch": 12.106791778373548, + "grad_norm": 0.0015056147240102291, + "learning_rate": 2.0201326021769526e-05, + "loss": 0.0, + "num_input_tokens_seen": 47153880, + "step": 81285 + }, + { + "epoch": 12.107536490914507, + "grad_norm": 1.3095635175704956, + "learning_rate": 2.0198137062871512e-05, + "loss": 0.075, + "num_input_tokens_seen": 47156760, + "step": 81290 + }, + { + "epoch": 12.108281203455466, + "grad_norm": 0.0030292829032987356, + "learning_rate": 2.0194948185096086e-05, + "loss": 0.0018, + "num_input_tokens_seen": 47159640, + "step": 81295 + }, + { + "epoch": 12.109025915996426, + "grad_norm": 0.001201891922391951, + "learning_rate": 2.019175938849713e-05, + "loss": 0.0241, + "num_input_tokens_seen": 47162616, + "step": 81300 + }, + { + "epoch": 12.109770628537385, + "grad_norm": 0.0011797170154750347, + "learning_rate": 2.0188570673128504e-05, + "loss": 0.0064, + "num_input_tokens_seen": 47165368, + "step": 81305 + }, + { + "epoch": 12.110515341078344, + "grad_norm": 0.009866318665444851, + "learning_rate": 2.0185382039044094e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47168504, + "step": 81310 + }, + { + "epoch": 12.111260053619302, + "grad_norm": 15.735454559326172, + "learning_rate": 2.0182193486297755e-05, + "loss": 0.133, + "num_input_tokens_seen": 47171288, + "step": 81315 + }, + { + "epoch": 12.112004766160263, + "grad_norm": 0.010919748805463314, + "learning_rate": 2.017900501494337e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47173848, + "step": 81320 + }, + { + "epoch": 12.112749478701222, + "grad_norm": 0.0011421589879319072, + "learning_rate": 2.0175816625034795e-05, + "loss": 0.0068, + "num_input_tokens_seen": 47176664, + "step": 81325 + }, + { + "epoch": 12.11349419124218, + "grad_norm": 0.0006064675399102271, + "learning_rate": 2.0172628316625887e-05, + "loss": 0.0, + "num_input_tokens_seen": 47179640, + "step": 81330 + }, + { + "epoch": 12.11423890378314, + "grad_norm": 0.0035438942722976208, + "learning_rate": 2.0169440089770523e-05, + "loss": 0.0, + "num_input_tokens_seen": 47182456, + "step": 81335 + }, + { + "epoch": 12.114983616324098, + "grad_norm": 96.32376861572266, + "learning_rate": 2.0166251944522553e-05, + "loss": 0.126, + "num_input_tokens_seen": 47185400, + "step": 81340 + }, + { + "epoch": 12.115728328865059, + "grad_norm": 0.0038959146477282047, + "learning_rate": 2.016306388093585e-05, + "loss": 0.0, + "num_input_tokens_seen": 47188504, + "step": 81345 + }, + { + "epoch": 12.116473041406017, + "grad_norm": 13.883657455444336, + "learning_rate": 2.0159875899064258e-05, + "loss": 0.2106, + "num_input_tokens_seen": 47191448, + "step": 81350 + }, + { + "epoch": 12.117217753946976, + "grad_norm": 0.0025359895080327988, + "learning_rate": 2.0156687998961653e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47194264, + "step": 81355 + }, + { + "epoch": 12.117962466487935, + "grad_norm": 0.0019819026347249746, + "learning_rate": 2.015350018068188e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47197016, + "step": 81360 + }, + { + "epoch": 12.118707179028895, + "grad_norm": 0.0033256239257752895, + "learning_rate": 2.0150312444278795e-05, + "loss": 0.0772, + "num_input_tokens_seen": 47199640, + "step": 81365 + }, + { + "epoch": 12.119451891569854, + "grad_norm": 0.36514654755592346, + "learning_rate": 2.0147124789806254e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47202488, + "step": 81370 + }, + { + "epoch": 12.120196604110813, + "grad_norm": 0.0019160283263772726, + "learning_rate": 2.01439372173181e-05, + "loss": 0.0017, + "num_input_tokens_seen": 47205432, + "step": 81375 + }, + { + "epoch": 12.120941316651772, + "grad_norm": 0.00030061023426242173, + "learning_rate": 2.0140749726868197e-05, + "loss": 0.0, + "num_input_tokens_seen": 47208376, + "step": 81380 + }, + { + "epoch": 12.121686029192732, + "grad_norm": 0.0075545646250247955, + "learning_rate": 2.013756231851038e-05, + "loss": 0.0, + "num_input_tokens_seen": 47211352, + "step": 81385 + }, + { + "epoch": 12.122430741733691, + "grad_norm": 0.0025887044612318277, + "learning_rate": 2.0134374992298515e-05, + "loss": 0.0, + "num_input_tokens_seen": 47214104, + "step": 81390 + }, + { + "epoch": 12.12317545427465, + "grad_norm": 0.00042969893547706306, + "learning_rate": 2.0131187748286438e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47216728, + "step": 81395 + }, + { + "epoch": 12.123920166815608, + "grad_norm": 0.031096143648028374, + "learning_rate": 2.0128000586528e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47219576, + "step": 81400 + }, + { + "epoch": 12.124664879356569, + "grad_norm": 0.0005172020755708218, + "learning_rate": 2.012481350707704e-05, + "loss": 0.0, + "num_input_tokens_seen": 47222744, + "step": 81405 + }, + { + "epoch": 12.125409591897528, + "grad_norm": 0.009543060325086117, + "learning_rate": 2.012162650998739e-05, + "loss": 0.1917, + "num_input_tokens_seen": 47225816, + "step": 81410 + }, + { + "epoch": 12.126154304438487, + "grad_norm": 0.0009801698615774512, + "learning_rate": 2.011843959531291e-05, + "loss": 0.0018, + "num_input_tokens_seen": 47228600, + "step": 81415 + }, + { + "epoch": 12.126899016979445, + "grad_norm": 0.0006679737125523388, + "learning_rate": 2.0115252763107424e-05, + "loss": 0.0225, + "num_input_tokens_seen": 47231672, + "step": 81420 + }, + { + "epoch": 12.127643729520406, + "grad_norm": 0.14087224006652832, + "learning_rate": 2.0112066013424785e-05, + "loss": 0.0647, + "num_input_tokens_seen": 47234392, + "step": 81425 + }, + { + "epoch": 12.128388442061365, + "grad_norm": 0.01634567230939865, + "learning_rate": 2.010887934631882e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47237624, + "step": 81430 + }, + { + "epoch": 12.129133154602323, + "grad_norm": 0.04691634699702263, + "learning_rate": 2.0105692761843375e-05, + "loss": 0.0019, + "num_input_tokens_seen": 47240600, + "step": 81435 + }, + { + "epoch": 12.129877867143282, + "grad_norm": 0.005830127280205488, + "learning_rate": 2.0102506260052273e-05, + "loss": 0.272, + "num_input_tokens_seen": 47243448, + "step": 81440 + }, + { + "epoch": 12.130622579684243, + "grad_norm": 0.00020420519285835326, + "learning_rate": 2.0099319840999343e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47246456, + "step": 81445 + }, + { + "epoch": 12.131367292225201, + "grad_norm": 0.0025518210604786873, + "learning_rate": 2.0096133504738428e-05, + "loss": 0.0, + "num_input_tokens_seen": 47249496, + "step": 81450 + }, + { + "epoch": 12.13211200476616, + "grad_norm": 0.13962902128696442, + "learning_rate": 2.009294725132335e-05, + "loss": 0.0068, + "num_input_tokens_seen": 47252312, + "step": 81455 + }, + { + "epoch": 12.132856717307119, + "grad_norm": 0.0062002381309866905, + "learning_rate": 2.0089761080807945e-05, + "loss": 0.0, + "num_input_tokens_seen": 47255224, + "step": 81460 + }, + { + "epoch": 12.13360142984808, + "grad_norm": 0.0064452593214809895, + "learning_rate": 2.0086574993246034e-05, + "loss": 0.0, + "num_input_tokens_seen": 47257944, + "step": 81465 + }, + { + "epoch": 12.134346142389038, + "grad_norm": 0.0019914372824132442, + "learning_rate": 2.008338898869145e-05, + "loss": 0.1785, + "num_input_tokens_seen": 47261080, + "step": 81470 + }, + { + "epoch": 12.135090854929997, + "grad_norm": 0.0007388035301119089, + "learning_rate": 2.0080203067198003e-05, + "loss": 0.1973, + "num_input_tokens_seen": 47263864, + "step": 81475 + }, + { + "epoch": 12.135835567470956, + "grad_norm": 0.08806592226028442, + "learning_rate": 2.0077017228819534e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47267064, + "step": 81480 + }, + { + "epoch": 12.136580280011916, + "grad_norm": 0.0003703661495819688, + "learning_rate": 2.0073831473609855e-05, + "loss": 0.0, + "num_input_tokens_seen": 47269976, + "step": 81485 + }, + { + "epoch": 12.137324992552875, + "grad_norm": 46.71285629272461, + "learning_rate": 2.007064580162278e-05, + "loss": 0.1674, + "num_input_tokens_seen": 47272952, + "step": 81490 + }, + { + "epoch": 12.138069705093834, + "grad_norm": 0.006757658440619707, + "learning_rate": 2.0067460212912137e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47275736, + "step": 81495 + }, + { + "epoch": 12.138814417634793, + "grad_norm": 0.0007945264806039631, + "learning_rate": 2.006427470753174e-05, + "loss": 0.2563, + "num_input_tokens_seen": 47278616, + "step": 81500 + }, + { + "epoch": 12.139559130175751, + "grad_norm": 0.003611049149185419, + "learning_rate": 2.006108928553541e-05, + "loss": 0.0119, + "num_input_tokens_seen": 47281368, + "step": 81505 + }, + { + "epoch": 12.140303842716712, + "grad_norm": 0.029068170115351677, + "learning_rate": 2.0057903946976944e-05, + "loss": 0.0, + "num_input_tokens_seen": 47284248, + "step": 81510 + }, + { + "epoch": 12.14104855525767, + "grad_norm": 0.0004926852998323739, + "learning_rate": 2.0054718691910178e-05, + "loss": 0.0, + "num_input_tokens_seen": 47287096, + "step": 81515 + }, + { + "epoch": 12.14179326779863, + "grad_norm": 0.0011912040645256639, + "learning_rate": 2.0051533520388918e-05, + "loss": 0.0, + "num_input_tokens_seen": 47290008, + "step": 81520 + }, + { + "epoch": 12.142537980339588, + "grad_norm": 0.49329158663749695, + "learning_rate": 2.0048348432466963e-05, + "loss": 0.0006, + "num_input_tokens_seen": 47292728, + "step": 81525 + }, + { + "epoch": 12.143282692880549, + "grad_norm": 204.734375, + "learning_rate": 2.004516342819813e-05, + "loss": 0.1615, + "num_input_tokens_seen": 47295416, + "step": 81530 + }, + { + "epoch": 12.144027405421507, + "grad_norm": 0.37005311250686646, + "learning_rate": 2.0041978507636222e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47298136, + "step": 81535 + }, + { + "epoch": 12.144772117962466, + "grad_norm": 0.00010819497401826084, + "learning_rate": 2.0038793670835054e-05, + "loss": 0.0, + "num_input_tokens_seen": 47300824, + "step": 81540 + }, + { + "epoch": 12.145516830503425, + "grad_norm": 16.49363136291504, + "learning_rate": 2.0035608917848415e-05, + "loss": 0.0023, + "num_input_tokens_seen": 47303480, + "step": 81545 + }, + { + "epoch": 12.146261543044385, + "grad_norm": 0.001241065445356071, + "learning_rate": 2.0032424248730124e-05, + "loss": 0.2367, + "num_input_tokens_seen": 47306360, + "step": 81550 + }, + { + "epoch": 12.147006255585344, + "grad_norm": 10.416339874267578, + "learning_rate": 2.0029239663533977e-05, + "loss": 0.1798, + "num_input_tokens_seen": 47309304, + "step": 81555 + }, + { + "epoch": 12.147750968126303, + "grad_norm": 0.006478317081928253, + "learning_rate": 2.0026055162313778e-05, + "loss": 0.0, + "num_input_tokens_seen": 47312088, + "step": 81560 + }, + { + "epoch": 12.148495680667262, + "grad_norm": 0.0007751924567855895, + "learning_rate": 2.002287074512332e-05, + "loss": 0.0, + "num_input_tokens_seen": 47314840, + "step": 81565 + }, + { + "epoch": 12.149240393208222, + "grad_norm": 0.0014072316698729992, + "learning_rate": 2.001968641201639e-05, + "loss": 0.0241, + "num_input_tokens_seen": 47317528, + "step": 81570 + }, + { + "epoch": 12.149985105749181, + "grad_norm": 0.0001817514275899157, + "learning_rate": 2.0016502163046815e-05, + "loss": 0.0029, + "num_input_tokens_seen": 47320312, + "step": 81575 + }, + { + "epoch": 12.15072981829014, + "grad_norm": 0.002410560380667448, + "learning_rate": 2.0013317998268352e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47323384, + "step": 81580 + }, + { + "epoch": 12.151474530831099, + "grad_norm": 0.0011558720143511891, + "learning_rate": 2.0010133917734825e-05, + "loss": 0.0025, + "num_input_tokens_seen": 47326648, + "step": 81585 + }, + { + "epoch": 12.152219243372059, + "grad_norm": 0.00992498081177473, + "learning_rate": 2.0006949921500012e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47329560, + "step": 81590 + }, + { + "epoch": 12.152963955913018, + "grad_norm": 0.00031138354097492993, + "learning_rate": 2.0003766009617707e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47332248, + "step": 81595 + }, + { + "epoch": 12.153708668453977, + "grad_norm": 2.236753463745117, + "learning_rate": 2.00005821821417e-05, + "loss": 0.2551, + "num_input_tokens_seen": 47335800, + "step": 81600 + }, + { + "epoch": 12.154453380994935, + "grad_norm": 0.0008039356907829642, + "learning_rate": 1.9997398439125763e-05, + "loss": 0.101, + "num_input_tokens_seen": 47338680, + "step": 81605 + }, + { + "epoch": 12.155198093535896, + "grad_norm": 0.010003768838942051, + "learning_rate": 1.999421478062371e-05, + "loss": 0.0776, + "num_input_tokens_seen": 47341688, + "step": 81610 + }, + { + "epoch": 12.155942806076855, + "grad_norm": 0.0008027907460927963, + "learning_rate": 1.9991031206689294e-05, + "loss": 0.1969, + "num_input_tokens_seen": 47344440, + "step": 81615 + }, + { + "epoch": 12.156687518617813, + "grad_norm": 0.04916774109005928, + "learning_rate": 1.998784771737633e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47347800, + "step": 81620 + }, + { + "epoch": 12.157432231158772, + "grad_norm": 0.004999978933483362, + "learning_rate": 1.9984664312738578e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47350648, + "step": 81625 + }, + { + "epoch": 12.158176943699733, + "grad_norm": 0.01053868792951107, + "learning_rate": 1.9981480992829832e-05, + "loss": 0.0027, + "num_input_tokens_seen": 47353496, + "step": 81630 + }, + { + "epoch": 12.158921656240691, + "grad_norm": 126.10496520996094, + "learning_rate": 1.997829775770385e-05, + "loss": 0.1222, + "num_input_tokens_seen": 47356408, + "step": 81635 + }, + { + "epoch": 12.15966636878165, + "grad_norm": 0.00042286200914531946, + "learning_rate": 1.9975114607414434e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47359480, + "step": 81640 + }, + { + "epoch": 12.160411081322609, + "grad_norm": 0.006191960535943508, + "learning_rate": 1.9971931542015355e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47362392, + "step": 81645 + }, + { + "epoch": 12.16115579386357, + "grad_norm": 0.004797452595084906, + "learning_rate": 1.9968748561560366e-05, + "loss": 0.0, + "num_input_tokens_seen": 47365432, + "step": 81650 + }, + { + "epoch": 12.161900506404528, + "grad_norm": 0.00021163839846849442, + "learning_rate": 1.996556566610327e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47368504, + "step": 81655 + }, + { + "epoch": 12.162645218945487, + "grad_norm": 0.018916185945272446, + "learning_rate": 1.996238285569782e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47371352, + "step": 81660 + }, + { + "epoch": 12.163389931486446, + "grad_norm": 0.00031808781204745173, + "learning_rate": 1.9959200130397795e-05, + "loss": 0.0, + "num_input_tokens_seen": 47374232, + "step": 81665 + }, + { + "epoch": 12.164134644027406, + "grad_norm": 0.0001574615598656237, + "learning_rate": 1.995601749025695e-05, + "loss": 0.0014, + "num_input_tokens_seen": 47376888, + "step": 81670 + }, + { + "epoch": 12.164879356568365, + "grad_norm": 0.009135515429079533, + "learning_rate": 1.9952834935329077e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47379736, + "step": 81675 + }, + { + "epoch": 12.165624069109324, + "grad_norm": 0.016627926379442215, + "learning_rate": 1.9949652465667915e-05, + "loss": 0.0052, + "num_input_tokens_seen": 47382744, + "step": 81680 + }, + { + "epoch": 12.166368781650283, + "grad_norm": 8.552456855773926, + "learning_rate": 1.9946470081327253e-05, + "loss": 0.0008, + "num_input_tokens_seen": 47385624, + "step": 81685 + }, + { + "epoch": 12.167113494191241, + "grad_norm": 0.00020362684153951705, + "learning_rate": 1.9943287782360844e-05, + "loss": 0.0, + "num_input_tokens_seen": 47388696, + "step": 81690 + }, + { + "epoch": 12.167858206732202, + "grad_norm": 0.009794950485229492, + "learning_rate": 1.9940105568822437e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47391256, + "step": 81695 + }, + { + "epoch": 12.16860291927316, + "grad_norm": 0.0007650390616618097, + "learning_rate": 1.9936923440765813e-05, + "loss": 0.1252, + "num_input_tokens_seen": 47394200, + "step": 81700 + }, + { + "epoch": 12.16934763181412, + "grad_norm": 0.005103296134620905, + "learning_rate": 1.9933741398244714e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47397048, + "step": 81705 + }, + { + "epoch": 12.170092344355078, + "grad_norm": 0.004222202580422163, + "learning_rate": 1.9930559441312913e-05, + "loss": 0.0752, + "num_input_tokens_seen": 47399768, + "step": 81710 + }, + { + "epoch": 12.170837056896039, + "grad_norm": 0.0018113235710188746, + "learning_rate": 1.9927377570024146e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47402392, + "step": 81715 + }, + { + "epoch": 12.171581769436997, + "grad_norm": 3.57122802734375, + "learning_rate": 1.9924195784432192e-05, + "loss": 0.0016, + "num_input_tokens_seen": 47405496, + "step": 81720 + }, + { + "epoch": 12.172326481977956, + "grad_norm": 0.00020351496641524136, + "learning_rate": 1.992101408459079e-05, + "loss": 0.0, + "num_input_tokens_seen": 47408376, + "step": 81725 + }, + { + "epoch": 12.173071194518915, + "grad_norm": 0.0005755664897151291, + "learning_rate": 1.9917832470553692e-05, + "loss": 0.0, + "num_input_tokens_seen": 47411160, + "step": 81730 + }, + { + "epoch": 12.173815907059875, + "grad_norm": 0.002199030015617609, + "learning_rate": 1.9914650942374648e-05, + "loss": 0.2846, + "num_input_tokens_seen": 47414360, + "step": 81735 + }, + { + "epoch": 12.174560619600834, + "grad_norm": 0.0003353858774062246, + "learning_rate": 1.9911469500107398e-05, + "loss": 0.0, + "num_input_tokens_seen": 47417112, + "step": 81740 + }, + { + "epoch": 12.175305332141793, + "grad_norm": 0.00018019623530562967, + "learning_rate": 1.9908288143805714e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47420120, + "step": 81745 + }, + { + "epoch": 12.176050044682752, + "grad_norm": 230.57437133789062, + "learning_rate": 1.9905106873523316e-05, + "loss": 0.0561, + "num_input_tokens_seen": 47423000, + "step": 81750 + }, + { + "epoch": 12.176794757223712, + "grad_norm": 0.0021973876282572746, + "learning_rate": 1.9901925689313967e-05, + "loss": 0.1209, + "num_input_tokens_seen": 47425880, + "step": 81755 + }, + { + "epoch": 12.177539469764671, + "grad_norm": 0.01579725369811058, + "learning_rate": 1.9898744591231396e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47429080, + "step": 81760 + }, + { + "epoch": 12.17828418230563, + "grad_norm": 0.002584967529401183, + "learning_rate": 1.989556357932936e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47432248, + "step": 81765 + }, + { + "epoch": 12.179028894846589, + "grad_norm": 0.00018503209867049009, + "learning_rate": 1.9892382653661584e-05, + "loss": 0.0, + "num_input_tokens_seen": 47434872, + "step": 81770 + }, + { + "epoch": 12.179773607387549, + "grad_norm": 0.0026881012599915266, + "learning_rate": 1.9889201814281804e-05, + "loss": 0.0, + "num_input_tokens_seen": 47437624, + "step": 81775 + }, + { + "epoch": 12.180518319928508, + "grad_norm": 87.99720764160156, + "learning_rate": 1.9886021061243775e-05, + "loss": 0.04, + "num_input_tokens_seen": 47440376, + "step": 81780 + }, + { + "epoch": 12.181263032469467, + "grad_norm": 17.437440872192383, + "learning_rate": 1.9882840394601213e-05, + "loss": 0.3688, + "num_input_tokens_seen": 47443672, + "step": 81785 + }, + { + "epoch": 12.182007745010425, + "grad_norm": 17.459392547607422, + "learning_rate": 1.987965981440787e-05, + "loss": 0.0506, + "num_input_tokens_seen": 47446584, + "step": 81790 + }, + { + "epoch": 12.182752457551386, + "grad_norm": 0.007444618735462427, + "learning_rate": 1.987647932071747e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47449496, + "step": 81795 + }, + { + "epoch": 12.183497170092345, + "grad_norm": 0.005053847562521696, + "learning_rate": 1.9873298913583746e-05, + "loss": 0.0008, + "num_input_tokens_seen": 47452344, + "step": 81800 + }, + { + "epoch": 12.184241882633303, + "grad_norm": 0.00035785234649665654, + "learning_rate": 1.987011859306043e-05, + "loss": 0.0207, + "num_input_tokens_seen": 47455192, + "step": 81805 + }, + { + "epoch": 12.184986595174262, + "grad_norm": 0.012669106014072895, + "learning_rate": 1.986693835920123e-05, + "loss": 0.0742, + "num_input_tokens_seen": 47458072, + "step": 81810 + }, + { + "epoch": 12.185731307715223, + "grad_norm": 0.004027973860502243, + "learning_rate": 1.9863758212059902e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47460888, + "step": 81815 + }, + { + "epoch": 12.186476020256181, + "grad_norm": 0.006396500393748283, + "learning_rate": 1.9860578151690154e-05, + "loss": 0.0052, + "num_input_tokens_seen": 47463480, + "step": 81820 + }, + { + "epoch": 12.18722073279714, + "grad_norm": 0.0005127727054059505, + "learning_rate": 1.9857398178145718e-05, + "loss": 0.0102, + "num_input_tokens_seen": 47466648, + "step": 81825 + }, + { + "epoch": 12.187965445338099, + "grad_norm": 18.14069366455078, + "learning_rate": 1.985421829148031e-05, + "loss": 0.0769, + "num_input_tokens_seen": 47469752, + "step": 81830 + }, + { + "epoch": 12.18871015787906, + "grad_norm": 0.0021714267786592245, + "learning_rate": 1.985103849174766e-05, + "loss": 0.0, + "num_input_tokens_seen": 47472792, + "step": 81835 + }, + { + "epoch": 12.189454870420018, + "grad_norm": 0.02478507161140442, + "learning_rate": 1.984785877900147e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47475384, + "step": 81840 + }, + { + "epoch": 12.190199582960977, + "grad_norm": 2.6629157066345215, + "learning_rate": 1.984467915329548e-05, + "loss": 0.1816, + "num_input_tokens_seen": 47478296, + "step": 81845 + }, + { + "epoch": 12.190944295501936, + "grad_norm": 0.0009576304000802338, + "learning_rate": 1.9841499614683394e-05, + "loss": 0.0023, + "num_input_tokens_seen": 47481368, + "step": 81850 + }, + { + "epoch": 12.191689008042895, + "grad_norm": 0.0004316676640883088, + "learning_rate": 1.9838320163218927e-05, + "loss": 0.0, + "num_input_tokens_seen": 47484248, + "step": 81855 + }, + { + "epoch": 12.192433720583855, + "grad_norm": 0.03419219329953194, + "learning_rate": 1.98351407989558e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47487544, + "step": 81860 + }, + { + "epoch": 12.193178433124814, + "grad_norm": 0.0003325477009639144, + "learning_rate": 1.983196152194771e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47490424, + "step": 81865 + }, + { + "epoch": 12.193923145665773, + "grad_norm": 0.0008438165532425046, + "learning_rate": 1.9828782332248385e-05, + "loss": 0.0023, + "num_input_tokens_seen": 47493304, + "step": 81870 + }, + { + "epoch": 12.194667858206731, + "grad_norm": 0.0026887415442615747, + "learning_rate": 1.982560322991152e-05, + "loss": 0.0008, + "num_input_tokens_seen": 47496184, + "step": 81875 + }, + { + "epoch": 12.195412570747692, + "grad_norm": 0.0008334979647770524, + "learning_rate": 1.9822424214990837e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47498840, + "step": 81880 + }, + { + "epoch": 12.19615728328865, + "grad_norm": 8.67209309944883e-05, + "learning_rate": 1.9819245287540036e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47501880, + "step": 81885 + }, + { + "epoch": 12.19690199582961, + "grad_norm": 0.0011045114370062947, + "learning_rate": 1.9816066447612815e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47504728, + "step": 81890 + }, + { + "epoch": 12.197646708370568, + "grad_norm": 0.10729437321424484, + "learning_rate": 1.9812887695262887e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47507640, + "step": 81895 + }, + { + "epoch": 12.198391420911529, + "grad_norm": 0.005676631815731525, + "learning_rate": 1.980970903054394e-05, + "loss": 0.0, + "num_input_tokens_seen": 47510328, + "step": 81900 + }, + { + "epoch": 12.199136133452487, + "grad_norm": 0.0074171032756567, + "learning_rate": 1.9806530453509693e-05, + "loss": 0.0, + "num_input_tokens_seen": 47513016, + "step": 81905 + }, + { + "epoch": 12.199880845993446, + "grad_norm": 0.10931623727083206, + "learning_rate": 1.9803351964213827e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47516024, + "step": 81910 + }, + { + "epoch": 12.200625558534405, + "grad_norm": 0.0006878420826978981, + "learning_rate": 1.9800173562710055e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47518776, + "step": 81915 + }, + { + "epoch": 12.201370271075366, + "grad_norm": 0.0018711310112848878, + "learning_rate": 1.9796995249052064e-05, + "loss": 0.0, + "num_input_tokens_seen": 47521592, + "step": 81920 + }, + { + "epoch": 12.202114983616324, + "grad_norm": 0.001978443469852209, + "learning_rate": 1.9793817023293555e-05, + "loss": 0.1377, + "num_input_tokens_seen": 47524632, + "step": 81925 + }, + { + "epoch": 12.202859696157283, + "grad_norm": 21.134065628051758, + "learning_rate": 1.9790638885488216e-05, + "loss": 0.1251, + "num_input_tokens_seen": 47527768, + "step": 81930 + }, + { + "epoch": 12.203604408698242, + "grad_norm": 0.15208443999290466, + "learning_rate": 1.9787460835689726e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47530616, + "step": 81935 + }, + { + "epoch": 12.204349121239202, + "grad_norm": 0.18584147095680237, + "learning_rate": 1.9784282873951797e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47533976, + "step": 81940 + }, + { + "epoch": 12.205093833780161, + "grad_norm": 0.0010405645007267594, + "learning_rate": 1.97811050003281e-05, + "loss": 0.0009, + "num_input_tokens_seen": 47536792, + "step": 81945 + }, + { + "epoch": 12.20583854632112, + "grad_norm": 0.7647454142570496, + "learning_rate": 1.977792721487234e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47539768, + "step": 81950 + }, + { + "epoch": 12.206583258862079, + "grad_norm": 5.1403661927906796e-05, + "learning_rate": 1.9774749517638188e-05, + "loss": 0.0017, + "num_input_tokens_seen": 47542392, + "step": 81955 + }, + { + "epoch": 12.20732797140304, + "grad_norm": 4.138384974794462e-05, + "learning_rate": 1.9771571908679337e-05, + "loss": 0.0, + "num_input_tokens_seen": 47545304, + "step": 81960 + }, + { + "epoch": 12.208072683943998, + "grad_norm": 0.0015458118868991733, + "learning_rate": 1.976839438804946e-05, + "loss": 0.0, + "num_input_tokens_seen": 47548088, + "step": 81965 + }, + { + "epoch": 12.208817396484957, + "grad_norm": 27.184629440307617, + "learning_rate": 1.976521695580224e-05, + "loss": 0.0883, + "num_input_tokens_seen": 47551032, + "step": 81970 + }, + { + "epoch": 12.209562109025915, + "grad_norm": 0.01575857773423195, + "learning_rate": 1.9762039611991365e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47553848, + "step": 81975 + }, + { + "epoch": 12.210306821566876, + "grad_norm": 0.002561155939474702, + "learning_rate": 1.9758862356670498e-05, + "loss": 0.0, + "num_input_tokens_seen": 47556728, + "step": 81980 + }, + { + "epoch": 12.211051534107835, + "grad_norm": 0.0008991347276605666, + "learning_rate": 1.9755685189893332e-05, + "loss": 0.1266, + "num_input_tokens_seen": 47559640, + "step": 81985 + }, + { + "epoch": 12.211796246648793, + "grad_norm": 0.0006194633315317333, + "learning_rate": 1.9752508111713532e-05, + "loss": 0.0, + "num_input_tokens_seen": 47562712, + "step": 81990 + }, + { + "epoch": 12.212540959189752, + "grad_norm": 0.00038666126783937216, + "learning_rate": 1.974933112218478e-05, + "loss": 0.0005, + "num_input_tokens_seen": 47565720, + "step": 81995 + }, + { + "epoch": 12.213285671730713, + "grad_norm": 4.7380266189575195, + "learning_rate": 1.9746154221360732e-05, + "loss": 0.0388, + "num_input_tokens_seen": 47568600, + "step": 82000 + }, + { + "epoch": 12.214030384271672, + "grad_norm": 0.00170428107958287, + "learning_rate": 1.9742977409295076e-05, + "loss": 0.0, + "num_input_tokens_seen": 47571448, + "step": 82005 + }, + { + "epoch": 12.21477509681263, + "grad_norm": 0.00011192975216545165, + "learning_rate": 1.9739800686041478e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47574328, + "step": 82010 + }, + { + "epoch": 12.215519809353589, + "grad_norm": 0.00010547599231358618, + "learning_rate": 1.973662405165359e-05, + "loss": 0.005, + "num_input_tokens_seen": 47577336, + "step": 82015 + }, + { + "epoch": 12.216264521894548, + "grad_norm": 0.0002240089961560443, + "learning_rate": 1.9733447506185095e-05, + "loss": 0.0, + "num_input_tokens_seen": 47580152, + "step": 82020 + }, + { + "epoch": 12.217009234435508, + "grad_norm": 0.003818856319412589, + "learning_rate": 1.973027104968965e-05, + "loss": 0.0058, + "num_input_tokens_seen": 47583096, + "step": 82025 + }, + { + "epoch": 12.217753946976467, + "grad_norm": 10.653895378112793, + "learning_rate": 1.9727094682220925e-05, + "loss": 0.0678, + "num_input_tokens_seen": 47585976, + "step": 82030 + }, + { + "epoch": 12.218498659517426, + "grad_norm": 0.003476529847830534, + "learning_rate": 1.9723918403832565e-05, + "loss": 0.1036, + "num_input_tokens_seen": 47588664, + "step": 82035 + }, + { + "epoch": 12.219243372058385, + "grad_norm": 0.0008422223036177456, + "learning_rate": 1.9720742214578254e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47591608, + "step": 82040 + }, + { + "epoch": 12.219988084599345, + "grad_norm": 0.12080670893192291, + "learning_rate": 1.9717566114511634e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47595512, + "step": 82045 + }, + { + "epoch": 12.220732797140304, + "grad_norm": 0.0001534744951641187, + "learning_rate": 1.9714390103686355e-05, + "loss": 0.0427, + "num_input_tokens_seen": 47598328, + "step": 82050 + }, + { + "epoch": 12.221477509681263, + "grad_norm": 0.0029741968028247356, + "learning_rate": 1.9711214182156096e-05, + "loss": 0.0734, + "num_input_tokens_seen": 47601304, + "step": 82055 + }, + { + "epoch": 12.222222222222221, + "grad_norm": 0.00043119871406815946, + "learning_rate": 1.9708038349974494e-05, + "loss": 0.0, + "num_input_tokens_seen": 47604120, + "step": 82060 + }, + { + "epoch": 12.222966934763182, + "grad_norm": 3.9522157749161124e-05, + "learning_rate": 1.9704862607195207e-05, + "loss": 0.0, + "num_input_tokens_seen": 47607224, + "step": 82065 + }, + { + "epoch": 12.22371164730414, + "grad_norm": 0.0015586279332637787, + "learning_rate": 1.970168695387188e-05, + "loss": 0.0, + "num_input_tokens_seen": 47610168, + "step": 82070 + }, + { + "epoch": 12.2244563598451, + "grad_norm": 0.014736959710717201, + "learning_rate": 1.9698511390058172e-05, + "loss": 0.0, + "num_input_tokens_seen": 47612920, + "step": 82075 + }, + { + "epoch": 12.225201072386058, + "grad_norm": 0.0002745638776104897, + "learning_rate": 1.9695335915807716e-05, + "loss": 0.0, + "num_input_tokens_seen": 47616056, + "step": 82080 + }, + { + "epoch": 12.225945784927019, + "grad_norm": 1.1882753372192383, + "learning_rate": 1.969216053117418e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47619000, + "step": 82085 + }, + { + "epoch": 12.226690497467978, + "grad_norm": 8.287122182082385e-05, + "learning_rate": 1.9688985236211197e-05, + "loss": 0.1751, + "num_input_tokens_seen": 47621848, + "step": 82090 + }, + { + "epoch": 12.227435210008936, + "grad_norm": 0.013718818314373493, + "learning_rate": 1.9685810030972405e-05, + "loss": 0.0, + "num_input_tokens_seen": 47624856, + "step": 82095 + }, + { + "epoch": 12.228179922549895, + "grad_norm": 0.22866395115852356, + "learning_rate": 1.9682634915511455e-05, + "loss": 0.0311, + "num_input_tokens_seen": 47627928, + "step": 82100 + }, + { + "epoch": 12.228924635090856, + "grad_norm": 0.0003661657392513007, + "learning_rate": 1.9679459889881977e-05, + "loss": 0.0, + "num_input_tokens_seen": 47630904, + "step": 82105 + }, + { + "epoch": 12.229669347631814, + "grad_norm": 0.0004561800742521882, + "learning_rate": 1.9676284954137624e-05, + "loss": 0.0019, + "num_input_tokens_seen": 47633624, + "step": 82110 + }, + { + "epoch": 12.230414060172773, + "grad_norm": 0.002823109505698085, + "learning_rate": 1.9673110108332014e-05, + "loss": 0.1376, + "num_input_tokens_seen": 47636536, + "step": 82115 + }, + { + "epoch": 12.231158772713732, + "grad_norm": 0.0011738298926502466, + "learning_rate": 1.966993535251881e-05, + "loss": 0.0, + "num_input_tokens_seen": 47639640, + "step": 82120 + }, + { + "epoch": 12.231903485254692, + "grad_norm": 0.003031938336789608, + "learning_rate": 1.966676068675163e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47642584, + "step": 82125 + }, + { + "epoch": 12.232648197795651, + "grad_norm": 3.354511260986328, + "learning_rate": 1.96635861110841e-05, + "loss": 0.0023, + "num_input_tokens_seen": 47645208, + "step": 82130 + }, + { + "epoch": 12.23339291033661, + "grad_norm": 2.2480132579803467, + "learning_rate": 1.9660411625569867e-05, + "loss": 0.0011, + "num_input_tokens_seen": 47647992, + "step": 82135 + }, + { + "epoch": 12.234137622877569, + "grad_norm": 0.0012860748684033751, + "learning_rate": 1.965723723026254e-05, + "loss": 0.0031, + "num_input_tokens_seen": 47650936, + "step": 82140 + }, + { + "epoch": 12.23488233541853, + "grad_norm": 0.0005883016274310648, + "learning_rate": 1.965406292521577e-05, + "loss": 0.2, + "num_input_tokens_seen": 47653784, + "step": 82145 + }, + { + "epoch": 12.235627047959488, + "grad_norm": 0.006691537797451019, + "learning_rate": 1.965088871048317e-05, + "loss": 0.0, + "num_input_tokens_seen": 47656504, + "step": 82150 + }, + { + "epoch": 12.236371760500447, + "grad_norm": 0.0003054591652471572, + "learning_rate": 1.964771458611837e-05, + "loss": 0.0, + "num_input_tokens_seen": 47659224, + "step": 82155 + }, + { + "epoch": 12.237116473041405, + "grad_norm": 0.0005226214998401701, + "learning_rate": 1.964454055217499e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47662712, + "step": 82160 + }, + { + "epoch": 12.237861185582366, + "grad_norm": 0.002749337349087, + "learning_rate": 1.9641366608706656e-05, + "loss": 0.0044, + "num_input_tokens_seen": 47665720, + "step": 82165 + }, + { + "epoch": 12.238605898123325, + "grad_norm": 0.00010671794734662399, + "learning_rate": 1.9638192755766993e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47668792, + "step": 82170 + }, + { + "epoch": 12.239350610664284, + "grad_norm": 0.006992478854954243, + "learning_rate": 1.9635018993409602e-05, + "loss": 0.0, + "num_input_tokens_seen": 47671736, + "step": 82175 + }, + { + "epoch": 12.240095323205242, + "grad_norm": 0.0005379419890232384, + "learning_rate": 1.963184532168812e-05, + "loss": 0.0, + "num_input_tokens_seen": 47674264, + "step": 82180 + }, + { + "epoch": 12.240840035746203, + "grad_norm": 0.0008312002755701542, + "learning_rate": 1.9628671740656154e-05, + "loss": 0.3156, + "num_input_tokens_seen": 47676920, + "step": 82185 + }, + { + "epoch": 12.241584748287162, + "grad_norm": 0.0014099546242505312, + "learning_rate": 1.962549825036732e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47679832, + "step": 82190 + }, + { + "epoch": 12.24232946082812, + "grad_norm": 0.019480286166071892, + "learning_rate": 1.9622324850875227e-05, + "loss": 0.0004, + "num_input_tokens_seen": 47682392, + "step": 82195 + }, + { + "epoch": 12.243074173369079, + "grad_norm": 23.476099014282227, + "learning_rate": 1.9619151542233494e-05, + "loss": 0.3348, + "num_input_tokens_seen": 47685240, + "step": 82200 + }, + { + "epoch": 12.243818885910038, + "grad_norm": 0.002255214611068368, + "learning_rate": 1.9615978324495733e-05, + "loss": 0.0, + "num_input_tokens_seen": 47688248, + "step": 82205 + }, + { + "epoch": 12.244563598450998, + "grad_norm": 0.0021381427068263292, + "learning_rate": 1.961280519771553e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47691288, + "step": 82210 + }, + { + "epoch": 12.245308310991957, + "grad_norm": 0.0003676466003526002, + "learning_rate": 1.960963216194652e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47694040, + "step": 82215 + }, + { + "epoch": 12.246053023532916, + "grad_norm": 0.00253973132930696, + "learning_rate": 1.960645921724229e-05, + "loss": 0.3947, + "num_input_tokens_seen": 47696856, + "step": 82220 + }, + { + "epoch": 12.246797736073875, + "grad_norm": 0.0007703456212766469, + "learning_rate": 1.960328636365646e-05, + "loss": 0.0026, + "num_input_tokens_seen": 47699832, + "step": 82225 + }, + { + "epoch": 12.247542448614835, + "grad_norm": 0.007208331022411585, + "learning_rate": 1.9600113601242605e-05, + "loss": 0.0004, + "num_input_tokens_seen": 47702904, + "step": 82230 + }, + { + "epoch": 12.248287161155794, + "grad_norm": 0.003254655282944441, + "learning_rate": 1.9596940930054358e-05, + "loss": 0.0, + "num_input_tokens_seen": 47705912, + "step": 82235 + }, + { + "epoch": 12.249031873696753, + "grad_norm": 0.004966160748153925, + "learning_rate": 1.9593768350145288e-05, + "loss": 0.1035, + "num_input_tokens_seen": 47708536, + "step": 82240 + }, + { + "epoch": 12.249776586237711, + "grad_norm": 0.0001981308014364913, + "learning_rate": 1.9590595861569023e-05, + "loss": 0.1534, + "num_input_tokens_seen": 47711544, + "step": 82245 + }, + { + "epoch": 12.250521298778672, + "grad_norm": 0.002228622091934085, + "learning_rate": 1.9587423464379136e-05, + "loss": 0.2032, + "num_input_tokens_seen": 47714424, + "step": 82250 + }, + { + "epoch": 12.25126601131963, + "grad_norm": 0.18630170822143555, + "learning_rate": 1.9584251158629228e-05, + "loss": 0.0508, + "num_input_tokens_seen": 47717272, + "step": 82255 + }, + { + "epoch": 12.25201072386059, + "grad_norm": 5.53316593170166, + "learning_rate": 1.9581078944372897e-05, + "loss": 0.0008, + "num_input_tokens_seen": 47720056, + "step": 82260 + }, + { + "epoch": 12.252755436401548, + "grad_norm": 23.229629516601562, + "learning_rate": 1.957790682166372e-05, + "loss": 0.0942, + "num_input_tokens_seen": 47722936, + "step": 82265 + }, + { + "epoch": 12.253500148942509, + "grad_norm": 0.00202429061755538, + "learning_rate": 1.9574734790555305e-05, + "loss": 0.0402, + "num_input_tokens_seen": 47725656, + "step": 82270 + }, + { + "epoch": 12.254244861483468, + "grad_norm": 0.0010504645761102438, + "learning_rate": 1.9571562851101223e-05, + "loss": 0.0067, + "num_input_tokens_seen": 47728536, + "step": 82275 + }, + { + "epoch": 12.254989574024426, + "grad_norm": 0.017347386106848717, + "learning_rate": 1.9568391003355073e-05, + "loss": 0.0138, + "num_input_tokens_seen": 47732024, + "step": 82280 + }, + { + "epoch": 12.255734286565385, + "grad_norm": 3.1686418056488037, + "learning_rate": 1.956521924737044e-05, + "loss": 0.125, + "num_input_tokens_seen": 47734712, + "step": 82285 + }, + { + "epoch": 12.256478999106346, + "grad_norm": 0.01031174510717392, + "learning_rate": 1.9562047583200906e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47737432, + "step": 82290 + }, + { + "epoch": 12.257223711647304, + "grad_norm": 0.010400739498436451, + "learning_rate": 1.955887601090005e-05, + "loss": 0.0245, + "num_input_tokens_seen": 47739896, + "step": 82295 + }, + { + "epoch": 12.257968424188263, + "grad_norm": 0.00824765209108591, + "learning_rate": 1.9555704530521445e-05, + "loss": 0.0008, + "num_input_tokens_seen": 47742968, + "step": 82300 + }, + { + "epoch": 12.258713136729222, + "grad_norm": 0.005437248852103949, + "learning_rate": 1.955253314211869e-05, + "loss": 0.0331, + "num_input_tokens_seen": 47745528, + "step": 82305 + }, + { + "epoch": 12.259457849270182, + "grad_norm": 0.04289887845516205, + "learning_rate": 1.9549361845745338e-05, + "loss": 0.0013, + "num_input_tokens_seen": 47748376, + "step": 82310 + }, + { + "epoch": 12.260202561811141, + "grad_norm": 0.015544060617685318, + "learning_rate": 1.9546190641454993e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47751288, + "step": 82315 + }, + { + "epoch": 12.2609472743521, + "grad_norm": 0.005504722241312265, + "learning_rate": 1.9543019529301203e-05, + "loss": 0.0393, + "num_input_tokens_seen": 47754008, + "step": 82320 + }, + { + "epoch": 12.261691986893059, + "grad_norm": 0.03183088079094887, + "learning_rate": 1.953984850933756e-05, + "loss": 0.1339, + "num_input_tokens_seen": 47756888, + "step": 82325 + }, + { + "epoch": 12.26243669943402, + "grad_norm": 0.007080337032675743, + "learning_rate": 1.953667758161763e-05, + "loss": 0.2003, + "num_input_tokens_seen": 47760088, + "step": 82330 + }, + { + "epoch": 12.263181411974978, + "grad_norm": 1.1592199802398682, + "learning_rate": 1.9533506746194964e-05, + "loss": 0.0005, + "num_input_tokens_seen": 47763160, + "step": 82335 + }, + { + "epoch": 12.263926124515937, + "grad_norm": 0.010395549237728119, + "learning_rate": 1.9530336003123156e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47766104, + "step": 82340 + }, + { + "epoch": 12.264670837056896, + "grad_norm": 0.005754091311246157, + "learning_rate": 1.9527165352455755e-05, + "loss": 0.0, + "num_input_tokens_seen": 47768952, + "step": 82345 + }, + { + "epoch": 12.265415549597856, + "grad_norm": 0.0013304139720275998, + "learning_rate": 1.9523994794246344e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47771640, + "step": 82350 + }, + { + "epoch": 12.266160262138815, + "grad_norm": 0.039211712777614594, + "learning_rate": 1.9520824328548465e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47774552, + "step": 82355 + }, + { + "epoch": 12.266904974679774, + "grad_norm": 0.002131977817043662, + "learning_rate": 1.9517653955415698e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47777432, + "step": 82360 + }, + { + "epoch": 12.267649687220732, + "grad_norm": 0.0001238041731994599, + "learning_rate": 1.9514483674901586e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47780280, + "step": 82365 + }, + { + "epoch": 12.268394399761693, + "grad_norm": 0.0011854033218696713, + "learning_rate": 1.9511313487059706e-05, + "loss": 0.1844, + "num_input_tokens_seen": 47783064, + "step": 82370 + }, + { + "epoch": 12.269139112302652, + "grad_norm": 0.0033412626944482327, + "learning_rate": 1.950814339194361e-05, + "loss": 0.0316, + "num_input_tokens_seen": 47786072, + "step": 82375 + }, + { + "epoch": 12.26988382484361, + "grad_norm": 0.26015281677246094, + "learning_rate": 1.9504973389606834e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47788888, + "step": 82380 + }, + { + "epoch": 12.27062853738457, + "grad_norm": 0.0013882708735764027, + "learning_rate": 1.9501803480102962e-05, + "loss": 0.0, + "num_input_tokens_seen": 47792056, + "step": 82385 + }, + { + "epoch": 12.271373249925528, + "grad_norm": 0.0007521227234974504, + "learning_rate": 1.9498633663485526e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47794712, + "step": 82390 + }, + { + "epoch": 12.272117962466488, + "grad_norm": 0.003071576589718461, + "learning_rate": 1.9495463939808085e-05, + "loss": 0.0, + "num_input_tokens_seen": 47797464, + "step": 82395 + }, + { + "epoch": 12.272862675007447, + "grad_norm": 0.0006595616578124464, + "learning_rate": 1.9492294309124183e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47800120, + "step": 82400 + }, + { + "epoch": 12.273607387548406, + "grad_norm": 0.06192093342542648, + "learning_rate": 1.9489124771487375e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47803032, + "step": 82405 + }, + { + "epoch": 12.274352100089365, + "grad_norm": 0.04726339131593704, + "learning_rate": 1.9485955326951204e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47805912, + "step": 82410 + }, + { + "epoch": 12.275096812630325, + "grad_norm": 0.0466814860701561, + "learning_rate": 1.9482785975569202e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47809144, + "step": 82415 + }, + { + "epoch": 12.275841525171284, + "grad_norm": 0.002016163896769285, + "learning_rate": 1.9479616717394937e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47811800, + "step": 82420 + }, + { + "epoch": 12.276586237712243, + "grad_norm": 0.0011004774132743478, + "learning_rate": 1.947644755248193e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47814616, + "step": 82425 + }, + { + "epoch": 12.277330950253202, + "grad_norm": 0.00031444523483514786, + "learning_rate": 1.9473278480883733e-05, + "loss": 0.1098, + "num_input_tokens_seen": 47817848, + "step": 82430 + }, + { + "epoch": 12.278075662794162, + "grad_norm": 3.676062624435872e-05, + "learning_rate": 1.947010950265387e-05, + "loss": 0.3979, + "num_input_tokens_seen": 47820504, + "step": 82435 + }, + { + "epoch": 12.27882037533512, + "grad_norm": 0.11490090191364288, + "learning_rate": 1.9466940617845897e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47823096, + "step": 82440 + }, + { + "epoch": 12.27956508787608, + "grad_norm": 0.10231226682662964, + "learning_rate": 1.9463771826513326e-05, + "loss": 0.2948, + "num_input_tokens_seen": 47826072, + "step": 82445 + }, + { + "epoch": 12.280309800417038, + "grad_norm": 0.00862889178097248, + "learning_rate": 1.9460603128709715e-05, + "loss": 0.1549, + "num_input_tokens_seen": 47828824, + "step": 82450 + }, + { + "epoch": 12.281054512957999, + "grad_norm": 98.77619171142578, + "learning_rate": 1.9457434524488582e-05, + "loss": 0.0453, + "num_input_tokens_seen": 47831576, + "step": 82455 + }, + { + "epoch": 12.281799225498958, + "grad_norm": 0.10553158819675446, + "learning_rate": 1.9454266013903458e-05, + "loss": 0.2026, + "num_input_tokens_seen": 47834328, + "step": 82460 + }, + { + "epoch": 12.282543938039916, + "grad_norm": 0.0027764919213950634, + "learning_rate": 1.945109759700788e-05, + "loss": 0.0192, + "num_input_tokens_seen": 47837304, + "step": 82465 + }, + { + "epoch": 12.283288650580875, + "grad_norm": 0.03150872886180878, + "learning_rate": 1.9447929273855354e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47840088, + "step": 82470 + }, + { + "epoch": 12.284033363121836, + "grad_norm": 5.88239049911499, + "learning_rate": 1.944476104449943e-05, + "loss": 0.0023, + "num_input_tokens_seen": 47843000, + "step": 82475 + }, + { + "epoch": 12.284778075662794, + "grad_norm": 0.0030787114519625902, + "learning_rate": 1.9441592908993616e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47845688, + "step": 82480 + }, + { + "epoch": 12.285522788203753, + "grad_norm": 0.03491469472646713, + "learning_rate": 1.9438424867391444e-05, + "loss": 0.0004, + "num_input_tokens_seen": 47848568, + "step": 82485 + }, + { + "epoch": 12.286267500744712, + "grad_norm": 0.003006754443049431, + "learning_rate": 1.9435256919746436e-05, + "loss": 0.0704, + "num_input_tokens_seen": 47851224, + "step": 82490 + }, + { + "epoch": 12.287012213285673, + "grad_norm": 22.516834259033203, + "learning_rate": 1.94320890661121e-05, + "loss": 0.3411, + "num_input_tokens_seen": 47853944, + "step": 82495 + }, + { + "epoch": 12.287756925826631, + "grad_norm": 0.0044794753193855286, + "learning_rate": 1.9428921306541963e-05, + "loss": 0.0, + "num_input_tokens_seen": 47856760, + "step": 82500 + }, + { + "epoch": 12.28850163836759, + "grad_norm": 0.0339195653796196, + "learning_rate": 1.9425753641089535e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47859928, + "step": 82505 + }, + { + "epoch": 12.289246350908549, + "grad_norm": 0.013411682099103928, + "learning_rate": 1.9422586069808337e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47863128, + "step": 82510 + }, + { + "epoch": 12.28999106344951, + "grad_norm": 0.01118384301662445, + "learning_rate": 1.941941859275188e-05, + "loss": 0.166, + "num_input_tokens_seen": 47866136, + "step": 82515 + }, + { + "epoch": 12.290735775990468, + "grad_norm": 0.008944693021476269, + "learning_rate": 1.9416251209973672e-05, + "loss": 0.1752, + "num_input_tokens_seen": 47869112, + "step": 82520 + }, + { + "epoch": 12.291480488531427, + "grad_norm": 0.013351790606975555, + "learning_rate": 1.941308392152722e-05, + "loss": 0.0329, + "num_input_tokens_seen": 47871960, + "step": 82525 + }, + { + "epoch": 12.292225201072386, + "grad_norm": 0.0022148811258375645, + "learning_rate": 1.9409916727466047e-05, + "loss": 0.0004, + "num_input_tokens_seen": 47874808, + "step": 82530 + }, + { + "epoch": 12.292969913613344, + "grad_norm": 0.006045065820217133, + "learning_rate": 1.9406749627843645e-05, + "loss": 0.1533, + "num_input_tokens_seen": 47877496, + "step": 82535 + }, + { + "epoch": 12.293714626154305, + "grad_norm": 0.015866423025727272, + "learning_rate": 1.940358262271352e-05, + "loss": 0.0003, + "num_input_tokens_seen": 47880248, + "step": 82540 + }, + { + "epoch": 12.294459338695264, + "grad_norm": 0.01169922947883606, + "learning_rate": 1.9400415712129184e-05, + "loss": 0.0006, + "num_input_tokens_seen": 47882808, + "step": 82545 + }, + { + "epoch": 12.295204051236222, + "grad_norm": 0.035007670521736145, + "learning_rate": 1.9397248896144127e-05, + "loss": 0.0111, + "num_input_tokens_seen": 47885528, + "step": 82550 + }, + { + "epoch": 12.295948763777181, + "grad_norm": 0.0007719750865362585, + "learning_rate": 1.939408217481186e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47888504, + "step": 82555 + }, + { + "epoch": 12.296693476318142, + "grad_norm": 0.01540551707148552, + "learning_rate": 1.939091554818587e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47891288, + "step": 82560 + }, + { + "epoch": 12.2974381888591, + "grad_norm": 0.00609793933108449, + "learning_rate": 1.9387749016319673e-05, + "loss": 0.0125, + "num_input_tokens_seen": 47894232, + "step": 82565 + }, + { + "epoch": 12.29818290140006, + "grad_norm": 0.03354198485612869, + "learning_rate": 1.938458257926675e-05, + "loss": 0.0132, + "num_input_tokens_seen": 47897112, + "step": 82570 + }, + { + "epoch": 12.298927613941018, + "grad_norm": 0.004629089962691069, + "learning_rate": 1.9381416237080586e-05, + "loss": 0.0006, + "num_input_tokens_seen": 47900152, + "step": 82575 + }, + { + "epoch": 12.299672326481979, + "grad_norm": 0.003784204600378871, + "learning_rate": 1.9378249989814697e-05, + "loss": 0.0014, + "num_input_tokens_seen": 47903256, + "step": 82580 + }, + { + "epoch": 12.300417039022937, + "grad_norm": 0.036980412900447845, + "learning_rate": 1.937508383752255e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47906584, + "step": 82585 + }, + { + "epoch": 12.301161751563896, + "grad_norm": 36.2213249206543, + "learning_rate": 1.9371917780257657e-05, + "loss": 0.2365, + "num_input_tokens_seen": 47909592, + "step": 82590 + }, + { + "epoch": 12.301906464104855, + "grad_norm": 0.009387624450027943, + "learning_rate": 1.9368751818073476e-05, + "loss": 0.1475, + "num_input_tokens_seen": 47912600, + "step": 82595 + }, + { + "epoch": 12.302651176645815, + "grad_norm": 8.246294021606445, + "learning_rate": 1.9365585951023523e-05, + "loss": 0.1106, + "num_input_tokens_seen": 47915480, + "step": 82600 + }, + { + "epoch": 12.303395889186774, + "grad_norm": 0.09216266125440598, + "learning_rate": 1.9362420179161262e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47918168, + "step": 82605 + }, + { + "epoch": 12.304140601727733, + "grad_norm": 0.010259195230901241, + "learning_rate": 1.935925450254019e-05, + "loss": 0.0004, + "num_input_tokens_seen": 47921272, + "step": 82610 + }, + { + "epoch": 12.304885314268692, + "grad_norm": 0.0009772561024874449, + "learning_rate": 1.935608892121378e-05, + "loss": 0.001, + "num_input_tokens_seen": 47924184, + "step": 82615 + }, + { + "epoch": 12.305630026809652, + "grad_norm": 4.159929275512695, + "learning_rate": 1.9352923435235505e-05, + "loss": 0.0014, + "num_input_tokens_seen": 47927448, + "step": 82620 + }, + { + "epoch": 12.30637473935061, + "grad_norm": 0.1226499006152153, + "learning_rate": 1.9349758044658854e-05, + "loss": 0.0288, + "num_input_tokens_seen": 47930328, + "step": 82625 + }, + { + "epoch": 12.30711945189157, + "grad_norm": 0.00916716642677784, + "learning_rate": 1.934659274953729e-05, + "loss": 0.0002, + "num_input_tokens_seen": 47932952, + "step": 82630 + }, + { + "epoch": 12.307864164432528, + "grad_norm": 0.014076471328735352, + "learning_rate": 1.9343427549924302e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47935832, + "step": 82635 + }, + { + "epoch": 12.308608876973489, + "grad_norm": 0.0005308111431077123, + "learning_rate": 1.9340262445873354e-05, + "loss": 0.001, + "num_input_tokens_seen": 47939000, + "step": 82640 + }, + { + "epoch": 12.309353589514448, + "grad_norm": 0.05567372217774391, + "learning_rate": 1.9337097437437924e-05, + "loss": 0.0025, + "num_input_tokens_seen": 47941976, + "step": 82645 + }, + { + "epoch": 12.310098302055406, + "grad_norm": 0.008110321126878262, + "learning_rate": 1.9333932524671478e-05, + "loss": 0.0029, + "num_input_tokens_seen": 47944696, + "step": 82650 + }, + { + "epoch": 12.310843014596365, + "grad_norm": 0.0007571915630251169, + "learning_rate": 1.933076770762748e-05, + "loss": 0.0, + "num_input_tokens_seen": 47947928, + "step": 82655 + }, + { + "epoch": 12.311587727137326, + "grad_norm": 0.02830692008137703, + "learning_rate": 1.93276029863594e-05, + "loss": 0.0152, + "num_input_tokens_seen": 47950840, + "step": 82660 + }, + { + "epoch": 12.312332439678285, + "grad_norm": 0.0036059722770005465, + "learning_rate": 1.9324438360920696e-05, + "loss": 0.0, + "num_input_tokens_seen": 47953688, + "step": 82665 + }, + { + "epoch": 12.313077152219243, + "grad_norm": 0.02935948595404625, + "learning_rate": 1.9321273831364847e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47956856, + "step": 82670 + }, + { + "epoch": 12.313821864760202, + "grad_norm": 0.05994074046611786, + "learning_rate": 1.9318109397745295e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47959896, + "step": 82675 + }, + { + "epoch": 12.314566577301163, + "grad_norm": 0.020181531086564064, + "learning_rate": 1.9314945060115517e-05, + "loss": 0.0562, + "num_input_tokens_seen": 47962584, + "step": 82680 + }, + { + "epoch": 12.315311289842121, + "grad_norm": 0.001373188802972436, + "learning_rate": 1.9311780818528966e-05, + "loss": 0.0, + "num_input_tokens_seen": 47965432, + "step": 82685 + }, + { + "epoch": 12.31605600238308, + "grad_norm": 16.564937591552734, + "learning_rate": 1.93086166730391e-05, + "loss": 0.2258, + "num_input_tokens_seen": 47968088, + "step": 82690 + }, + { + "epoch": 12.316800714924039, + "grad_norm": 0.0007991141173988581, + "learning_rate": 1.9305452623699367e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47971096, + "step": 82695 + }, + { + "epoch": 12.317545427465, + "grad_norm": 0.0020685766357928514, + "learning_rate": 1.9302288670563215e-05, + "loss": 0.0065, + "num_input_tokens_seen": 47974072, + "step": 82700 + }, + { + "epoch": 12.318290140005958, + "grad_norm": 0.004018813371658325, + "learning_rate": 1.9299124813684117e-05, + "loss": 0.0, + "num_input_tokens_seen": 47976664, + "step": 82705 + }, + { + "epoch": 12.319034852546917, + "grad_norm": 0.0006989121320657432, + "learning_rate": 1.92959610531155e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47979736, + "step": 82710 + }, + { + "epoch": 12.319779565087876, + "grad_norm": 0.009337034076452255, + "learning_rate": 1.9292797388910832e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47982648, + "step": 82715 + }, + { + "epoch": 12.320524277628834, + "grad_norm": 68.81959533691406, + "learning_rate": 1.928963382112355e-05, + "loss": 0.285, + "num_input_tokens_seen": 47985624, + "step": 82720 + }, + { + "epoch": 12.321268990169795, + "grad_norm": 1.8639235496520996, + "learning_rate": 1.9286470349807108e-05, + "loss": 0.0022, + "num_input_tokens_seen": 47988376, + "step": 82725 + }, + { + "epoch": 12.322013702710754, + "grad_norm": 0.03377570956945419, + "learning_rate": 1.9283306975014935e-05, + "loss": 0.1292, + "num_input_tokens_seen": 47991320, + "step": 82730 + }, + { + "epoch": 12.322758415251712, + "grad_norm": 0.0057259476743638515, + "learning_rate": 1.9280143696800473e-05, + "loss": 0.0029, + "num_input_tokens_seen": 47994168, + "step": 82735 + }, + { + "epoch": 12.323503127792671, + "grad_norm": 0.00846626702696085, + "learning_rate": 1.9276980515217183e-05, + "loss": 0.0001, + "num_input_tokens_seen": 47997400, + "step": 82740 + }, + { + "epoch": 12.324247840333632, + "grad_norm": 0.023689011111855507, + "learning_rate": 1.927381743031848e-05, + "loss": 0.0, + "num_input_tokens_seen": 48000248, + "step": 82745 + }, + { + "epoch": 12.32499255287459, + "grad_norm": 0.011332071386277676, + "learning_rate": 1.927065444215782e-05, + "loss": 0.0025, + "num_input_tokens_seen": 48003576, + "step": 82750 + }, + { + "epoch": 12.32573726541555, + "grad_norm": 0.2164546251296997, + "learning_rate": 1.9267491550788626e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48006616, + "step": 82755 + }, + { + "epoch": 12.326481977956508, + "grad_norm": 0.032604265958070755, + "learning_rate": 1.926432875626434e-05, + "loss": 0.0031, + "num_input_tokens_seen": 48009336, + "step": 82760 + }, + { + "epoch": 12.327226690497469, + "grad_norm": 0.35487908124923706, + "learning_rate": 1.926116605863838e-05, + "loss": 0.002, + "num_input_tokens_seen": 48012024, + "step": 82765 + }, + { + "epoch": 12.327971403038427, + "grad_norm": 0.003998967818915844, + "learning_rate": 1.9258003457964198e-05, + "loss": 0.0047, + "num_input_tokens_seen": 48014840, + "step": 82770 + }, + { + "epoch": 12.328716115579386, + "grad_norm": 0.02862567827105522, + "learning_rate": 1.925484095429521e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48017400, + "step": 82775 + }, + { + "epoch": 12.329460828120345, + "grad_norm": 0.005489968694746494, + "learning_rate": 1.9251678547684836e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48020440, + "step": 82780 + }, + { + "epoch": 12.330205540661305, + "grad_norm": 0.0012837518006563187, + "learning_rate": 1.924851623818652e-05, + "loss": 0.0004, + "num_input_tokens_seen": 48023320, + "step": 82785 + }, + { + "epoch": 12.330950253202264, + "grad_norm": 0.002078711986541748, + "learning_rate": 1.9245354025853673e-05, + "loss": 0.1137, + "num_input_tokens_seen": 48026168, + "step": 82790 + }, + { + "epoch": 12.331694965743223, + "grad_norm": 0.6443603038787842, + "learning_rate": 1.9242191910739727e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48029048, + "step": 82795 + }, + { + "epoch": 12.332439678284182, + "grad_norm": 0.0004101708182133734, + "learning_rate": 1.9239029892898083e-05, + "loss": 0.0, + "num_input_tokens_seen": 48031704, + "step": 82800 + }, + { + "epoch": 12.333184390825142, + "grad_norm": 0.015209564007818699, + "learning_rate": 1.9235867972382188e-05, + "loss": 0.0132, + "num_input_tokens_seen": 48034552, + "step": 82805 + }, + { + "epoch": 12.333929103366101, + "grad_norm": 0.016801564022898674, + "learning_rate": 1.9232706149245443e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48037432, + "step": 82810 + }, + { + "epoch": 12.33467381590706, + "grad_norm": 0.008313710801303387, + "learning_rate": 1.9229544423541254e-05, + "loss": 0.1566, + "num_input_tokens_seen": 48040184, + "step": 82815 + }, + { + "epoch": 12.335418528448018, + "grad_norm": 0.22437632083892822, + "learning_rate": 1.922638279532306e-05, + "loss": 0.0168, + "num_input_tokens_seen": 48042776, + "step": 82820 + }, + { + "epoch": 12.336163240988979, + "grad_norm": 0.022503092885017395, + "learning_rate": 1.9223221264644253e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48045720, + "step": 82825 + }, + { + "epoch": 12.336907953529938, + "grad_norm": 0.002249267417937517, + "learning_rate": 1.922005983155826e-05, + "loss": 0.0, + "num_input_tokens_seen": 48048952, + "step": 82830 + }, + { + "epoch": 12.337652666070897, + "grad_norm": 0.0630689486861229, + "learning_rate": 1.921689849611847e-05, + "loss": 0.0047, + "num_input_tokens_seen": 48052056, + "step": 82835 + }, + { + "epoch": 12.338397378611855, + "grad_norm": 0.010972948744893074, + "learning_rate": 1.921373725837831e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48055160, + "step": 82840 + }, + { + "epoch": 12.339142091152816, + "grad_norm": 0.00204795622266829, + "learning_rate": 1.9210576118391177e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48058136, + "step": 82845 + }, + { + "epoch": 12.339886803693775, + "grad_norm": 0.0412171296775341, + "learning_rate": 1.920741507621048e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48060888, + "step": 82850 + }, + { + "epoch": 12.340631516234733, + "grad_norm": 0.004675925709307194, + "learning_rate": 1.9204254131889612e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48063768, + "step": 82855 + }, + { + "epoch": 12.341376228775692, + "grad_norm": 0.006717314478009939, + "learning_rate": 1.920109328548198e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48066808, + "step": 82860 + }, + { + "epoch": 12.342120941316653, + "grad_norm": 0.0004358981386758387, + "learning_rate": 1.919793253704099e-05, + "loss": 0.0, + "num_input_tokens_seen": 48070072, + "step": 82865 + }, + { + "epoch": 12.342865653857611, + "grad_norm": 0.00886361114680767, + "learning_rate": 1.9194771886620023e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48072952, + "step": 82870 + }, + { + "epoch": 12.34361036639857, + "grad_norm": 0.0006181010394357145, + "learning_rate": 1.919161133427249e-05, + "loss": 0.0144, + "num_input_tokens_seen": 48075672, + "step": 82875 + }, + { + "epoch": 12.344355078939529, + "grad_norm": 0.0010672284988686442, + "learning_rate": 1.918845088005178e-05, + "loss": 0.0005, + "num_input_tokens_seen": 48078648, + "step": 82880 + }, + { + "epoch": 12.34509979148049, + "grad_norm": 0.0018800960388034582, + "learning_rate": 1.918529052401129e-05, + "loss": 0.0145, + "num_input_tokens_seen": 48081912, + "step": 82885 + }, + { + "epoch": 12.345844504021448, + "grad_norm": 0.003808419918641448, + "learning_rate": 1.9182130266204396e-05, + "loss": 0.0, + "num_input_tokens_seen": 48085176, + "step": 82890 + }, + { + "epoch": 12.346589216562407, + "grad_norm": 0.002259044675156474, + "learning_rate": 1.9178970106684506e-05, + "loss": 0.1479, + "num_input_tokens_seen": 48088056, + "step": 82895 + }, + { + "epoch": 12.347333929103366, + "grad_norm": 0.0006279717199504375, + "learning_rate": 1.9175810045505006e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48091224, + "step": 82900 + }, + { + "epoch": 12.348078641644324, + "grad_norm": 0.0030204481445252895, + "learning_rate": 1.917265008271926e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48094168, + "step": 82905 + }, + { + "epoch": 12.348823354185285, + "grad_norm": 0.005679841618984938, + "learning_rate": 1.916949021838068e-05, + "loss": 0.0064, + "num_input_tokens_seen": 48097208, + "step": 82910 + }, + { + "epoch": 12.349568066726244, + "grad_norm": 0.0018005688907578588, + "learning_rate": 1.916633045254263e-05, + "loss": 0.0072, + "num_input_tokens_seen": 48100440, + "step": 82915 + }, + { + "epoch": 12.350312779267203, + "grad_norm": 0.005807303823530674, + "learning_rate": 1.9163170785258507e-05, + "loss": 0.0225, + "num_input_tokens_seen": 48102968, + "step": 82920 + }, + { + "epoch": 12.351057491808161, + "grad_norm": 0.00649911817163229, + "learning_rate": 1.916001121658167e-05, + "loss": 0.0004, + "num_input_tokens_seen": 48105656, + "step": 82925 + }, + { + "epoch": 12.351802204349122, + "grad_norm": 0.0011836913181468844, + "learning_rate": 1.9156851746565514e-05, + "loss": 0.0014, + "num_input_tokens_seen": 48108440, + "step": 82930 + }, + { + "epoch": 12.35254691689008, + "grad_norm": 0.07521563768386841, + "learning_rate": 1.9153692375263413e-05, + "loss": 0.0008, + "num_input_tokens_seen": 48111544, + "step": 82935 + }, + { + "epoch": 12.35329162943104, + "grad_norm": 0.002691032597795129, + "learning_rate": 1.9150533102728728e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48114488, + "step": 82940 + }, + { + "epoch": 12.354036341971998, + "grad_norm": 0.002131872111931443, + "learning_rate": 1.914737392901485e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48117240, + "step": 82945 + }, + { + "epoch": 12.354781054512959, + "grad_norm": 0.009140117093920708, + "learning_rate": 1.9144214854175136e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48119832, + "step": 82950 + }, + { + "epoch": 12.355525767053917, + "grad_norm": 0.009941251948475838, + "learning_rate": 1.9141055878262963e-05, + "loss": 0.0005, + "num_input_tokens_seen": 48122840, + "step": 82955 + }, + { + "epoch": 12.356270479594876, + "grad_norm": 0.006024542730301619, + "learning_rate": 1.913789700133169e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48125976, + "step": 82960 + }, + { + "epoch": 12.357015192135835, + "grad_norm": 0.012507579289376736, + "learning_rate": 1.9134738223434697e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48128920, + "step": 82965 + }, + { + "epoch": 12.357759904676795, + "grad_norm": 0.0030834749341011047, + "learning_rate": 1.913157954462533e-05, + "loss": 0.1319, + "num_input_tokens_seen": 48131640, + "step": 82970 + }, + { + "epoch": 12.358504617217754, + "grad_norm": 0.03162268549203873, + "learning_rate": 1.9128420964956972e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48134584, + "step": 82975 + }, + { + "epoch": 12.359249329758713, + "grad_norm": 0.11380218714475632, + "learning_rate": 1.912526248448298e-05, + "loss": 0.0005, + "num_input_tokens_seen": 48137432, + "step": 82980 + }, + { + "epoch": 12.359994042299672, + "grad_norm": 38.516231536865234, + "learning_rate": 1.9122104103256693e-05, + "loss": 0.1567, + "num_input_tokens_seen": 48140152, + "step": 82985 + }, + { + "epoch": 12.360738754840632, + "grad_norm": 0.00018098295549862087, + "learning_rate": 1.9118945821331495e-05, + "loss": 0.0011, + "num_input_tokens_seen": 48142712, + "step": 82990 + }, + { + "epoch": 12.361483467381591, + "grad_norm": 0.00870627909898758, + "learning_rate": 1.9115787638760717e-05, + "loss": 0.2344, + "num_input_tokens_seen": 48145944, + "step": 82995 + }, + { + "epoch": 12.36222817992255, + "grad_norm": 0.0010426974622532725, + "learning_rate": 1.911262955559774e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48148408, + "step": 83000 + }, + { + "epoch": 12.362972892463509, + "grad_norm": 0.019964875653386116, + "learning_rate": 1.910947157189589e-05, + "loss": 0.0149, + "num_input_tokens_seen": 48151320, + "step": 83005 + }, + { + "epoch": 12.363717605004469, + "grad_norm": 0.0004600205284077674, + "learning_rate": 1.9106313687708543e-05, + "loss": 0.0194, + "num_input_tokens_seen": 48154104, + "step": 83010 + }, + { + "epoch": 12.364462317545428, + "grad_norm": 0.0038602317217737436, + "learning_rate": 1.9103155903089036e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48156888, + "step": 83015 + }, + { + "epoch": 12.365207030086387, + "grad_norm": 0.2530195713043213, + "learning_rate": 1.9099998218090707e-05, + "loss": 0.0008, + "num_input_tokens_seen": 48159960, + "step": 83020 + }, + { + "epoch": 12.365951742627345, + "grad_norm": 0.018316136673092842, + "learning_rate": 1.9096840632766923e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48162712, + "step": 83025 + }, + { + "epoch": 12.366696455168306, + "grad_norm": 0.001058434136211872, + "learning_rate": 1.9093683147171002e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48165752, + "step": 83030 + }, + { + "epoch": 12.367441167709265, + "grad_norm": 0.0025697401724755764, + "learning_rate": 1.9090525761356315e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48168632, + "step": 83035 + }, + { + "epoch": 12.368185880250223, + "grad_norm": 47.415225982666016, + "learning_rate": 1.9087368475376176e-05, + "loss": 0.0324, + "num_input_tokens_seen": 48171768, + "step": 83040 + }, + { + "epoch": 12.368930592791182, + "grad_norm": 1.4854224920272827, + "learning_rate": 1.908421128928395e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48174552, + "step": 83045 + }, + { + "epoch": 12.36967530533214, + "grad_norm": 0.012432673014700413, + "learning_rate": 1.9081054203132955e-05, + "loss": 0.0206, + "num_input_tokens_seen": 48177656, + "step": 83050 + }, + { + "epoch": 12.370420017873101, + "grad_norm": 0.0053986492566764355, + "learning_rate": 1.9077897216976537e-05, + "loss": 0.0405, + "num_input_tokens_seen": 48180472, + "step": 83055 + }, + { + "epoch": 12.37116473041406, + "grad_norm": 0.009703539311885834, + "learning_rate": 1.907474033086803e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48183256, + "step": 83060 + }, + { + "epoch": 12.371909442955019, + "grad_norm": 0.0004498611087910831, + "learning_rate": 1.907158354486075e-05, + "loss": 0.0, + "num_input_tokens_seen": 48186168, + "step": 83065 + }, + { + "epoch": 12.37265415549598, + "grad_norm": 0.001526186242699623, + "learning_rate": 1.9068426859008055e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48188984, + "step": 83070 + }, + { + "epoch": 12.373398868036938, + "grad_norm": 0.0041166082955896854, + "learning_rate": 1.9065270273363244e-05, + "loss": 0.0, + "num_input_tokens_seen": 48191736, + "step": 83075 + }, + { + "epoch": 12.374143580577897, + "grad_norm": 0.01784558594226837, + "learning_rate": 1.9062113787979674e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48194520, + "step": 83080 + }, + { + "epoch": 12.374888293118856, + "grad_norm": 0.03537580370903015, + "learning_rate": 1.905895740291065e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48197304, + "step": 83085 + }, + { + "epoch": 12.375633005659815, + "grad_norm": 0.007835369557142258, + "learning_rate": 1.9055801118209507e-05, + "loss": 0.2063, + "num_input_tokens_seen": 48200024, + "step": 83090 + }, + { + "epoch": 12.376377718200775, + "grad_norm": 0.013711562380194664, + "learning_rate": 1.9052644933929564e-05, + "loss": 0.0029, + "num_input_tokens_seen": 48202808, + "step": 83095 + }, + { + "epoch": 12.377122430741734, + "grad_norm": 0.01013209018856287, + "learning_rate": 1.9049488850124128e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48205432, + "step": 83100 + }, + { + "epoch": 12.377867143282693, + "grad_norm": 0.0011862876126542687, + "learning_rate": 1.9046332866846544e-05, + "loss": 0.0975, + "num_input_tokens_seen": 48208600, + "step": 83105 + }, + { + "epoch": 12.378611855823651, + "grad_norm": 11.717453956604004, + "learning_rate": 1.90431769841501e-05, + "loss": 0.1106, + "num_input_tokens_seen": 48211544, + "step": 83110 + }, + { + "epoch": 12.379356568364612, + "grad_norm": 0.0015662453370168805, + "learning_rate": 1.904002120208814e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48214520, + "step": 83115 + }, + { + "epoch": 12.38010128090557, + "grad_norm": 0.0040227873250842094, + "learning_rate": 1.903686552071396e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48217368, + "step": 83120 + }, + { + "epoch": 12.38084599344653, + "grad_norm": 0.0022432697005569935, + "learning_rate": 1.903370994008088e-05, + "loss": 0.0, + "num_input_tokens_seen": 48220504, + "step": 83125 + }, + { + "epoch": 12.381590705987488, + "grad_norm": 0.1512085348367691, + "learning_rate": 1.9030554460242194e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48223288, + "step": 83130 + }, + { + "epoch": 12.382335418528449, + "grad_norm": 0.005812250543385744, + "learning_rate": 1.902739908125124e-05, + "loss": 0.0, + "num_input_tokens_seen": 48225912, + "step": 83135 + }, + { + "epoch": 12.383080131069407, + "grad_norm": 0.00612568249925971, + "learning_rate": 1.9024243803161304e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48228792, + "step": 83140 + }, + { + "epoch": 12.383824843610366, + "grad_norm": 0.004192037507891655, + "learning_rate": 1.9021088626025684e-05, + "loss": 0.0225, + "num_input_tokens_seen": 48231416, + "step": 83145 + }, + { + "epoch": 12.384569556151325, + "grad_norm": 0.0005272850394248962, + "learning_rate": 1.9017933549897706e-05, + "loss": 0.0033, + "num_input_tokens_seen": 48234296, + "step": 83150 + }, + { + "epoch": 12.385314268692285, + "grad_norm": 0.0005061785341240466, + "learning_rate": 1.901477857483066e-05, + "loss": 0.1904, + "num_input_tokens_seen": 48237048, + "step": 83155 + }, + { + "epoch": 12.386058981233244, + "grad_norm": 0.0038964489940553904, + "learning_rate": 1.9011623700877845e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48239896, + "step": 83160 + }, + { + "epoch": 12.386803693774203, + "grad_norm": 0.006327218376100063, + "learning_rate": 1.9008468928092555e-05, + "loss": 0.0705, + "num_input_tokens_seen": 48243032, + "step": 83165 + }, + { + "epoch": 12.387548406315162, + "grad_norm": 0.0005574068054556847, + "learning_rate": 1.9005314256528103e-05, + "loss": 0.0465, + "num_input_tokens_seen": 48245880, + "step": 83170 + }, + { + "epoch": 12.388293118856122, + "grad_norm": 0.024181168526411057, + "learning_rate": 1.9002159686237776e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48248760, + "step": 83175 + }, + { + "epoch": 12.389037831397081, + "grad_norm": 0.002979999640956521, + "learning_rate": 1.8999005217274857e-05, + "loss": 0.0016, + "num_input_tokens_seen": 48251672, + "step": 83180 + }, + { + "epoch": 12.38978254393804, + "grad_norm": 23.881114959716797, + "learning_rate": 1.8995850849692646e-05, + "loss": 0.144, + "num_input_tokens_seen": 48254552, + "step": 83185 + }, + { + "epoch": 12.390527256478999, + "grad_norm": 0.0032273002434521914, + "learning_rate": 1.8992696583544434e-05, + "loss": 0.009, + "num_input_tokens_seen": 48257432, + "step": 83190 + }, + { + "epoch": 12.391271969019959, + "grad_norm": 0.010906531475484371, + "learning_rate": 1.898954241888351e-05, + "loss": 0.0, + "num_input_tokens_seen": 48260120, + "step": 83195 + }, + { + "epoch": 12.392016681560918, + "grad_norm": 0.007310579530894756, + "learning_rate": 1.8986388355763147e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48263000, + "step": 83200 + }, + { + "epoch": 12.392761394101877, + "grad_norm": 0.002593564335256815, + "learning_rate": 1.8983234394236657e-05, + "loss": 0.1626, + "num_input_tokens_seen": 48265720, + "step": 83205 + }, + { + "epoch": 12.393506106642835, + "grad_norm": 46.1706428527832, + "learning_rate": 1.8980080534357298e-05, + "loss": 0.1636, + "num_input_tokens_seen": 48268664, + "step": 83210 + }, + { + "epoch": 12.394250819183796, + "grad_norm": 0.0018917496781796217, + "learning_rate": 1.8976926776178366e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48271320, + "step": 83215 + }, + { + "epoch": 12.394995531724755, + "grad_norm": 4.383171558380127, + "learning_rate": 1.8973773119753132e-05, + "loss": 0.0068, + "num_input_tokens_seen": 48274296, + "step": 83220 + }, + { + "epoch": 12.395740244265713, + "grad_norm": 0.0033346121199429035, + "learning_rate": 1.8970619565134866e-05, + "loss": 0.0, + "num_input_tokens_seen": 48277240, + "step": 83225 + }, + { + "epoch": 12.396484956806672, + "grad_norm": 0.001084042596630752, + "learning_rate": 1.896746611237687e-05, + "loss": 0.1848, + "num_input_tokens_seen": 48279928, + "step": 83230 + }, + { + "epoch": 12.397229669347631, + "grad_norm": 0.024449143558740616, + "learning_rate": 1.8964312761532388e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48282680, + "step": 83235 + }, + { + "epoch": 12.397974381888591, + "grad_norm": 0.0003163099754601717, + "learning_rate": 1.896115951265472e-05, + "loss": 0.0, + "num_input_tokens_seen": 48285784, + "step": 83240 + }, + { + "epoch": 12.39871909442955, + "grad_norm": 0.003156222403049469, + "learning_rate": 1.8958006365797118e-05, + "loss": 0.0033, + "num_input_tokens_seen": 48288952, + "step": 83245 + }, + { + "epoch": 12.399463806970509, + "grad_norm": 0.024873534217476845, + "learning_rate": 1.8954853321012865e-05, + "loss": 0.1377, + "num_input_tokens_seen": 48292120, + "step": 83250 + }, + { + "epoch": 12.400208519511468, + "grad_norm": 10.78536605834961, + "learning_rate": 1.8951700378355218e-05, + "loss": 0.1102, + "num_input_tokens_seen": 48294872, + "step": 83255 + }, + { + "epoch": 12.400953232052428, + "grad_norm": 317.3923034667969, + "learning_rate": 1.8948547537877436e-05, + "loss": 0.0883, + "num_input_tokens_seen": 48297592, + "step": 83260 + }, + { + "epoch": 12.401697944593387, + "grad_norm": 0.002000748412683606, + "learning_rate": 1.8945394799632804e-05, + "loss": 0.0414, + "num_input_tokens_seen": 48300472, + "step": 83265 + }, + { + "epoch": 12.402442657134346, + "grad_norm": 0.0033664817456156015, + "learning_rate": 1.8942242163674563e-05, + "loss": 0.0029, + "num_input_tokens_seen": 48303448, + "step": 83270 + }, + { + "epoch": 12.403187369675305, + "grad_norm": 0.0006343313725665212, + "learning_rate": 1.8939089630055994e-05, + "loss": 0.2202, + "num_input_tokens_seen": 48306424, + "step": 83275 + }, + { + "epoch": 12.403932082216265, + "grad_norm": 0.018509183079004288, + "learning_rate": 1.8935937198830343e-05, + "loss": 0.0563, + "num_input_tokens_seen": 48309144, + "step": 83280 + }, + { + "epoch": 12.404676794757224, + "grad_norm": 0.0010727735934779048, + "learning_rate": 1.893278487005087e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48311864, + "step": 83285 + }, + { + "epoch": 12.405421507298183, + "grad_norm": 0.03466612845659256, + "learning_rate": 1.8929632643770824e-05, + "loss": 0.0403, + "num_input_tokens_seen": 48314680, + "step": 83290 + }, + { + "epoch": 12.406166219839141, + "grad_norm": 0.004834064748138189, + "learning_rate": 1.8926480520043472e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48317816, + "step": 83295 + }, + { + "epoch": 12.406910932380102, + "grad_norm": 0.00048759017954580486, + "learning_rate": 1.892332849892206e-05, + "loss": 0.0006, + "num_input_tokens_seen": 48320600, + "step": 83300 + }, + { + "epoch": 12.40765564492106, + "grad_norm": 0.061508242040872574, + "learning_rate": 1.8920176580459827e-05, + "loss": 0.1222, + "num_input_tokens_seen": 48323768, + "step": 83305 + }, + { + "epoch": 12.40840035746202, + "grad_norm": 0.006603944581001997, + "learning_rate": 1.8917024764710043e-05, + "loss": 0.1069, + "num_input_tokens_seen": 48326968, + "step": 83310 + }, + { + "epoch": 12.409145070002978, + "grad_norm": 0.0005897486698813736, + "learning_rate": 1.8913873051725935e-05, + "loss": 0.1191, + "num_input_tokens_seen": 48329784, + "step": 83315 + }, + { + "epoch": 12.409889782543939, + "grad_norm": 0.023776108399033546, + "learning_rate": 1.8910721441560765e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48332760, + "step": 83320 + }, + { + "epoch": 12.410634495084897, + "grad_norm": 0.04864604398608208, + "learning_rate": 1.8907569934267756e-05, + "loss": 0.0801, + "num_input_tokens_seen": 48335512, + "step": 83325 + }, + { + "epoch": 12.411379207625856, + "grad_norm": 0.006425455212593079, + "learning_rate": 1.890441852990017e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48338648, + "step": 83330 + }, + { + "epoch": 12.412123920166815, + "grad_norm": 0.0009815279627218843, + "learning_rate": 1.890126722851124e-05, + "loss": 0.0479, + "num_input_tokens_seen": 48341624, + "step": 83335 + }, + { + "epoch": 12.412868632707776, + "grad_norm": 0.00059878034517169, + "learning_rate": 1.8898116030154185e-05, + "loss": 0.0008, + "num_input_tokens_seen": 48344472, + "step": 83340 + }, + { + "epoch": 12.413613345248734, + "grad_norm": 0.031511444598436356, + "learning_rate": 1.8894964934882274e-05, + "loss": 0.1019, + "num_input_tokens_seen": 48347384, + "step": 83345 + }, + { + "epoch": 12.414358057789693, + "grad_norm": 0.07173248380422592, + "learning_rate": 1.8891813942748717e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48350040, + "step": 83350 + }, + { + "epoch": 12.415102770330652, + "grad_norm": 0.09676659107208252, + "learning_rate": 1.8888663053806765e-05, + "loss": 0.0054, + "num_input_tokens_seen": 48352888, + "step": 83355 + }, + { + "epoch": 12.415847482871612, + "grad_norm": 37.064727783203125, + "learning_rate": 1.8885512268109625e-05, + "loss": 0.004, + "num_input_tokens_seen": 48355768, + "step": 83360 + }, + { + "epoch": 12.416592195412571, + "grad_norm": 1.3100559711456299, + "learning_rate": 1.8882361585710554e-05, + "loss": 0.015, + "num_input_tokens_seen": 48358840, + "step": 83365 + }, + { + "epoch": 12.41733690795353, + "grad_norm": 0.002229951787739992, + "learning_rate": 1.887921100666275e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48361816, + "step": 83370 + }, + { + "epoch": 12.418081620494489, + "grad_norm": 0.00030191431869752705, + "learning_rate": 1.8876060531019474e-05, + "loss": 0.0, + "num_input_tokens_seen": 48364600, + "step": 83375 + }, + { + "epoch": 12.41882633303545, + "grad_norm": 0.0003454240213613957, + "learning_rate": 1.887291015883393e-05, + "loss": 0.0, + "num_input_tokens_seen": 48367384, + "step": 83380 + }, + { + "epoch": 12.419571045576408, + "grad_norm": 0.011491983197629452, + "learning_rate": 1.8869759890159333e-05, + "loss": 0.0073, + "num_input_tokens_seen": 48370392, + "step": 83385 + }, + { + "epoch": 12.420315758117367, + "grad_norm": 0.02700946480035782, + "learning_rate": 1.8866609725048918e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48373240, + "step": 83390 + }, + { + "epoch": 12.421060470658325, + "grad_norm": 0.000343196967151016, + "learning_rate": 1.8863459663555885e-05, + "loss": 0.1221, + "num_input_tokens_seen": 48376408, + "step": 83395 + }, + { + "epoch": 12.421805183199286, + "grad_norm": 0.0009603679063729942, + "learning_rate": 1.8860309705733477e-05, + "loss": 0.0006, + "num_input_tokens_seen": 48379256, + "step": 83400 + }, + { + "epoch": 12.422549895740245, + "grad_norm": 0.0008867747965268791, + "learning_rate": 1.8857159851634888e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48382072, + "step": 83405 + }, + { + "epoch": 12.423294608281203, + "grad_norm": 0.0050738537684082985, + "learning_rate": 1.885401010131335e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48385528, + "step": 83410 + }, + { + "epoch": 12.424039320822162, + "grad_norm": 0.004725703038275242, + "learning_rate": 1.8850860454822056e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48388440, + "step": 83415 + }, + { + "epoch": 12.424784033363121, + "grad_norm": 0.0009289290755987167, + "learning_rate": 1.8847710912214233e-05, + "loss": 0.0289, + "num_input_tokens_seen": 48391320, + "step": 83420 + }, + { + "epoch": 12.425528745904082, + "grad_norm": 9.493379184277728e-05, + "learning_rate": 1.8844561473543082e-05, + "loss": 0.0978, + "num_input_tokens_seen": 48394008, + "step": 83425 + }, + { + "epoch": 12.42627345844504, + "grad_norm": 20.722814559936523, + "learning_rate": 1.8841412138861797e-05, + "loss": 0.0027, + "num_input_tokens_seen": 48397080, + "step": 83430 + }, + { + "epoch": 12.427018170985999, + "grad_norm": 22.295730590820312, + "learning_rate": 1.8838262908223602e-05, + "loss": 0.0071, + "num_input_tokens_seen": 48400056, + "step": 83435 + }, + { + "epoch": 12.427762883526958, + "grad_norm": 0.012610040605068207, + "learning_rate": 1.8835113781681686e-05, + "loss": 0.0, + "num_input_tokens_seen": 48402808, + "step": 83440 + }, + { + "epoch": 12.428507596067918, + "grad_norm": 0.017770064994692802, + "learning_rate": 1.8831964759289265e-05, + "loss": 0.089, + "num_input_tokens_seen": 48405560, + "step": 83445 + }, + { + "epoch": 12.429252308608877, + "grad_norm": 95.49100494384766, + "learning_rate": 1.8828815841099528e-05, + "loss": 0.013, + "num_input_tokens_seen": 48408568, + "step": 83450 + }, + { + "epoch": 12.429997021149836, + "grad_norm": 0.8775875568389893, + "learning_rate": 1.882566702716568e-05, + "loss": 0.0007, + "num_input_tokens_seen": 48411544, + "step": 83455 + }, + { + "epoch": 12.430741733690795, + "grad_norm": 0.001807410386390984, + "learning_rate": 1.8822518317540913e-05, + "loss": 0.0854, + "num_input_tokens_seen": 48414808, + "step": 83460 + }, + { + "epoch": 12.431486446231755, + "grad_norm": 0.005124798510223627, + "learning_rate": 1.8819369712278408e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48417656, + "step": 83465 + }, + { + "epoch": 12.432231158772714, + "grad_norm": 0.0735197514295578, + "learning_rate": 1.8816221211431382e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48420536, + "step": 83470 + }, + { + "epoch": 12.432975871313673, + "grad_norm": 0.13104109466075897, + "learning_rate": 1.8813072815053003e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48423288, + "step": 83475 + }, + { + "epoch": 12.433720583854631, + "grad_norm": 0.0008054655627347529, + "learning_rate": 1.880992452319648e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48426488, + "step": 83480 + }, + { + "epoch": 12.434465296395592, + "grad_norm": 0.0033817575313150883, + "learning_rate": 1.8806776335914986e-05, + "loss": 0.0019, + "num_input_tokens_seen": 48429400, + "step": 83485 + }, + { + "epoch": 12.43521000893655, + "grad_norm": 0.05396902933716774, + "learning_rate": 1.8803628253261717e-05, + "loss": 0.0, + "num_input_tokens_seen": 48432408, + "step": 83490 + }, + { + "epoch": 12.43595472147751, + "grad_norm": 0.0009447966585867107, + "learning_rate": 1.880048027528984e-05, + "loss": 0.0, + "num_input_tokens_seen": 48435800, + "step": 83495 + }, + { + "epoch": 12.436699434018468, + "grad_norm": 0.00011089140753028914, + "learning_rate": 1.879733240205256e-05, + "loss": 0.0465, + "num_input_tokens_seen": 48438648, + "step": 83500 + }, + { + "epoch": 12.437444146559429, + "grad_norm": 0.00370609643869102, + "learning_rate": 1.879418463360304e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48441368, + "step": 83505 + }, + { + "epoch": 12.438188859100388, + "grad_norm": 8.139329293044284e-05, + "learning_rate": 1.8791036969994462e-05, + "loss": 0.0, + "num_input_tokens_seen": 48444312, + "step": 83510 + }, + { + "epoch": 12.438933571641346, + "grad_norm": 0.012143961153924465, + "learning_rate": 1.8787889411280005e-05, + "loss": 0.0, + "num_input_tokens_seen": 48447288, + "step": 83515 + }, + { + "epoch": 12.439678284182305, + "grad_norm": 0.003395702689886093, + "learning_rate": 1.8784741957512842e-05, + "loss": 0.0914, + "num_input_tokens_seen": 48450200, + "step": 83520 + }, + { + "epoch": 12.440422996723266, + "grad_norm": 3.525496244430542, + "learning_rate": 1.878159460874615e-05, + "loss": 0.0017, + "num_input_tokens_seen": 48453048, + "step": 83525 + }, + { + "epoch": 12.441167709264224, + "grad_norm": 0.0500684455037117, + "learning_rate": 1.8778447365033085e-05, + "loss": 0.0, + "num_input_tokens_seen": 48455896, + "step": 83530 + }, + { + "epoch": 12.441912421805183, + "grad_norm": 0.0009822063148021698, + "learning_rate": 1.877530022642684e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48458776, + "step": 83535 + }, + { + "epoch": 12.442657134346142, + "grad_norm": 0.37294745445251465, + "learning_rate": 1.8772153192980578e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48461688, + "step": 83540 + }, + { + "epoch": 12.443401846887102, + "grad_norm": 0.00029150553746148944, + "learning_rate": 1.8769006264747445e-05, + "loss": 0.0, + "num_input_tokens_seen": 48464376, + "step": 83545 + }, + { + "epoch": 12.444146559428061, + "grad_norm": 0.6233388185501099, + "learning_rate": 1.8765859441780625e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48467096, + "step": 83550 + }, + { + "epoch": 12.44489127196902, + "grad_norm": 0.0844317376613617, + "learning_rate": 1.8762712724133266e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48469880, + "step": 83555 + }, + { + "epoch": 12.445635984509979, + "grad_norm": 0.01714310795068741, + "learning_rate": 1.8759566111858544e-05, + "loss": 0.0007, + "num_input_tokens_seen": 48472920, + "step": 83560 + }, + { + "epoch": 12.44638069705094, + "grad_norm": 0.00035165701410733163, + "learning_rate": 1.87564196050096e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48476120, + "step": 83565 + }, + { + "epoch": 12.447125409591898, + "grad_norm": 0.0005141255678609014, + "learning_rate": 1.8753273203639614e-05, + "loss": 0.1284, + "num_input_tokens_seen": 48478936, + "step": 83570 + }, + { + "epoch": 12.447870122132857, + "grad_norm": 40.02486038208008, + "learning_rate": 1.875012690780172e-05, + "loss": 0.1939, + "num_input_tokens_seen": 48481592, + "step": 83575 + }, + { + "epoch": 12.448614834673815, + "grad_norm": 0.0033616386353969574, + "learning_rate": 1.8746980717549088e-05, + "loss": 0.1782, + "num_input_tokens_seen": 48484280, + "step": 83580 + }, + { + "epoch": 12.449359547214776, + "grad_norm": 0.00028848214424215257, + "learning_rate": 1.8743834632934858e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48487224, + "step": 83585 + }, + { + "epoch": 12.450104259755735, + "grad_norm": 0.0009941714815795422, + "learning_rate": 1.8740688654012172e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48490104, + "step": 83590 + }, + { + "epoch": 12.450848972296694, + "grad_norm": 0.00023388474073726684, + "learning_rate": 1.8737542780834205e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48492888, + "step": 83595 + }, + { + "epoch": 12.451593684837652, + "grad_norm": 0.0039053023792803288, + "learning_rate": 1.8734397013454075e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48495672, + "step": 83600 + }, + { + "epoch": 12.452338397378611, + "grad_norm": 0.0014804223319515586, + "learning_rate": 1.873125135192495e-05, + "loss": 0.0045, + "num_input_tokens_seen": 48498584, + "step": 83605 + }, + { + "epoch": 12.453083109919572, + "grad_norm": 0.019364580512046814, + "learning_rate": 1.8728105796299954e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48501496, + "step": 83610 + }, + { + "epoch": 12.45382782246053, + "grad_norm": 0.021952476352453232, + "learning_rate": 1.8724960346632247e-05, + "loss": 0.0009, + "num_input_tokens_seen": 48504312, + "step": 83615 + }, + { + "epoch": 12.454572535001489, + "grad_norm": 0.9615231156349182, + "learning_rate": 1.8721815002974954e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48507096, + "step": 83620 + }, + { + "epoch": 12.455317247542448, + "grad_norm": 0.0010281651047989726, + "learning_rate": 1.8718669765381207e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48510200, + "step": 83625 + }, + { + "epoch": 12.456061960083408, + "grad_norm": 0.0009953661356121302, + "learning_rate": 1.8715524633904157e-05, + "loss": 0.0, + "num_input_tokens_seen": 48513240, + "step": 83630 + }, + { + "epoch": 12.456806672624367, + "grad_norm": 0.00017801397189032286, + "learning_rate": 1.8712379608596926e-05, + "loss": 0.0005, + "num_input_tokens_seen": 48516120, + "step": 83635 + }, + { + "epoch": 12.457551385165326, + "grad_norm": 0.0004700259305536747, + "learning_rate": 1.8709234689512656e-05, + "loss": 0.0, + "num_input_tokens_seen": 48519128, + "step": 83640 + }, + { + "epoch": 12.458296097706285, + "grad_norm": 0.004129879642277956, + "learning_rate": 1.8706089876704468e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48522008, + "step": 83645 + }, + { + "epoch": 12.459040810247245, + "grad_norm": 0.001217045122757554, + "learning_rate": 1.8702945170225504e-05, + "loss": 0.0645, + "num_input_tokens_seen": 48524856, + "step": 83650 + }, + { + "epoch": 12.459785522788204, + "grad_norm": 0.013571255840361118, + "learning_rate": 1.8699800570128868e-05, + "loss": 0.0, + "num_input_tokens_seen": 48527736, + "step": 83655 + }, + { + "epoch": 12.460530235329163, + "grad_norm": 113.85453796386719, + "learning_rate": 1.8696656076467705e-05, + "loss": 0.106, + "num_input_tokens_seen": 48530648, + "step": 83660 + }, + { + "epoch": 12.461274947870121, + "grad_norm": 0.0022680810652673244, + "learning_rate": 1.8693511689295138e-05, + "loss": 0.0474, + "num_input_tokens_seen": 48533560, + "step": 83665 + }, + { + "epoch": 12.462019660411082, + "grad_norm": 31.206619262695312, + "learning_rate": 1.8690367408664265e-05, + "loss": 0.0032, + "num_input_tokens_seen": 48536376, + "step": 83670 + }, + { + "epoch": 12.46276437295204, + "grad_norm": 2.263149872305803e-05, + "learning_rate": 1.8687223234628237e-05, + "loss": 0.1688, + "num_input_tokens_seen": 48539160, + "step": 83675 + }, + { + "epoch": 12.463509085493, + "grad_norm": 0.003675320418551564, + "learning_rate": 1.868407916724015e-05, + "loss": 0.0011, + "num_input_tokens_seen": 48542328, + "step": 83680 + }, + { + "epoch": 12.464253798033958, + "grad_norm": 15.167122840881348, + "learning_rate": 1.868093520655313e-05, + "loss": 0.039, + "num_input_tokens_seen": 48545240, + "step": 83685 + }, + { + "epoch": 12.464998510574919, + "grad_norm": 0.003559547709301114, + "learning_rate": 1.8677791352620278e-05, + "loss": 0.2344, + "num_input_tokens_seen": 48548376, + "step": 83690 + }, + { + "epoch": 12.465743223115878, + "grad_norm": 0.0028839618898928165, + "learning_rate": 1.8674647605494727e-05, + "loss": 0.0675, + "num_input_tokens_seen": 48551576, + "step": 83695 + }, + { + "epoch": 12.466487935656836, + "grad_norm": 0.0018530999077484012, + "learning_rate": 1.8671503965229572e-05, + "loss": 0.0494, + "num_input_tokens_seen": 48554296, + "step": 83700 + }, + { + "epoch": 12.467232648197795, + "grad_norm": 0.26023316383361816, + "learning_rate": 1.8668360431877918e-05, + "loss": 0.0014, + "num_input_tokens_seen": 48556824, + "step": 83705 + }, + { + "epoch": 12.467977360738756, + "grad_norm": 0.024234987795352936, + "learning_rate": 1.8665217005492892e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48560056, + "step": 83710 + }, + { + "epoch": 12.468722073279714, + "grad_norm": 0.00311780977062881, + "learning_rate": 1.8662073686127575e-05, + "loss": 0.0156, + "num_input_tokens_seen": 48562936, + "step": 83715 + }, + { + "epoch": 12.469466785820673, + "grad_norm": 0.0004254875238984823, + "learning_rate": 1.865893047383509e-05, + "loss": 0.0047, + "num_input_tokens_seen": 48565912, + "step": 83720 + }, + { + "epoch": 12.470211498361632, + "grad_norm": 0.004643264226615429, + "learning_rate": 1.865578736866852e-05, + "loss": 0.0004, + "num_input_tokens_seen": 48568728, + "step": 83725 + }, + { + "epoch": 12.470956210902592, + "grad_norm": 0.049761511385440826, + "learning_rate": 1.8652644370680986e-05, + "loss": 0.0004, + "num_input_tokens_seen": 48571640, + "step": 83730 + }, + { + "epoch": 12.471700923443551, + "grad_norm": 0.03579426556825638, + "learning_rate": 1.8649501479925562e-05, + "loss": 0.0453, + "num_input_tokens_seen": 48574648, + "step": 83735 + }, + { + "epoch": 12.47244563598451, + "grad_norm": 0.0010872173588722944, + "learning_rate": 1.8646358696455365e-05, + "loss": 0.114, + "num_input_tokens_seen": 48577400, + "step": 83740 + }, + { + "epoch": 12.473190348525469, + "grad_norm": 12.47389030456543, + "learning_rate": 1.8643216020323483e-05, + "loss": 0.1502, + "num_input_tokens_seen": 48580408, + "step": 83745 + }, + { + "epoch": 12.473935061066427, + "grad_norm": 0.0027448430191725492, + "learning_rate": 1.8640073451583003e-05, + "loss": 0.0104, + "num_input_tokens_seen": 48583384, + "step": 83750 + }, + { + "epoch": 12.474679773607388, + "grad_norm": 0.001909988117404282, + "learning_rate": 1.8636930990287015e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48586456, + "step": 83755 + }, + { + "epoch": 12.475424486148347, + "grad_norm": 0.0005448790034279227, + "learning_rate": 1.8633788636488605e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48589048, + "step": 83760 + }, + { + "epoch": 12.476169198689306, + "grad_norm": 0.0016361671732738614, + "learning_rate": 1.8630646390240876e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48592056, + "step": 83765 + }, + { + "epoch": 12.476913911230264, + "grad_norm": 0.0029151649214327335, + "learning_rate": 1.8627504251596895e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48595128, + "step": 83770 + }, + { + "epoch": 12.477658623771225, + "grad_norm": 0.0011944060679525137, + "learning_rate": 1.862436222060976e-05, + "loss": 0.0024, + "num_input_tokens_seen": 48597880, + "step": 83775 + }, + { + "epoch": 12.478403336312184, + "grad_norm": 0.0029628898482769728, + "learning_rate": 1.8621220297332544e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48600888, + "step": 83780 + }, + { + "epoch": 12.479148048853142, + "grad_norm": 0.003217072458937764, + "learning_rate": 1.8618078481818324e-05, + "loss": 0.2596, + "num_input_tokens_seen": 48603864, + "step": 83785 + }, + { + "epoch": 12.479892761394101, + "grad_norm": 0.0015564956702291965, + "learning_rate": 1.861493677412019e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48606456, + "step": 83790 + }, + { + "epoch": 12.480637473935062, + "grad_norm": 0.0028756640385836363, + "learning_rate": 1.8611795174291198e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48609240, + "step": 83795 + }, + { + "epoch": 12.48138218647602, + "grad_norm": 0.001820755423977971, + "learning_rate": 1.8608653682384442e-05, + "loss": 0.0306, + "num_input_tokens_seen": 48612088, + "step": 83800 + }, + { + "epoch": 12.48212689901698, + "grad_norm": 0.011202524416148663, + "learning_rate": 1.8605512298452977e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48615128, + "step": 83805 + }, + { + "epoch": 12.482871611557938, + "grad_norm": 0.010135861113667488, + "learning_rate": 1.8602371022549895e-05, + "loss": 0.0618, + "num_input_tokens_seen": 48618104, + "step": 83810 + }, + { + "epoch": 12.483616324098898, + "grad_norm": 0.0031495285220444202, + "learning_rate": 1.8599229854728244e-05, + "loss": 0.0008, + "num_input_tokens_seen": 48620984, + "step": 83815 + }, + { + "epoch": 12.484361036639857, + "grad_norm": 0.0018486728658899665, + "learning_rate": 1.8596088795041106e-05, + "loss": 0.0061, + "num_input_tokens_seen": 48623672, + "step": 83820 + }, + { + "epoch": 12.485105749180816, + "grad_norm": 0.016020875424146652, + "learning_rate": 1.859294784354154e-05, + "loss": 0.2282, + "num_input_tokens_seen": 48626648, + "step": 83825 + }, + { + "epoch": 12.485850461721775, + "grad_norm": 0.0007944080862216651, + "learning_rate": 1.8589807000282592e-05, + "loss": 0.1208, + "num_input_tokens_seen": 48629912, + "step": 83830 + }, + { + "epoch": 12.486595174262735, + "grad_norm": 20.76690101623535, + "learning_rate": 1.858666626531736e-05, + "loss": 0.0537, + "num_input_tokens_seen": 48633016, + "step": 83835 + }, + { + "epoch": 12.487339886803694, + "grad_norm": 0.0005462953122332692, + "learning_rate": 1.8583525638698873e-05, + "loss": 0.0004, + "num_input_tokens_seen": 48635928, + "step": 83840 + }, + { + "epoch": 12.488084599344653, + "grad_norm": 0.002933611860498786, + "learning_rate": 1.85803851204802e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48638776, + "step": 83845 + }, + { + "epoch": 12.488829311885612, + "grad_norm": 0.00705405417829752, + "learning_rate": 1.857724471071439e-05, + "loss": 0.0021, + "num_input_tokens_seen": 48642168, + "step": 83850 + }, + { + "epoch": 12.489574024426572, + "grad_norm": 0.007889771834015846, + "learning_rate": 1.8574104409454514e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48645208, + "step": 83855 + }, + { + "epoch": 12.49031873696753, + "grad_norm": 0.0030032226350158453, + "learning_rate": 1.857096421675361e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48648216, + "step": 83860 + }, + { + "epoch": 12.49106344950849, + "grad_norm": 0.005859095603227615, + "learning_rate": 1.8567824132664724e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48651000, + "step": 83865 + }, + { + "epoch": 12.491808162049448, + "grad_norm": 0.047943927347660065, + "learning_rate": 1.856468415724092e-05, + "loss": 0.0004, + "num_input_tokens_seen": 48653560, + "step": 83870 + }, + { + "epoch": 12.492552874590409, + "grad_norm": 0.00522261206060648, + "learning_rate": 1.8561544290535234e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48656312, + "step": 83875 + }, + { + "epoch": 12.493297587131368, + "grad_norm": 0.051693521440029144, + "learning_rate": 1.8558404532600717e-05, + "loss": 0.0058, + "num_input_tokens_seen": 48659512, + "step": 83880 + }, + { + "epoch": 12.494042299672326, + "grad_norm": 0.8350756168365479, + "learning_rate": 1.8555264883490397e-05, + "loss": 0.0048, + "num_input_tokens_seen": 48662488, + "step": 83885 + }, + { + "epoch": 12.494787012213285, + "grad_norm": 50.86141586303711, + "learning_rate": 1.8552125343257337e-05, + "loss": 0.3715, + "num_input_tokens_seen": 48665368, + "step": 83890 + }, + { + "epoch": 12.495531724754246, + "grad_norm": 0.0007037526229396462, + "learning_rate": 1.8548985911954557e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48668056, + "step": 83895 + }, + { + "epoch": 12.496276437295204, + "grad_norm": 17.117488861083984, + "learning_rate": 1.8545846589635115e-05, + "loss": 0.0712, + "num_input_tokens_seen": 48670808, + "step": 83900 + }, + { + "epoch": 12.497021149836163, + "grad_norm": 0.009611174464225769, + "learning_rate": 1.8542707376352033e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48673816, + "step": 83905 + }, + { + "epoch": 12.497765862377122, + "grad_norm": 0.0018966204952448606, + "learning_rate": 1.853956827215834e-05, + "loss": 0.0286, + "num_input_tokens_seen": 48676632, + "step": 83910 + }, + { + "epoch": 12.498510574918082, + "grad_norm": 0.9773809909820557, + "learning_rate": 1.8536429277107086e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48679576, + "step": 83915 + }, + { + "epoch": 12.499255287459041, + "grad_norm": 0.49582743644714355, + "learning_rate": 1.8533290391251278e-05, + "loss": 0.0009, + "num_input_tokens_seen": 48682360, + "step": 83920 + }, + { + "epoch": 12.5, + "grad_norm": 0.006569486111402512, + "learning_rate": 1.8530151614643966e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48685272, + "step": 83925 + }, + { + "epoch": 12.500744712540959, + "grad_norm": 0.0266969483345747, + "learning_rate": 1.8527012947338155e-05, + "loss": 0.2333, + "num_input_tokens_seen": 48687800, + "step": 83930 + }, + { + "epoch": 12.501489425081918, + "grad_norm": 0.001603411161340773, + "learning_rate": 1.852387438938689e-05, + "loss": 0.0, + "num_input_tokens_seen": 48690808, + "step": 83935 + }, + { + "epoch": 12.502234137622878, + "grad_norm": 0.006692902185022831, + "learning_rate": 1.8520735940843187e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48693720, + "step": 83940 + }, + { + "epoch": 12.502978850163837, + "grad_norm": 0.0003486633358988911, + "learning_rate": 1.8517597601760062e-05, + "loss": 0.1533, + "num_input_tokens_seen": 48696440, + "step": 83945 + }, + { + "epoch": 12.503723562704796, + "grad_norm": 0.024544833227992058, + "learning_rate": 1.851445937219054e-05, + "loss": 0.0006, + "num_input_tokens_seen": 48699448, + "step": 83950 + }, + { + "epoch": 12.504468275245754, + "grad_norm": 0.004417131654918194, + "learning_rate": 1.8511321252187625e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48702168, + "step": 83955 + }, + { + "epoch": 12.505212987786715, + "grad_norm": 0.002616036683320999, + "learning_rate": 1.8508183241804356e-05, + "loss": 0.0507, + "num_input_tokens_seen": 48705176, + "step": 83960 + }, + { + "epoch": 12.505957700327674, + "grad_norm": 0.08919420838356018, + "learning_rate": 1.850504534109372e-05, + "loss": 0.0006, + "num_input_tokens_seen": 48708216, + "step": 83965 + }, + { + "epoch": 12.506702412868632, + "grad_norm": 0.11466941237449646, + "learning_rate": 1.8501907550108752e-05, + "loss": 0.0015, + "num_input_tokens_seen": 48711160, + "step": 83970 + }, + { + "epoch": 12.507447125409591, + "grad_norm": 6.697941716993228e-05, + "learning_rate": 1.8498769868902445e-05, + "loss": 0.0354, + "num_input_tokens_seen": 48713944, + "step": 83975 + }, + { + "epoch": 12.508191837950552, + "grad_norm": 0.0075372979044914246, + "learning_rate": 1.849563229752782e-05, + "loss": 0.007, + "num_input_tokens_seen": 48717080, + "step": 83980 + }, + { + "epoch": 12.50893655049151, + "grad_norm": 0.001587212085723877, + "learning_rate": 1.849249483603788e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48720120, + "step": 83985 + }, + { + "epoch": 12.50968126303247, + "grad_norm": 0.0002520551788620651, + "learning_rate": 1.8489357484485616e-05, + "loss": 0.0, + "num_input_tokens_seen": 48722936, + "step": 83990 + }, + { + "epoch": 12.510425975573428, + "grad_norm": 0.008086143992841244, + "learning_rate": 1.8486220242924042e-05, + "loss": 0.0679, + "num_input_tokens_seen": 48725944, + "step": 83995 + }, + { + "epoch": 12.511170688114388, + "grad_norm": 0.0008161774603649974, + "learning_rate": 1.8483083111406154e-05, + "loss": 0.1843, + "num_input_tokens_seen": 48728824, + "step": 84000 + }, + { + "epoch": 12.511915400655347, + "grad_norm": 0.004545184783637524, + "learning_rate": 1.8479946089984963e-05, + "loss": 0.1315, + "num_input_tokens_seen": 48731768, + "step": 84005 + }, + { + "epoch": 12.512660113196306, + "grad_norm": 3.998817192041315e-05, + "learning_rate": 1.8476809178713446e-05, + "loss": 0.0225, + "num_input_tokens_seen": 48734936, + "step": 84010 + }, + { + "epoch": 12.513404825737265, + "grad_norm": 0.0020465089473873377, + "learning_rate": 1.8473672377644617e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48737624, + "step": 84015 + }, + { + "epoch": 12.514149538278225, + "grad_norm": 0.02780010551214218, + "learning_rate": 1.8470535686831446e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48740568, + "step": 84020 + }, + { + "epoch": 12.514894250819184, + "grad_norm": 12.18772029876709, + "learning_rate": 1.8467399106326954e-05, + "loss": 0.0115, + "num_input_tokens_seen": 48743576, + "step": 84025 + }, + { + "epoch": 12.515638963360143, + "grad_norm": 0.0026594328228384256, + "learning_rate": 1.8464262636184117e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48746264, + "step": 84030 + }, + { + "epoch": 12.516383675901102, + "grad_norm": 0.002317757112905383, + "learning_rate": 1.8461126276455904e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48748984, + "step": 84035 + }, + { + "epoch": 12.517128388442062, + "grad_norm": 0.0008444222039543092, + "learning_rate": 1.8457990027195325e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48751672, + "step": 84040 + }, + { + "epoch": 12.51787310098302, + "grad_norm": 0.20277543365955353, + "learning_rate": 1.8454853888455352e-05, + "loss": 0.0004, + "num_input_tokens_seen": 48754552, + "step": 84045 + }, + { + "epoch": 12.51861781352398, + "grad_norm": 10.643206596374512, + "learning_rate": 1.845171786028898e-05, + "loss": 0.1296, + "num_input_tokens_seen": 48757240, + "step": 84050 + }, + { + "epoch": 12.519362526064938, + "grad_norm": 8.910054748412222e-05, + "learning_rate": 1.8448581942749167e-05, + "loss": 0.0, + "num_input_tokens_seen": 48760152, + "step": 84055 + }, + { + "epoch": 12.520107238605899, + "grad_norm": 0.21946080029010773, + "learning_rate": 1.844544613588891e-05, + "loss": 0.0463, + "num_input_tokens_seen": 48762872, + "step": 84060 + }, + { + "epoch": 12.520851951146858, + "grad_norm": 0.01962798833847046, + "learning_rate": 1.8442310439761185e-05, + "loss": 0.2065, + "num_input_tokens_seen": 48765752, + "step": 84065 + }, + { + "epoch": 12.521596663687816, + "grad_norm": 2.829439401626587, + "learning_rate": 1.8439174854418946e-05, + "loss": 0.0008, + "num_input_tokens_seen": 48768632, + "step": 84070 + }, + { + "epoch": 12.522341376228775, + "grad_norm": 6.024002686899621e-06, + "learning_rate": 1.843603937991519e-05, + "loss": 0.0797, + "num_input_tokens_seen": 48771416, + "step": 84075 + }, + { + "epoch": 12.523086088769734, + "grad_norm": 2.4750592708587646, + "learning_rate": 1.8432904016302872e-05, + "loss": 0.0016, + "num_input_tokens_seen": 48774232, + "step": 84080 + }, + { + "epoch": 12.523830801310694, + "grad_norm": 0.005578154698014259, + "learning_rate": 1.8429768763634974e-05, + "loss": 0.1255, + "num_input_tokens_seen": 48777368, + "step": 84085 + }, + { + "epoch": 12.524575513851653, + "grad_norm": 0.0018308605067431927, + "learning_rate": 1.8426633621964443e-05, + "loss": 0.0008, + "num_input_tokens_seen": 48780344, + "step": 84090 + }, + { + "epoch": 12.525320226392612, + "grad_norm": 0.01996670290827751, + "learning_rate": 1.8423498591344267e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48783032, + "step": 84095 + }, + { + "epoch": 12.526064938933573, + "grad_norm": 0.0017997437389567494, + "learning_rate": 1.8420363671827387e-05, + "loss": 0.0, + "num_input_tokens_seen": 48785816, + "step": 84100 + }, + { + "epoch": 12.526809651474531, + "grad_norm": 0.0008726027444936335, + "learning_rate": 1.8417228863466786e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48788696, + "step": 84105 + }, + { + "epoch": 12.52755436401549, + "grad_norm": 0.00011801223445218056, + "learning_rate": 1.841409416631541e-05, + "loss": 0.0004, + "num_input_tokens_seen": 48791544, + "step": 84110 + }, + { + "epoch": 12.528299076556449, + "grad_norm": 0.005591805558651686, + "learning_rate": 1.8410959580426222e-05, + "loss": 0.0, + "num_input_tokens_seen": 48794456, + "step": 84115 + }, + { + "epoch": 12.529043789097408, + "grad_norm": 0.003963560797274113, + "learning_rate": 1.8407825105852175e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48797528, + "step": 84120 + }, + { + "epoch": 12.529788501638368, + "grad_norm": 0.0012046018382534385, + "learning_rate": 1.8404690742646212e-05, + "loss": 0.0765, + "num_input_tokens_seen": 48800760, + "step": 84125 + }, + { + "epoch": 12.530533214179327, + "grad_norm": 0.02616754174232483, + "learning_rate": 1.840155649086131e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48803608, + "step": 84130 + }, + { + "epoch": 12.531277926720286, + "grad_norm": 0.010909679345786572, + "learning_rate": 1.8398422350550386e-05, + "loss": 0.0407, + "num_input_tokens_seen": 48806712, + "step": 84135 + }, + { + "epoch": 12.532022639261244, + "grad_norm": 0.009717622771859169, + "learning_rate": 1.8395288321766424e-05, + "loss": 0.0009, + "num_input_tokens_seen": 48809624, + "step": 84140 + }, + { + "epoch": 12.532767351802205, + "grad_norm": 0.0004073054587934166, + "learning_rate": 1.8392154404562354e-05, + "loss": 0.0004, + "num_input_tokens_seen": 48812472, + "step": 84145 + }, + { + "epoch": 12.533512064343164, + "grad_norm": 0.0014244545018300414, + "learning_rate": 1.8389020598991113e-05, + "loss": 0.0055, + "num_input_tokens_seen": 48815480, + "step": 84150 + }, + { + "epoch": 12.534256776884122, + "grad_norm": 0.0031891989056020975, + "learning_rate": 1.8385886905105653e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48818072, + "step": 84155 + }, + { + "epoch": 12.535001489425081, + "grad_norm": 0.0009117511217482388, + "learning_rate": 1.8382753322958902e-05, + "loss": 0.2971, + "num_input_tokens_seen": 48820920, + "step": 84160 + }, + { + "epoch": 12.535746201966042, + "grad_norm": 0.0038448774721473455, + "learning_rate": 1.837961985260382e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48823608, + "step": 84165 + }, + { + "epoch": 12.536490914507, + "grad_norm": 0.01133527047932148, + "learning_rate": 1.8376486494093327e-05, + "loss": 0.0007, + "num_input_tokens_seen": 48826616, + "step": 84170 + }, + { + "epoch": 12.53723562704796, + "grad_norm": 0.0012791515327990055, + "learning_rate": 1.837335324748036e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48829592, + "step": 84175 + }, + { + "epoch": 12.537980339588918, + "grad_norm": 0.00335636711679399, + "learning_rate": 1.8370220112817854e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48832472, + "step": 84180 + }, + { + "epoch": 12.538725052129879, + "grad_norm": 0.0007922389195300639, + "learning_rate": 1.836708709015875e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48835544, + "step": 84185 + }, + { + "epoch": 12.539469764670837, + "grad_norm": 1.2840983867645264, + "learning_rate": 1.836395417955597e-05, + "loss": 0.197, + "num_input_tokens_seen": 48838808, + "step": 84190 + }, + { + "epoch": 12.540214477211796, + "grad_norm": 0.017772698774933815, + "learning_rate": 1.836082138106242e-05, + "loss": 0.01, + "num_input_tokens_seen": 48841816, + "step": 84195 + }, + { + "epoch": 12.540959189752755, + "grad_norm": 0.0045549301430583, + "learning_rate": 1.8357688694731063e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48845208, + "step": 84200 + }, + { + "epoch": 12.541703902293715, + "grad_norm": 0.0035679196007549763, + "learning_rate": 1.8354556120614796e-05, + "loss": 0.0005, + "num_input_tokens_seen": 48848440, + "step": 84205 + }, + { + "epoch": 12.542448614834674, + "grad_norm": 0.07951401174068451, + "learning_rate": 1.8351423658766557e-05, + "loss": 0.0006, + "num_input_tokens_seen": 48851224, + "step": 84210 + }, + { + "epoch": 12.543193327375633, + "grad_norm": 0.001631109626032412, + "learning_rate": 1.8348291309239248e-05, + "loss": 0.0, + "num_input_tokens_seen": 48854200, + "step": 84215 + }, + { + "epoch": 12.543938039916592, + "grad_norm": 0.005001672077924013, + "learning_rate": 1.8345159072085803e-05, + "loss": 0.0012, + "num_input_tokens_seen": 48857368, + "step": 84220 + }, + { + "epoch": 12.544682752457552, + "grad_norm": 0.005474458448588848, + "learning_rate": 1.8342026947359137e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48860472, + "step": 84225 + }, + { + "epoch": 12.545427464998511, + "grad_norm": 0.00027678016340360045, + "learning_rate": 1.8338894935112144e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48863224, + "step": 84230 + }, + { + "epoch": 12.54617217753947, + "grad_norm": 0.0003459424478933215, + "learning_rate": 1.8335763035397765e-05, + "loss": 0.0, + "num_input_tokens_seen": 48866168, + "step": 84235 + }, + { + "epoch": 12.546916890080428, + "grad_norm": 33.383705139160156, + "learning_rate": 1.833263124826889e-05, + "loss": 0.1814, + "num_input_tokens_seen": 48869208, + "step": 84240 + }, + { + "epoch": 12.547661602621389, + "grad_norm": 329.4330139160156, + "learning_rate": 1.832949957377844e-05, + "loss": 0.0676, + "num_input_tokens_seen": 48871960, + "step": 84245 + }, + { + "epoch": 12.548406315162348, + "grad_norm": 3.152750253677368, + "learning_rate": 1.83263680119793e-05, + "loss": 0.072, + "num_input_tokens_seen": 48875064, + "step": 84250 + }, + { + "epoch": 12.549151027703306, + "grad_norm": 0.00047776271821931005, + "learning_rate": 1.8323236562924405e-05, + "loss": 0.0023, + "num_input_tokens_seen": 48877720, + "step": 84255 + }, + { + "epoch": 12.549895740244265, + "grad_norm": 0.11758042126893997, + "learning_rate": 1.8320105226666628e-05, + "loss": 0.0002, + "num_input_tokens_seen": 48880600, + "step": 84260 + }, + { + "epoch": 12.550640452785224, + "grad_norm": 0.0009572610724717379, + "learning_rate": 1.8316974003258898e-05, + "loss": 0.0, + "num_input_tokens_seen": 48883224, + "step": 84265 + }, + { + "epoch": 12.551385165326185, + "grad_norm": 0.0014121172716841102, + "learning_rate": 1.8313842892754097e-05, + "loss": 0.0, + "num_input_tokens_seen": 48885976, + "step": 84270 + }, + { + "epoch": 12.552129877867143, + "grad_norm": 0.11290369182825089, + "learning_rate": 1.8310711895205125e-05, + "loss": 0.0003, + "num_input_tokens_seen": 48888856, + "step": 84275 + }, + { + "epoch": 12.552874590408102, + "grad_norm": 0.004074270371347666, + "learning_rate": 1.8307581010664875e-05, + "loss": 0.0, + "num_input_tokens_seen": 48891832, + "step": 84280 + }, + { + "epoch": 12.553619302949063, + "grad_norm": 0.006661681924015284, + "learning_rate": 1.8304450239186235e-05, + "loss": 0.2909, + "num_input_tokens_seen": 48894808, + "step": 84285 + }, + { + "epoch": 12.554364015490021, + "grad_norm": 0.00025793397799134254, + "learning_rate": 1.8301319580822112e-05, + "loss": 0.0024, + "num_input_tokens_seen": 48898072, + "step": 84290 + }, + { + "epoch": 12.55510872803098, + "grad_norm": 0.00016083530499599874, + "learning_rate": 1.829818903562538e-05, + "loss": 0.0, + "num_input_tokens_seen": 48901080, + "step": 84295 + }, + { + "epoch": 12.555853440571939, + "grad_norm": 0.002504517324268818, + "learning_rate": 1.8295058603648942e-05, + "loss": 0.0011, + "num_input_tokens_seen": 48904344, + "step": 84300 + }, + { + "epoch": 12.556598153112898, + "grad_norm": 0.0002930997288785875, + "learning_rate": 1.8291928284945668e-05, + "loss": 0.0, + "num_input_tokens_seen": 48907192, + "step": 84305 + }, + { + "epoch": 12.557342865653858, + "grad_norm": 0.0005417441134341061, + "learning_rate": 1.828879807956845e-05, + "loss": 0.0098, + "num_input_tokens_seen": 48910072, + "step": 84310 + }, + { + "epoch": 12.558087578194817, + "grad_norm": 0.0006562280468642712, + "learning_rate": 1.828566798757017e-05, + "loss": 0.0917, + "num_input_tokens_seen": 48912824, + "step": 84315 + }, + { + "epoch": 12.558832290735776, + "grad_norm": 0.0009469297947362065, + "learning_rate": 1.8282538009003696e-05, + "loss": 0.0, + "num_input_tokens_seen": 48915416, + "step": 84320 + }, + { + "epoch": 12.559577003276734, + "grad_norm": 0.02225583791732788, + "learning_rate": 1.827940814392192e-05, + "loss": 0.0001, + "num_input_tokens_seen": 48918616, + "step": 84325 + }, + { + "epoch": 12.560321715817695, + "grad_norm": 0.000730288855265826, + "learning_rate": 1.827627839237771e-05, + "loss": 0.0042, + "num_input_tokens_seen": 48921528, + "step": 84330 + }, + { + "epoch": 12.561066428358654, + "grad_norm": 3.5504865081747994e-05, + "learning_rate": 1.8273148754423953e-05, + "loss": 0.0218, + "num_input_tokens_seen": 48924792, + "step": 84335 + }, + { + "epoch": 12.561811140899612, + "grad_norm": 0.00031409927760250866, + "learning_rate": 1.82700192301135e-05, + "loss": 0.0, + "num_input_tokens_seen": 48927992, + "step": 84340 + }, + { + "epoch": 12.562555853440571, + "grad_norm": 0.0005138531560078263, + "learning_rate": 1.826688981949924e-05, + "loss": 0.1131, + "num_input_tokens_seen": 48931064, + "step": 84345 + }, + { + "epoch": 12.563300565981532, + "grad_norm": 0.0011618410935625434, + "learning_rate": 1.8263760522634033e-05, + "loss": 0.0, + "num_input_tokens_seen": 48933752, + "step": 84350 + }, + { + "epoch": 12.56404527852249, + "grad_norm": 0.004730761982500553, + "learning_rate": 1.826063133957074e-05, + "loss": 0.0, + "num_input_tokens_seen": 48936472, + "step": 84355 + }, + { + "epoch": 12.56478999106345, + "grad_norm": 0.0003593335277400911, + "learning_rate": 1.8257502270362235e-05, + "loss": 0.0294, + "num_input_tokens_seen": 48939352, + "step": 84360 + }, + { + "epoch": 12.565534703604408, + "grad_norm": 0.0007046443060971797, + "learning_rate": 1.8254373315061364e-05, + "loss": 0.0, + "num_input_tokens_seen": 48942296, + "step": 84365 + }, + { + "epoch": 12.566279416145369, + "grad_norm": 0.00016688861069269478, + "learning_rate": 1.8251244473721017e-05, + "loss": 0.0016, + "num_input_tokens_seen": 48945304, + "step": 84370 + }, + { + "epoch": 12.567024128686327, + "grad_norm": 0.00017609029600862414, + "learning_rate": 1.8248115746394025e-05, + "loss": 0.0, + "num_input_tokens_seen": 48948056, + "step": 84375 + }, + { + "epoch": 12.567768841227286, + "grad_norm": 0.0027519508730620146, + "learning_rate": 1.8244987133133264e-05, + "loss": 0.0, + "num_input_tokens_seen": 48951288, + "step": 84380 + }, + { + "epoch": 12.568513553768245, + "grad_norm": 0.00010134329932043329, + "learning_rate": 1.8241858633991578e-05, + "loss": 0.0913, + "num_input_tokens_seen": 48954328, + "step": 84385 + }, + { + "epoch": 12.569258266309205, + "grad_norm": 0.0013618512311950326, + "learning_rate": 1.8238730249021812e-05, + "loss": 0.1314, + "num_input_tokens_seen": 48957368, + "step": 84390 + }, + { + "epoch": 12.570002978850164, + "grad_norm": 0.0005494683282449841, + "learning_rate": 1.8235601978276838e-05, + "loss": 0.0025, + "num_input_tokens_seen": 48960216, + "step": 84395 + }, + { + "epoch": 12.570747691391123, + "grad_norm": 0.008853240869939327, + "learning_rate": 1.823247382180948e-05, + "loss": 0.0065, + "num_input_tokens_seen": 48962872, + "step": 84400 + }, + { + "epoch": 12.571492403932082, + "grad_norm": 0.004332733806222677, + "learning_rate": 1.8229345779672613e-05, + "loss": 0.0, + "num_input_tokens_seen": 48965496, + "step": 84405 + }, + { + "epoch": 12.572237116473042, + "grad_norm": 0.00012634546146728098, + "learning_rate": 1.8226217851919062e-05, + "loss": 0.0, + "num_input_tokens_seen": 48968312, + "step": 84410 + }, + { + "epoch": 12.572981829014001, + "grad_norm": 0.004732914734631777, + "learning_rate": 1.8223090038601678e-05, + "loss": 0.0, + "num_input_tokens_seen": 48971256, + "step": 84415 + }, + { + "epoch": 12.57372654155496, + "grad_norm": 2.851046883733943e-05, + "learning_rate": 1.8219962339773292e-05, + "loss": 0.0, + "num_input_tokens_seen": 48973944, + "step": 84420 + }, + { + "epoch": 12.574471254095918, + "grad_norm": 8.396762132178992e-05, + "learning_rate": 1.8216834755486763e-05, + "loss": 0.0, + "num_input_tokens_seen": 48976856, + "step": 84425 + }, + { + "epoch": 12.575215966636879, + "grad_norm": 0.6011949181556702, + "learning_rate": 1.821370728579491e-05, + "loss": 0.0004, + "num_input_tokens_seen": 48979640, + "step": 84430 + }, + { + "epoch": 12.575960679177838, + "grad_norm": 0.0003993363643530756, + "learning_rate": 1.821057993075057e-05, + "loss": 0.0481, + "num_input_tokens_seen": 48982712, + "step": 84435 + }, + { + "epoch": 12.576705391718797, + "grad_norm": 0.00016730364586692303, + "learning_rate": 1.8207452690406594e-05, + "loss": 0.1831, + "num_input_tokens_seen": 48985432, + "step": 84440 + }, + { + "epoch": 12.577450104259755, + "grad_norm": 11.133498191833496, + "learning_rate": 1.8204325564815796e-05, + "loss": 0.216, + "num_input_tokens_seen": 48988376, + "step": 84445 + }, + { + "epoch": 12.578194816800714, + "grad_norm": 9.537516598356888e-05, + "learning_rate": 1.820119855403101e-05, + "loss": 0.0, + "num_input_tokens_seen": 48991000, + "step": 84450 + }, + { + "epoch": 12.578939529341675, + "grad_norm": 0.005183835979551077, + "learning_rate": 1.819807165810506e-05, + "loss": 0.25, + "num_input_tokens_seen": 48993944, + "step": 84455 + }, + { + "epoch": 12.579684241882633, + "grad_norm": 0.003652312094345689, + "learning_rate": 1.819494487709078e-05, + "loss": 0.1009, + "num_input_tokens_seen": 48996632, + "step": 84460 + }, + { + "epoch": 12.580428954423592, + "grad_norm": 0.0006891624070703983, + "learning_rate": 1.8191818211040997e-05, + "loss": 0.0, + "num_input_tokens_seen": 48999576, + "step": 84465 + }, + { + "epoch": 12.58117366696455, + "grad_norm": 8.312808990478516, + "learning_rate": 1.8188691660008513e-05, + "loss": 0.2051, + "num_input_tokens_seen": 49002264, + "step": 84470 + }, + { + "epoch": 12.581918379505511, + "grad_norm": 0.0007337001734413207, + "learning_rate": 1.818556522404617e-05, + "loss": 0.001, + "num_input_tokens_seen": 49005048, + "step": 84475 + }, + { + "epoch": 12.58266309204647, + "grad_norm": 0.0012280880473554134, + "learning_rate": 1.818243890320677e-05, + "loss": 0.1149, + "num_input_tokens_seen": 49007832, + "step": 84480 + }, + { + "epoch": 12.583407804587429, + "grad_norm": 0.005483729299157858, + "learning_rate": 1.8179312697543145e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49010872, + "step": 84485 + }, + { + "epoch": 12.584152517128388, + "grad_norm": 0.0003829426132142544, + "learning_rate": 1.8176186607108086e-05, + "loss": 0.2158, + "num_input_tokens_seen": 49013752, + "step": 84490 + }, + { + "epoch": 12.584897229669348, + "grad_norm": 0.007007166277617216, + "learning_rate": 1.817306063195443e-05, + "loss": 0.0, + "num_input_tokens_seen": 49016600, + "step": 84495 + }, + { + "epoch": 12.585641942210307, + "grad_norm": 0.001157092396169901, + "learning_rate": 1.8169934772134974e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49019352, + "step": 84500 + }, + { + "epoch": 12.586386654751266, + "grad_norm": 0.0005996339023113251, + "learning_rate": 1.8166809027702522e-05, + "loss": 0.0022, + "num_input_tokens_seen": 49022616, + "step": 84505 + }, + { + "epoch": 12.587131367292224, + "grad_norm": 0.09042917937040329, + "learning_rate": 1.8163683398709898e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49025464, + "step": 84510 + }, + { + "epoch": 12.587876079833185, + "grad_norm": 0.0010642692213878036, + "learning_rate": 1.8160557885209884e-05, + "loss": 0.0, + "num_input_tokens_seen": 49028632, + "step": 84515 + }, + { + "epoch": 12.588620792374144, + "grad_norm": 0.00040416058618575335, + "learning_rate": 1.81574324872553e-05, + "loss": 0.1059, + "num_input_tokens_seen": 49031384, + "step": 84520 + }, + { + "epoch": 12.589365504915103, + "grad_norm": 0.00235962588340044, + "learning_rate": 1.8154307204898933e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49034360, + "step": 84525 + }, + { + "epoch": 12.590110217456061, + "grad_norm": 0.2755001485347748, + "learning_rate": 1.8151182038193594e-05, + "loss": 0.0243, + "num_input_tokens_seen": 49037176, + "step": 84530 + }, + { + "epoch": 12.590854929997022, + "grad_norm": 0.0018152444390580058, + "learning_rate": 1.814805698719207e-05, + "loss": 0.0069, + "num_input_tokens_seen": 49040152, + "step": 84535 + }, + { + "epoch": 12.59159964253798, + "grad_norm": 0.00797313079237938, + "learning_rate": 1.8144932051947166e-05, + "loss": 0.2259, + "num_input_tokens_seen": 49043000, + "step": 84540 + }, + { + "epoch": 12.59234435507894, + "grad_norm": 0.0025977781042456627, + "learning_rate": 1.814180723251166e-05, + "loss": 0.0, + "num_input_tokens_seen": 49045880, + "step": 84545 + }, + { + "epoch": 12.593089067619898, + "grad_norm": 0.0003430694923736155, + "learning_rate": 1.8138682528938354e-05, + "loss": 0.0, + "num_input_tokens_seen": 49048920, + "step": 84550 + }, + { + "epoch": 12.593833780160859, + "grad_norm": 0.009763405658304691, + "learning_rate": 1.8135557941280035e-05, + "loss": 0.0, + "num_input_tokens_seen": 49051768, + "step": 84555 + }, + { + "epoch": 12.594578492701817, + "grad_norm": 0.0081079863011837, + "learning_rate": 1.813243346958948e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49054520, + "step": 84560 + }, + { + "epoch": 12.595323205242776, + "grad_norm": 0.0007058082264848053, + "learning_rate": 1.812930911391949e-05, + "loss": 0.0, + "num_input_tokens_seen": 49057560, + "step": 84565 + }, + { + "epoch": 12.596067917783735, + "grad_norm": 0.003863961435854435, + "learning_rate": 1.8126184874322837e-05, + "loss": 0.0, + "num_input_tokens_seen": 49060696, + "step": 84570 + }, + { + "epoch": 12.596812630324695, + "grad_norm": 0.0011181104928255081, + "learning_rate": 1.8123060750852305e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49063896, + "step": 84575 + }, + { + "epoch": 12.597557342865654, + "grad_norm": 0.10549619793891907, + "learning_rate": 1.8119936743560667e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49066584, + "step": 84580 + }, + { + "epoch": 12.598302055406613, + "grad_norm": 0.008818899281322956, + "learning_rate": 1.8116812852500713e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49069304, + "step": 84585 + }, + { + "epoch": 12.599046767947572, + "grad_norm": 0.0007414090796373785, + "learning_rate": 1.811368907772521e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49072408, + "step": 84590 + }, + { + "epoch": 12.599791480488532, + "grad_norm": 0.0006957485456950963, + "learning_rate": 1.8110565419286916e-05, + "loss": 0.062, + "num_input_tokens_seen": 49075224, + "step": 84595 + }, + { + "epoch": 12.600536193029491, + "grad_norm": 0.002494822721928358, + "learning_rate": 1.8107441877238634e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49078264, + "step": 84600 + }, + { + "epoch": 12.60128090557045, + "grad_norm": 0.0027455338276922703, + "learning_rate": 1.8104318451633114e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49081272, + "step": 84605 + }, + { + "epoch": 12.602025618111409, + "grad_norm": 0.0038156379014253616, + "learning_rate": 1.810119514252312e-05, + "loss": 0.0, + "num_input_tokens_seen": 49083992, + "step": 84610 + }, + { + "epoch": 12.602770330652369, + "grad_norm": 0.0005464781424961984, + "learning_rate": 1.809807194996142e-05, + "loss": 0.0, + "num_input_tokens_seen": 49086616, + "step": 84615 + }, + { + "epoch": 12.603515043193328, + "grad_norm": 3.5796585083007812, + "learning_rate": 1.809494887400079e-05, + "loss": 0.005, + "num_input_tokens_seen": 49089624, + "step": 84620 + }, + { + "epoch": 12.604259755734287, + "grad_norm": 0.002455787966027856, + "learning_rate": 1.8091825914693966e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49092632, + "step": 84625 + }, + { + "epoch": 12.605004468275245, + "grad_norm": 0.0002732264983933419, + "learning_rate": 1.8088703072093735e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49095544, + "step": 84630 + }, + { + "epoch": 12.605749180816204, + "grad_norm": 0.001802208018489182, + "learning_rate": 1.808558034625284e-05, + "loss": 0.1782, + "num_input_tokens_seen": 49098456, + "step": 84635 + }, + { + "epoch": 12.606493893357165, + "grad_norm": 0.0010014978470280766, + "learning_rate": 1.8082457737224034e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49101176, + "step": 84640 + }, + { + "epoch": 12.607238605898123, + "grad_norm": 0.00014832988381385803, + "learning_rate": 1.8079335245060076e-05, + "loss": 0.0, + "num_input_tokens_seen": 49103992, + "step": 84645 + }, + { + "epoch": 12.607983318439082, + "grad_norm": 0.003952862229198217, + "learning_rate": 1.8076212869813706e-05, + "loss": 0.1345, + "num_input_tokens_seen": 49107128, + "step": 84650 + }, + { + "epoch": 12.608728030980041, + "grad_norm": 0.3316272795200348, + "learning_rate": 1.8073090611537697e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49109944, + "step": 84655 + }, + { + "epoch": 12.609472743521001, + "grad_norm": 0.005923829507082701, + "learning_rate": 1.8069968470284768e-05, + "loss": 0.0, + "num_input_tokens_seen": 49112792, + "step": 84660 + }, + { + "epoch": 12.61021745606196, + "grad_norm": 0.006678272504359484, + "learning_rate": 1.806684644610769e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49115576, + "step": 84665 + }, + { + "epoch": 12.610962168602919, + "grad_norm": 0.06367991864681244, + "learning_rate": 1.8063724539059195e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49118520, + "step": 84670 + }, + { + "epoch": 12.611706881143878, + "grad_norm": 0.0043547251261770725, + "learning_rate": 1.806060274919202e-05, + "loss": 0.0923, + "num_input_tokens_seen": 49121656, + "step": 84675 + }, + { + "epoch": 12.612451593684838, + "grad_norm": 0.00017670972738415003, + "learning_rate": 1.8057481076558906e-05, + "loss": 0.0119, + "num_input_tokens_seen": 49124632, + "step": 84680 + }, + { + "epoch": 12.613196306225797, + "grad_norm": 0.007497507147490978, + "learning_rate": 1.8054359521212592e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49127704, + "step": 84685 + }, + { + "epoch": 12.613941018766756, + "grad_norm": 0.00031837282585911453, + "learning_rate": 1.805123808320582e-05, + "loss": 0.0, + "num_input_tokens_seen": 49130744, + "step": 84690 + }, + { + "epoch": 12.614685731307715, + "grad_norm": 53.34080123901367, + "learning_rate": 1.804811676259131e-05, + "loss": 0.0739, + "num_input_tokens_seen": 49133688, + "step": 84695 + }, + { + "epoch": 12.615430443848675, + "grad_norm": 0.022472279146313667, + "learning_rate": 1.8044995559421813e-05, + "loss": 0.0, + "num_input_tokens_seen": 49136472, + "step": 84700 + }, + { + "epoch": 12.616175156389634, + "grad_norm": 0.003511374816298485, + "learning_rate": 1.804187447375004e-05, + "loss": 0.0244, + "num_input_tokens_seen": 49139672, + "step": 84705 + }, + { + "epoch": 12.616919868930593, + "grad_norm": 0.0004974778275936842, + "learning_rate": 1.803875350562873e-05, + "loss": 0.0578, + "num_input_tokens_seen": 49142520, + "step": 84710 + }, + { + "epoch": 12.617664581471551, + "grad_norm": 0.01991400681436062, + "learning_rate": 1.8035632655110607e-05, + "loss": 0.2314, + "num_input_tokens_seen": 49145592, + "step": 84715 + }, + { + "epoch": 12.618409294012512, + "grad_norm": 0.4953649640083313, + "learning_rate": 1.803251192224838e-05, + "loss": 0.1037, + "num_input_tokens_seen": 49148376, + "step": 84720 + }, + { + "epoch": 12.61915400655347, + "grad_norm": 0.005440296605229378, + "learning_rate": 1.8029391307094796e-05, + "loss": 0.0329, + "num_input_tokens_seen": 49151160, + "step": 84725 + }, + { + "epoch": 12.61989871909443, + "grad_norm": 0.0016599145019426942, + "learning_rate": 1.8026270809702547e-05, + "loss": 0.2795, + "num_input_tokens_seen": 49154072, + "step": 84730 + }, + { + "epoch": 12.620643431635388, + "grad_norm": 0.04749586060643196, + "learning_rate": 1.8023150430124375e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49156792, + "step": 84735 + }, + { + "epoch": 12.621388144176349, + "grad_norm": 0.004687114153057337, + "learning_rate": 1.802003016841298e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49159960, + "step": 84740 + }, + { + "epoch": 12.622132856717307, + "grad_norm": 0.040151357650756836, + "learning_rate": 1.801691002462109e-05, + "loss": 0.0009, + "num_input_tokens_seen": 49162680, + "step": 84745 + }, + { + "epoch": 12.622877569258266, + "grad_norm": 0.04707447066903114, + "learning_rate": 1.8013789998801407e-05, + "loss": 0.0038, + "num_input_tokens_seen": 49165432, + "step": 84750 + }, + { + "epoch": 12.623622281799225, + "grad_norm": 0.006085275672376156, + "learning_rate": 1.801067009100663e-05, + "loss": 0.0287, + "num_input_tokens_seen": 49168504, + "step": 84755 + }, + { + "epoch": 12.624366994340185, + "grad_norm": 0.016154393553733826, + "learning_rate": 1.800755030128949e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49171000, + "step": 84760 + }, + { + "epoch": 12.625111706881144, + "grad_norm": 0.004469683393836021, + "learning_rate": 1.800443062970267e-05, + "loss": 0.0, + "num_input_tokens_seen": 49173720, + "step": 84765 + }, + { + "epoch": 12.625856419422103, + "grad_norm": 0.1823018491268158, + "learning_rate": 1.8001311076298895e-05, + "loss": 0.0005, + "num_input_tokens_seen": 49176376, + "step": 84770 + }, + { + "epoch": 12.626601131963062, + "grad_norm": 0.018166612833738327, + "learning_rate": 1.799819164113085e-05, + "loss": 0.179, + "num_input_tokens_seen": 49179160, + "step": 84775 + }, + { + "epoch": 12.62734584450402, + "grad_norm": 1.2017356157302856, + "learning_rate": 1.799507232425125e-05, + "loss": 0.1454, + "num_input_tokens_seen": 49181944, + "step": 84780 + }, + { + "epoch": 12.628090557044981, + "grad_norm": 0.0013049780391156673, + "learning_rate": 1.799195312571277e-05, + "loss": 0.0018, + "num_input_tokens_seen": 49184856, + "step": 84785 + }, + { + "epoch": 12.62883526958594, + "grad_norm": 0.0006562016787938774, + "learning_rate": 1.7988834045568126e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49188216, + "step": 84790 + }, + { + "epoch": 12.629579982126899, + "grad_norm": 0.06656896322965622, + "learning_rate": 1.7985715083870008e-05, + "loss": 0.0324, + "num_input_tokens_seen": 49191224, + "step": 84795 + }, + { + "epoch": 12.63032469466786, + "grad_norm": 0.009855825453996658, + "learning_rate": 1.7982596240671095e-05, + "loss": 0.0, + "num_input_tokens_seen": 49193912, + "step": 84800 + }, + { + "epoch": 12.631069407208818, + "grad_norm": 0.000857377948705107, + "learning_rate": 1.7979477516024096e-05, + "loss": 0.3492, + "num_input_tokens_seen": 49196536, + "step": 84805 + }, + { + "epoch": 12.631814119749777, + "grad_norm": 1.3098726272583008, + "learning_rate": 1.7976358909981686e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49200024, + "step": 84810 + }, + { + "epoch": 12.632558832290735, + "grad_norm": 0.016442079097032547, + "learning_rate": 1.7973240422596557e-05, + "loss": 0.0063, + "num_input_tokens_seen": 49203096, + "step": 84815 + }, + { + "epoch": 12.633303544831694, + "grad_norm": 0.0006710650632157922, + "learning_rate": 1.7970122053921378e-05, + "loss": 0.0578, + "num_input_tokens_seen": 49205816, + "step": 84820 + }, + { + "epoch": 12.634048257372655, + "grad_norm": 0.006775915157049894, + "learning_rate": 1.7967003804008855e-05, + "loss": 0.0, + "num_input_tokens_seen": 49208696, + "step": 84825 + }, + { + "epoch": 12.634792969913613, + "grad_norm": 0.005402425304055214, + "learning_rate": 1.7963885672911655e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49211928, + "step": 84830 + }, + { + "epoch": 12.635537682454572, + "grad_norm": 0.0015792251797392964, + "learning_rate": 1.7960767660682442e-05, + "loss": 0.1968, + "num_input_tokens_seen": 49215000, + "step": 84835 + }, + { + "epoch": 12.636282394995531, + "grad_norm": 0.004402521997690201, + "learning_rate": 1.7957649767373916e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49217848, + "step": 84840 + }, + { + "epoch": 12.637027107536491, + "grad_norm": 0.7287810444831848, + "learning_rate": 1.7954531993038737e-05, + "loss": 0.0036, + "num_input_tokens_seen": 49221080, + "step": 84845 + }, + { + "epoch": 12.63777182007745, + "grad_norm": 4.03787088394165, + "learning_rate": 1.7951414337729584e-05, + "loss": 0.0432, + "num_input_tokens_seen": 49223992, + "step": 84850 + }, + { + "epoch": 12.638516532618409, + "grad_norm": 6.20038366317749, + "learning_rate": 1.794829680149911e-05, + "loss": 0.3302, + "num_input_tokens_seen": 49226744, + "step": 84855 + }, + { + "epoch": 12.639261245159368, + "grad_norm": 0.040618591010570526, + "learning_rate": 1.7945179384400002e-05, + "loss": 0.0607, + "num_input_tokens_seen": 49229720, + "step": 84860 + }, + { + "epoch": 12.640005957700328, + "grad_norm": 0.0038051411975175142, + "learning_rate": 1.794206208648492e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49232856, + "step": 84865 + }, + { + "epoch": 12.640750670241287, + "grad_norm": 0.002692451700568199, + "learning_rate": 1.7938944907806523e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49235704, + "step": 84870 + }, + { + "epoch": 12.641495382782246, + "grad_norm": 0.016222475096583366, + "learning_rate": 1.7935827848417476e-05, + "loss": 0.0744, + "num_input_tokens_seen": 49238776, + "step": 84875 + }, + { + "epoch": 12.642240095323205, + "grad_norm": 0.0032386507373303175, + "learning_rate": 1.7932710908370434e-05, + "loss": 0.0037, + "num_input_tokens_seen": 49241432, + "step": 84880 + }, + { + "epoch": 12.642984807864165, + "grad_norm": 0.15448541939258575, + "learning_rate": 1.7929594087718067e-05, + "loss": 0.0011, + "num_input_tokens_seen": 49244472, + "step": 84885 + }, + { + "epoch": 12.643729520405124, + "grad_norm": 0.016085559502243996, + "learning_rate": 1.7926477386513008e-05, + "loss": 0.0, + "num_input_tokens_seen": 49247320, + "step": 84890 + }, + { + "epoch": 12.644474232946083, + "grad_norm": 0.04416299983859062, + "learning_rate": 1.7923360804807937e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49250264, + "step": 84895 + }, + { + "epoch": 12.645218945487041, + "grad_norm": 0.022061653435230255, + "learning_rate": 1.7920244342655485e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49253144, + "step": 84900 + }, + { + "epoch": 12.645963658028002, + "grad_norm": 0.056492485105991364, + "learning_rate": 1.791712800010832e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49256216, + "step": 84905 + }, + { + "epoch": 12.64670837056896, + "grad_norm": 0.13587507605552673, + "learning_rate": 1.7914011777219074e-05, + "loss": 0.0027, + "num_input_tokens_seen": 49258968, + "step": 84910 + }, + { + "epoch": 12.64745308310992, + "grad_norm": 0.01575060375034809, + "learning_rate": 1.7910895674040387e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49261912, + "step": 84915 + }, + { + "epoch": 12.648197795650878, + "grad_norm": 0.012226514518260956, + "learning_rate": 1.7907779690624923e-05, + "loss": 0.0, + "num_input_tokens_seen": 49264952, + "step": 84920 + }, + { + "epoch": 12.648942508191839, + "grad_norm": 0.0009317737421952188, + "learning_rate": 1.7904663827025304e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49268248, + "step": 84925 + }, + { + "epoch": 12.649687220732797, + "grad_norm": 0.003334528999403119, + "learning_rate": 1.790154808329419e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49271032, + "step": 84930 + }, + { + "epoch": 12.650431933273756, + "grad_norm": 0.0001700078573776409, + "learning_rate": 1.78984324594842e-05, + "loss": 0.0, + "num_input_tokens_seen": 49273816, + "step": 84935 + }, + { + "epoch": 12.651176645814715, + "grad_norm": 0.0006276224157772958, + "learning_rate": 1.7895316955647977e-05, + "loss": 0.0, + "num_input_tokens_seen": 49276568, + "step": 84940 + }, + { + "epoch": 12.651921358355676, + "grad_norm": 0.086771659553051, + "learning_rate": 1.7892201571838147e-05, + "loss": 0.0011, + "num_input_tokens_seen": 49279352, + "step": 84945 + }, + { + "epoch": 12.652666070896634, + "grad_norm": 0.011440953239798546, + "learning_rate": 1.788908630810736e-05, + "loss": 0.004, + "num_input_tokens_seen": 49281944, + "step": 84950 + }, + { + "epoch": 12.653410783437593, + "grad_norm": 0.012316924519836903, + "learning_rate": 1.7885971164508227e-05, + "loss": 0.1722, + "num_input_tokens_seen": 49284888, + "step": 84955 + }, + { + "epoch": 12.654155495978552, + "grad_norm": 0.0010919320629909635, + "learning_rate": 1.7882856141093372e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49287896, + "step": 84960 + }, + { + "epoch": 12.65490020851951, + "grad_norm": 0.006749879568815231, + "learning_rate": 1.7879741237915444e-05, + "loss": 0.0735, + "num_input_tokens_seen": 49290616, + "step": 84965 + }, + { + "epoch": 12.655644921060471, + "grad_norm": 0.002867969684302807, + "learning_rate": 1.787662645502704e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49293688, + "step": 84970 + }, + { + "epoch": 12.65638963360143, + "grad_norm": 0.0032832787837833166, + "learning_rate": 1.78735117924808e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49296760, + "step": 84975 + }, + { + "epoch": 12.657134346142389, + "grad_norm": 0.011277641169726849, + "learning_rate": 1.7870397250329325e-05, + "loss": 0.0, + "num_input_tokens_seen": 49299736, + "step": 84980 + }, + { + "epoch": 12.657879058683347, + "grad_norm": 0.0004009264812339097, + "learning_rate": 1.7867282828625253e-05, + "loss": 0.0, + "num_input_tokens_seen": 49302488, + "step": 84985 + }, + { + "epoch": 12.658623771224308, + "grad_norm": 0.058031369000673294, + "learning_rate": 1.786416852742119e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49305720, + "step": 84990 + }, + { + "epoch": 12.659368483765267, + "grad_norm": 0.027789823710918427, + "learning_rate": 1.786105434676973e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49308504, + "step": 84995 + }, + { + "epoch": 12.660113196306225, + "grad_norm": 229.00088500976562, + "learning_rate": 1.785794028672352e-05, + "loss": 0.0108, + "num_input_tokens_seen": 49311768, + "step": 85000 + }, + { + "epoch": 12.660857908847184, + "grad_norm": 140.4142608642578, + "learning_rate": 1.785482634733514e-05, + "loss": 0.1353, + "num_input_tokens_seen": 49314584, + "step": 85005 + }, + { + "epoch": 12.661602621388145, + "grad_norm": 0.008179997093975544, + "learning_rate": 1.785171252865721e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49317336, + "step": 85010 + }, + { + "epoch": 12.662347333929103, + "grad_norm": 0.0016577121568843722, + "learning_rate": 1.7848598830742323e-05, + "loss": 0.2063, + "num_input_tokens_seen": 49320184, + "step": 85015 + }, + { + "epoch": 12.663092046470062, + "grad_norm": 0.00043077795999124646, + "learning_rate": 1.78454852536431e-05, + "loss": 0.0, + "num_input_tokens_seen": 49323096, + "step": 85020 + }, + { + "epoch": 12.663836759011021, + "grad_norm": 0.06373150646686554, + "learning_rate": 1.784237179741213e-05, + "loss": 0.0, + "num_input_tokens_seen": 49325880, + "step": 85025 + }, + { + "epoch": 12.664581471551982, + "grad_norm": 0.018728584051132202, + "learning_rate": 1.7839258462102015e-05, + "loss": 0.0017, + "num_input_tokens_seen": 49328888, + "step": 85030 + }, + { + "epoch": 12.66532618409294, + "grad_norm": 2.8987131372559816e-05, + "learning_rate": 1.783614524776535e-05, + "loss": 0.0, + "num_input_tokens_seen": 49331960, + "step": 85035 + }, + { + "epoch": 12.666070896633899, + "grad_norm": 0.003432114142924547, + "learning_rate": 1.783303215445473e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49334744, + "step": 85040 + }, + { + "epoch": 12.666815609174858, + "grad_norm": 8.344666275661439e-05, + "learning_rate": 1.7829919182222752e-05, + "loss": 0.0208, + "num_input_tokens_seen": 49337528, + "step": 85045 + }, + { + "epoch": 12.667560321715818, + "grad_norm": 0.005886800587177277, + "learning_rate": 1.7826806331121987e-05, + "loss": 0.1595, + "num_input_tokens_seen": 49340472, + "step": 85050 + }, + { + "epoch": 12.668305034256777, + "grad_norm": 0.0021284883841872215, + "learning_rate": 1.7823693601205054e-05, + "loss": 0.0825, + "num_input_tokens_seen": 49343512, + "step": 85055 + }, + { + "epoch": 12.669049746797736, + "grad_norm": 0.004961594007909298, + "learning_rate": 1.782058099252451e-05, + "loss": 0.238, + "num_input_tokens_seen": 49346456, + "step": 85060 + }, + { + "epoch": 12.669794459338695, + "grad_norm": 0.0022621985990554094, + "learning_rate": 1.7817468505132966e-05, + "loss": 0.3188, + "num_input_tokens_seen": 49349272, + "step": 85065 + }, + { + "epoch": 12.670539171879655, + "grad_norm": 1.123431502492167e-05, + "learning_rate": 1.7814356139082993e-05, + "loss": 0.0, + "num_input_tokens_seen": 49352024, + "step": 85070 + }, + { + "epoch": 12.671283884420614, + "grad_norm": 84.49727630615234, + "learning_rate": 1.781124389442716e-05, + "loss": 0.2132, + "num_input_tokens_seen": 49354648, + "step": 85075 + }, + { + "epoch": 12.672028596961573, + "grad_norm": 0.010499960742890835, + "learning_rate": 1.7808131771218065e-05, + "loss": 0.0, + "num_input_tokens_seen": 49357592, + "step": 85080 + }, + { + "epoch": 12.672773309502531, + "grad_norm": 0.004938928876072168, + "learning_rate": 1.7805019769508262e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49360408, + "step": 85085 + }, + { + "epoch": 12.673518022043492, + "grad_norm": 0.0008291396661661565, + "learning_rate": 1.7801907889350346e-05, + "loss": 0.0949, + "num_input_tokens_seen": 49363096, + "step": 85090 + }, + { + "epoch": 12.67426273458445, + "grad_norm": 40.42744445800781, + "learning_rate": 1.7798796130796875e-05, + "loss": 0.4564, + "num_input_tokens_seen": 49366552, + "step": 85095 + }, + { + "epoch": 12.67500744712541, + "grad_norm": 0.012072456069290638, + "learning_rate": 1.779568449390043e-05, + "loss": 0.0, + "num_input_tokens_seen": 49369240, + "step": 85100 + }, + { + "epoch": 12.675752159666368, + "grad_norm": 8.368843555217609e-05, + "learning_rate": 1.7792572978713567e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49372152, + "step": 85105 + }, + { + "epoch": 12.676496872207329, + "grad_norm": 0.0054687028750777245, + "learning_rate": 1.778946158528887e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49375096, + "step": 85110 + }, + { + "epoch": 12.677241584748288, + "grad_norm": 0.001508135930635035, + "learning_rate": 1.7786350313678885e-05, + "loss": 0.0, + "num_input_tokens_seen": 49377752, + "step": 85115 + }, + { + "epoch": 12.677986297289246, + "grad_norm": 74.26902770996094, + "learning_rate": 1.7783239163936172e-05, + "loss": 0.0428, + "num_input_tokens_seen": 49380600, + "step": 85120 + }, + { + "epoch": 12.678731009830205, + "grad_norm": 0.11799287050962448, + "learning_rate": 1.7780128136113305e-05, + "loss": 0.1293, + "num_input_tokens_seen": 49383512, + "step": 85125 + }, + { + "epoch": 12.679475722371166, + "grad_norm": 41.40006637573242, + "learning_rate": 1.777701723026283e-05, + "loss": 0.3176, + "num_input_tokens_seen": 49386360, + "step": 85130 + }, + { + "epoch": 12.680220434912124, + "grad_norm": 0.0026378037873655558, + "learning_rate": 1.7773906446437316e-05, + "loss": 0.1336, + "num_input_tokens_seen": 49389304, + "step": 85135 + }, + { + "epoch": 12.680965147453083, + "grad_norm": 0.01439505536109209, + "learning_rate": 1.7770795784689302e-05, + "loss": 0.0095, + "num_input_tokens_seen": 49392024, + "step": 85140 + }, + { + "epoch": 12.681709859994042, + "grad_norm": 0.03898108750581741, + "learning_rate": 1.7767685245071353e-05, + "loss": 0.0012, + "num_input_tokens_seen": 49394936, + "step": 85145 + }, + { + "epoch": 12.682454572535, + "grad_norm": 0.10759814828634262, + "learning_rate": 1.776457482763601e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49397784, + "step": 85150 + }, + { + "epoch": 12.683199285075961, + "grad_norm": 0.0030293904710561037, + "learning_rate": 1.776146453243581e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49400504, + "step": 85155 + }, + { + "epoch": 12.68394399761692, + "grad_norm": 1.5050073862075806, + "learning_rate": 1.7758354359523328e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49403064, + "step": 85160 + }, + { + "epoch": 12.684688710157879, + "grad_norm": 2.521357297897339, + "learning_rate": 1.775524430895107e-05, + "loss": 0.0052, + "num_input_tokens_seen": 49406104, + "step": 85165 + }, + { + "epoch": 12.685433422698837, + "grad_norm": 0.002452034270390868, + "learning_rate": 1.775213438077161e-05, + "loss": 0.0005, + "num_input_tokens_seen": 49408984, + "step": 85170 + }, + { + "epoch": 12.686178135239798, + "grad_norm": 0.0048150671645998955, + "learning_rate": 1.7749024575037466e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49411736, + "step": 85175 + }, + { + "epoch": 12.686922847780757, + "grad_norm": 0.004024558234959841, + "learning_rate": 1.774591489180119e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49414456, + "step": 85180 + }, + { + "epoch": 12.687667560321715, + "grad_norm": 0.00140563293825835, + "learning_rate": 1.77428053311153e-05, + "loss": 0.0857, + "num_input_tokens_seen": 49417272, + "step": 85185 + }, + { + "epoch": 12.688412272862674, + "grad_norm": 0.00017577456310391426, + "learning_rate": 1.7739695893032344e-05, + "loss": 0.0005, + "num_input_tokens_seen": 49420312, + "step": 85190 + }, + { + "epoch": 12.689156985403635, + "grad_norm": 0.0017198449932038784, + "learning_rate": 1.7736586577604846e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49423256, + "step": 85195 + }, + { + "epoch": 12.689901697944594, + "grad_norm": 29.5783748626709, + "learning_rate": 1.7733477384885333e-05, + "loss": 0.0827, + "num_input_tokens_seen": 49426040, + "step": 85200 + }, + { + "epoch": 12.690646410485552, + "grad_norm": 5.95854377746582, + "learning_rate": 1.7730368314926336e-05, + "loss": 0.2134, + "num_input_tokens_seen": 49428920, + "step": 85205 + }, + { + "epoch": 12.691391123026511, + "grad_norm": 0.0015305866254493594, + "learning_rate": 1.7727259367780374e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49431864, + "step": 85210 + }, + { + "epoch": 12.692135835567472, + "grad_norm": 0.0053578875958919525, + "learning_rate": 1.772415054349998e-05, + "loss": 0.0225, + "num_input_tokens_seen": 49434776, + "step": 85215 + }, + { + "epoch": 12.69288054810843, + "grad_norm": 23.701148986816406, + "learning_rate": 1.772104184213766e-05, + "loss": 0.01, + "num_input_tokens_seen": 49437592, + "step": 85220 + }, + { + "epoch": 12.69362526064939, + "grad_norm": 0.11188438534736633, + "learning_rate": 1.7717933263745947e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49440248, + "step": 85225 + }, + { + "epoch": 12.694369973190348, + "grad_norm": 0.0015039823483675718, + "learning_rate": 1.7714824808377346e-05, + "loss": 0.1292, + "num_input_tokens_seen": 49443288, + "step": 85230 + }, + { + "epoch": 12.695114685731308, + "grad_norm": 0.0040121800266206264, + "learning_rate": 1.7711716476084384e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49446360, + "step": 85235 + }, + { + "epoch": 12.695859398272267, + "grad_norm": 0.014517301693558693, + "learning_rate": 1.770860826691956e-05, + "loss": 0.1541, + "num_input_tokens_seen": 49449208, + "step": 85240 + }, + { + "epoch": 12.696604110813226, + "grad_norm": 0.0002752428699750453, + "learning_rate": 1.770550018093538e-05, + "loss": 0.0344, + "num_input_tokens_seen": 49451992, + "step": 85245 + }, + { + "epoch": 12.697348823354185, + "grad_norm": 0.006573130376636982, + "learning_rate": 1.770239221818437e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49454808, + "step": 85250 + }, + { + "epoch": 12.698093535895145, + "grad_norm": 0.012767911888659, + "learning_rate": 1.7699284378719017e-05, + "loss": 0.0174, + "num_input_tokens_seen": 49457720, + "step": 85255 + }, + { + "epoch": 12.698838248436104, + "grad_norm": 0.004626764450222254, + "learning_rate": 1.7696176662591844e-05, + "loss": 0.2176, + "num_input_tokens_seen": 49460664, + "step": 85260 + }, + { + "epoch": 12.699582960977063, + "grad_norm": 0.002405972685664892, + "learning_rate": 1.7693069069855343e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49463192, + "step": 85265 + }, + { + "epoch": 12.700327673518021, + "grad_norm": 20.512557983398438, + "learning_rate": 1.7689961600562014e-05, + "loss": 0.2702, + "num_input_tokens_seen": 49466008, + "step": 85270 + }, + { + "epoch": 12.701072386058982, + "grad_norm": 0.013461980037391186, + "learning_rate": 1.7686854254764355e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49468664, + "step": 85275 + }, + { + "epoch": 12.70181709859994, + "grad_norm": 0.002740357303991914, + "learning_rate": 1.768374703251485e-05, + "loss": 0.0, + "num_input_tokens_seen": 49471512, + "step": 85280 + }, + { + "epoch": 12.7025618111409, + "grad_norm": 0.18117885291576385, + "learning_rate": 1.768063993386601e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49474264, + "step": 85285 + }, + { + "epoch": 12.703306523681858, + "grad_norm": 0.005165584851056337, + "learning_rate": 1.7677532958870315e-05, + "loss": 0.0648, + "num_input_tokens_seen": 49477144, + "step": 85290 + }, + { + "epoch": 12.704051236222817, + "grad_norm": 0.006424382328987122, + "learning_rate": 1.7674426107580268e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49479896, + "step": 85295 + }, + { + "epoch": 12.704795948763778, + "grad_norm": 0.00992517452687025, + "learning_rate": 1.767131938004834e-05, + "loss": 0.0, + "num_input_tokens_seen": 49482616, + "step": 85300 + }, + { + "epoch": 12.705540661304736, + "grad_norm": 0.00152075185906142, + "learning_rate": 1.7668212776327025e-05, + "loss": 0.0762, + "num_input_tokens_seen": 49485176, + "step": 85305 + }, + { + "epoch": 12.706285373845695, + "grad_norm": 0.0026437605265527964, + "learning_rate": 1.7665106296468793e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49488056, + "step": 85310 + }, + { + "epoch": 12.707030086386656, + "grad_norm": 0.0014337406028062105, + "learning_rate": 1.766199994052615e-05, + "loss": 0.0022, + "num_input_tokens_seen": 49491000, + "step": 85315 + }, + { + "epoch": 12.707774798927614, + "grad_norm": 0.5888291001319885, + "learning_rate": 1.7658893708551557e-05, + "loss": 0.1075, + "num_input_tokens_seen": 49494072, + "step": 85320 + }, + { + "epoch": 12.708519511468573, + "grad_norm": 0.004409740678966045, + "learning_rate": 1.7655787600597486e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49497016, + "step": 85325 + }, + { + "epoch": 12.709264224009532, + "grad_norm": 0.013685100711882114, + "learning_rate": 1.7652681616716428e-05, + "loss": 0.001, + "num_input_tokens_seen": 49499896, + "step": 85330 + }, + { + "epoch": 12.71000893655049, + "grad_norm": 0.011495106853544712, + "learning_rate": 1.764957575696084e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49502648, + "step": 85335 + }, + { + "epoch": 12.710753649091451, + "grad_norm": 0.005007172003388405, + "learning_rate": 1.7646470021383204e-05, + "loss": 0.0734, + "num_input_tokens_seen": 49506616, + "step": 85340 + }, + { + "epoch": 12.71149836163241, + "grad_norm": 0.0002701386983972043, + "learning_rate": 1.7643364410035974e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49509528, + "step": 85345 + }, + { + "epoch": 12.712243074173369, + "grad_norm": 0.005321064498275518, + "learning_rate": 1.7640258922971636e-05, + "loss": 0.1439, + "num_input_tokens_seen": 49512376, + "step": 85350 + }, + { + "epoch": 12.712987786714327, + "grad_norm": 0.031161542981863022, + "learning_rate": 1.763715356024264e-05, + "loss": 0.0125, + "num_input_tokens_seen": 49515032, + "step": 85355 + }, + { + "epoch": 12.713732499255288, + "grad_norm": 0.012315036728978157, + "learning_rate": 1.7634048321901447e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49517912, + "step": 85360 + }, + { + "epoch": 12.714477211796247, + "grad_norm": 0.00962352566421032, + "learning_rate": 1.7630943208000526e-05, + "loss": 0.1918, + "num_input_tokens_seen": 49520728, + "step": 85365 + }, + { + "epoch": 12.715221924337206, + "grad_norm": 72.70338439941406, + "learning_rate": 1.762783821859233e-05, + "loss": 0.0329, + "num_input_tokens_seen": 49523800, + "step": 85370 + }, + { + "epoch": 12.715966636878164, + "grad_norm": 0.0026200676802545786, + "learning_rate": 1.762473335372932e-05, + "loss": 0.1039, + "num_input_tokens_seen": 49526712, + "step": 85375 + }, + { + "epoch": 12.716711349419125, + "grad_norm": 0.007189625408500433, + "learning_rate": 1.7621628613463928e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49529592, + "step": 85380 + }, + { + "epoch": 12.717456061960084, + "grad_norm": 0.008421545848250389, + "learning_rate": 1.7618523997848634e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49532312, + "step": 85385 + }, + { + "epoch": 12.718200774501042, + "grad_norm": 0.041139621287584305, + "learning_rate": 1.7615419506935866e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49534936, + "step": 85390 + }, + { + "epoch": 12.718945487042001, + "grad_norm": 0.07477787137031555, + "learning_rate": 1.761231514077809e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49537784, + "step": 85395 + }, + { + "epoch": 12.719690199582962, + "grad_norm": 0.004553090315312147, + "learning_rate": 1.760921089942774e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49540600, + "step": 85400 + }, + { + "epoch": 12.72043491212392, + "grad_norm": 0.009886344894766808, + "learning_rate": 1.7606106782937256e-05, + "loss": 0.1157, + "num_input_tokens_seen": 49543640, + "step": 85405 + }, + { + "epoch": 12.72117962466488, + "grad_norm": 0.0032321333419531584, + "learning_rate": 1.760300279135909e-05, + "loss": 0.0007, + "num_input_tokens_seen": 49546392, + "step": 85410 + }, + { + "epoch": 12.721924337205838, + "grad_norm": 0.08862777054309845, + "learning_rate": 1.759989892474566e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49549144, + "step": 85415 + }, + { + "epoch": 12.722669049746798, + "grad_norm": 3.7062859535217285, + "learning_rate": 1.7596795183149428e-05, + "loss": 0.1868, + "num_input_tokens_seen": 49551992, + "step": 85420 + }, + { + "epoch": 12.723413762287757, + "grad_norm": 0.010921919718384743, + "learning_rate": 1.759369156662281e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49554584, + "step": 85425 + }, + { + "epoch": 12.724158474828716, + "grad_norm": 0.001889337319880724, + "learning_rate": 1.759058807521825e-05, + "loss": 0.0086, + "num_input_tokens_seen": 49557400, + "step": 85430 + }, + { + "epoch": 12.724903187369675, + "grad_norm": 9.735382627695799e-05, + "learning_rate": 1.7587484708988176e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49560024, + "step": 85435 + }, + { + "epoch": 12.725647899910635, + "grad_norm": 0.0020455168560147285, + "learning_rate": 1.758438146798501e-05, + "loss": 0.0197, + "num_input_tokens_seen": 49562776, + "step": 85440 + }, + { + "epoch": 12.726392612451594, + "grad_norm": 0.00010470223787706345, + "learning_rate": 1.758127835226119e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49565528, + "step": 85445 + }, + { + "epoch": 12.727137324992553, + "grad_norm": 30.989700317382812, + "learning_rate": 1.757817536186912e-05, + "loss": 0.0733, + "num_input_tokens_seen": 49568696, + "step": 85450 + }, + { + "epoch": 12.727882037533512, + "grad_norm": 5.429154872894287, + "learning_rate": 1.7575072496861243e-05, + "loss": 0.0049, + "num_input_tokens_seen": 49571960, + "step": 85455 + }, + { + "epoch": 12.728626750074472, + "grad_norm": 0.007467359304428101, + "learning_rate": 1.757196975728996e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49574840, + "step": 85460 + }, + { + "epoch": 12.72937146261543, + "grad_norm": 0.023058097809553146, + "learning_rate": 1.7568867143207708e-05, + "loss": 0.0011, + "num_input_tokens_seen": 49577624, + "step": 85465 + }, + { + "epoch": 12.73011617515639, + "grad_norm": 85.3587875366211, + "learning_rate": 1.7565764654666888e-05, + "loss": 0.1709, + "num_input_tokens_seen": 49580568, + "step": 85470 + }, + { + "epoch": 12.730860887697348, + "grad_norm": 0.008718418888747692, + "learning_rate": 1.756266229171993e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49583416, + "step": 85475 + }, + { + "epoch": 12.731605600238307, + "grad_norm": 6.244245529174805, + "learning_rate": 1.7559560054419225e-05, + "loss": 0.054, + "num_input_tokens_seen": 49586168, + "step": 85480 + }, + { + "epoch": 12.732350312779268, + "grad_norm": 0.9134387969970703, + "learning_rate": 1.7556457942817184e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49589304, + "step": 85485 + }, + { + "epoch": 12.733095025320226, + "grad_norm": 1.4432148933410645, + "learning_rate": 1.7553355956966227e-05, + "loss": 0.0014, + "num_input_tokens_seen": 49592376, + "step": 85490 + }, + { + "epoch": 12.733839737861185, + "grad_norm": 0.016722213476896286, + "learning_rate": 1.7550254096918748e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49595288, + "step": 85495 + }, + { + "epoch": 12.734584450402146, + "grad_norm": 0.00026199486455880105, + "learning_rate": 1.754715236272716e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49598296, + "step": 85500 + }, + { + "epoch": 12.735329162943104, + "grad_norm": 0.0013452480779960752, + "learning_rate": 1.7544050754443857e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49601176, + "step": 85505 + }, + { + "epoch": 12.736073875484063, + "grad_norm": 0.0018230044515803456, + "learning_rate": 1.7540949272121244e-05, + "loss": 0.008, + "num_input_tokens_seen": 49604280, + "step": 85510 + }, + { + "epoch": 12.736818588025022, + "grad_norm": 0.0047990805469453335, + "learning_rate": 1.753784791581171e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49607416, + "step": 85515 + }, + { + "epoch": 12.73756330056598, + "grad_norm": 0.019571145996451378, + "learning_rate": 1.753474668556764e-05, + "loss": 0.1192, + "num_input_tokens_seen": 49610520, + "step": 85520 + }, + { + "epoch": 12.738308013106941, + "grad_norm": 0.0049005295149981976, + "learning_rate": 1.7531645581441447e-05, + "loss": 0.0, + "num_input_tokens_seen": 49613272, + "step": 85525 + }, + { + "epoch": 12.7390527256479, + "grad_norm": 0.001886082929559052, + "learning_rate": 1.7528544603485507e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49616312, + "step": 85530 + }, + { + "epoch": 12.739797438188859, + "grad_norm": 0.014291423372924328, + "learning_rate": 1.7525443751752218e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49619224, + "step": 85535 + }, + { + "epoch": 12.740542150729818, + "grad_norm": 0.013937222771346569, + "learning_rate": 1.7522343026293953e-05, + "loss": 0.0008, + "num_input_tokens_seen": 49622360, + "step": 85540 + }, + { + "epoch": 12.741286863270778, + "grad_norm": 0.00786004588007927, + "learning_rate": 1.751924242716311e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49625304, + "step": 85545 + }, + { + "epoch": 12.742031575811737, + "grad_norm": 0.012381323613226414, + "learning_rate": 1.751614195441205e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49628120, + "step": 85550 + }, + { + "epoch": 12.742776288352696, + "grad_norm": 0.045022208243608475, + "learning_rate": 1.7513041608093185e-05, + "loss": 0.0108, + "num_input_tokens_seen": 49630936, + "step": 85555 + }, + { + "epoch": 12.743521000893654, + "grad_norm": 0.0007173011545091867, + "learning_rate": 1.7509941388258865e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49633880, + "step": 85560 + }, + { + "epoch": 12.744265713434615, + "grad_norm": 0.001082685892470181, + "learning_rate": 1.750684129496147e-05, + "loss": 0.0, + "num_input_tokens_seen": 49636568, + "step": 85565 + }, + { + "epoch": 12.745010425975574, + "grad_norm": 0.04871830716729164, + "learning_rate": 1.7503741328253377e-05, + "loss": 0.0658, + "num_input_tokens_seen": 49639096, + "step": 85570 + }, + { + "epoch": 12.745755138516532, + "grad_norm": 34.28227996826172, + "learning_rate": 1.7500641488186946e-05, + "loss": 0.2037, + "num_input_tokens_seen": 49641912, + "step": 85575 + }, + { + "epoch": 12.746499851057491, + "grad_norm": 0.004275629762560129, + "learning_rate": 1.7497541774814568e-05, + "loss": 0.0525, + "num_input_tokens_seen": 49644760, + "step": 85580 + }, + { + "epoch": 12.747244563598452, + "grad_norm": 0.03209638595581055, + "learning_rate": 1.7494442188188588e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49647800, + "step": 85585 + }, + { + "epoch": 12.74798927613941, + "grad_norm": 0.006099701393395662, + "learning_rate": 1.7491342728361383e-05, + "loss": 0.0323, + "num_input_tokens_seen": 49650648, + "step": 85590 + }, + { + "epoch": 12.74873398868037, + "grad_norm": 0.009351495653390884, + "learning_rate": 1.7488243395385317e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49653688, + "step": 85595 + }, + { + "epoch": 12.749478701221328, + "grad_norm": 0.01955948770046234, + "learning_rate": 1.7485144189312735e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49656728, + "step": 85600 + }, + { + "epoch": 12.750223413762289, + "grad_norm": 0.0022362617310136557, + "learning_rate": 1.7482045110196014e-05, + "loss": 0.0006, + "num_input_tokens_seen": 49659480, + "step": 85605 + }, + { + "epoch": 12.750968126303247, + "grad_norm": 0.0017151819774881005, + "learning_rate": 1.7478946158087484e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49662552, + "step": 85610 + }, + { + "epoch": 12.751712838844206, + "grad_norm": 0.006828207056969404, + "learning_rate": 1.747584733303953e-05, + "loss": 0.1813, + "num_input_tokens_seen": 49665336, + "step": 85615 + }, + { + "epoch": 12.752457551385165, + "grad_norm": 0.022269979119300842, + "learning_rate": 1.7472748635104475e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49667928, + "step": 85620 + }, + { + "epoch": 12.753202263926125, + "grad_norm": 0.04577671363949776, + "learning_rate": 1.7469650064334693e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49670712, + "step": 85625 + }, + { + "epoch": 12.753946976467084, + "grad_norm": 0.0027796258218586445, + "learning_rate": 1.746655162078251e-05, + "loss": 0.0, + "num_input_tokens_seen": 49673656, + "step": 85630 + }, + { + "epoch": 12.754691689008043, + "grad_norm": 0.0018503712490200996, + "learning_rate": 1.7463453304500292e-05, + "loss": 0.0088, + "num_input_tokens_seen": 49676504, + "step": 85635 + }, + { + "epoch": 12.755436401549002, + "grad_norm": 0.010955153964459896, + "learning_rate": 1.7460355115540366e-05, + "loss": 0.1315, + "num_input_tokens_seen": 49679768, + "step": 85640 + }, + { + "epoch": 12.756181114089962, + "grad_norm": 0.001280555734410882, + "learning_rate": 1.7457257053955073e-05, + "loss": 0.0333, + "num_input_tokens_seen": 49682680, + "step": 85645 + }, + { + "epoch": 12.756925826630921, + "grad_norm": 15.908454895019531, + "learning_rate": 1.7454159119796758e-05, + "loss": 0.104, + "num_input_tokens_seen": 49685624, + "step": 85650 + }, + { + "epoch": 12.75767053917188, + "grad_norm": 0.012520047836005688, + "learning_rate": 1.745106131311775e-05, + "loss": 0.001, + "num_input_tokens_seen": 49688888, + "step": 85655 + }, + { + "epoch": 12.758415251712838, + "grad_norm": 0.0006127561791799963, + "learning_rate": 1.74479636339704e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49691960, + "step": 85660 + }, + { + "epoch": 12.759159964253797, + "grad_norm": 0.007021514233201742, + "learning_rate": 1.7444866082407026e-05, + "loss": 0.0006, + "num_input_tokens_seen": 49694776, + "step": 85665 + }, + { + "epoch": 12.759904676794758, + "grad_norm": 0.02880801074206829, + "learning_rate": 1.744176865847996e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49697816, + "step": 85670 + }, + { + "epoch": 12.760649389335716, + "grad_norm": 0.014505007304251194, + "learning_rate": 1.743867136224153e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49700536, + "step": 85675 + }, + { + "epoch": 12.761394101876675, + "grad_norm": 2.5268394892918877e-05, + "learning_rate": 1.743557419374406e-05, + "loss": 0.0226, + "num_input_tokens_seen": 49703352, + "step": 85680 + }, + { + "epoch": 12.762138814417634, + "grad_norm": 0.2137998789548874, + "learning_rate": 1.7432477153039877e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49706072, + "step": 85685 + }, + { + "epoch": 12.762883526958595, + "grad_norm": 0.0015571730909869075, + "learning_rate": 1.7429380240181296e-05, + "loss": 0.054, + "num_input_tokens_seen": 49708824, + "step": 85690 + }, + { + "epoch": 12.763628239499553, + "grad_norm": 0.3913547098636627, + "learning_rate": 1.7426283455220652e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49711544, + "step": 85695 + }, + { + "epoch": 12.764372952040512, + "grad_norm": 0.036297544836997986, + "learning_rate": 1.7423186798210244e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49714872, + "step": 85700 + }, + { + "epoch": 12.76511766458147, + "grad_norm": 0.0009282070677727461, + "learning_rate": 1.74200902692024e-05, + "loss": 0.0, + "num_input_tokens_seen": 49717688, + "step": 85705 + }, + { + "epoch": 12.765862377122431, + "grad_norm": 0.0007557700737379491, + "learning_rate": 1.7416993868249422e-05, + "loss": 0.0018, + "num_input_tokens_seen": 49720728, + "step": 85710 + }, + { + "epoch": 12.76660708966339, + "grad_norm": 0.0012669540010392666, + "learning_rate": 1.7413897595403627e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49723544, + "step": 85715 + }, + { + "epoch": 12.767351802204349, + "grad_norm": 0.00011877428914885968, + "learning_rate": 1.741080145071733e-05, + "loss": 0.0, + "num_input_tokens_seen": 49726392, + "step": 85720 + }, + { + "epoch": 12.768096514745308, + "grad_norm": 1.1432950496673584, + "learning_rate": 1.740770543424281e-05, + "loss": 0.0793, + "num_input_tokens_seen": 49729272, + "step": 85725 + }, + { + "epoch": 12.768841227286268, + "grad_norm": 0.0015796402003616095, + "learning_rate": 1.7404609546032407e-05, + "loss": 0.0, + "num_input_tokens_seen": 49732312, + "step": 85730 + }, + { + "epoch": 12.769585939827227, + "grad_norm": 0.01592382788658142, + "learning_rate": 1.7401513786138402e-05, + "loss": 0.0013, + "num_input_tokens_seen": 49734840, + "step": 85735 + }, + { + "epoch": 12.770330652368186, + "grad_norm": 0.003649502992630005, + "learning_rate": 1.73984181546131e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49737976, + "step": 85740 + }, + { + "epoch": 12.771075364909144, + "grad_norm": 0.0065109338611364365, + "learning_rate": 1.7395322651508788e-05, + "loss": 0.1342, + "num_input_tokens_seen": 49740888, + "step": 85745 + }, + { + "epoch": 12.771820077450105, + "grad_norm": 0.0014312088023871183, + "learning_rate": 1.739222727687778e-05, + "loss": 0.225, + "num_input_tokens_seen": 49743672, + "step": 85750 + }, + { + "epoch": 12.772564789991064, + "grad_norm": 0.005178509280085564, + "learning_rate": 1.7389132030772365e-05, + "loss": 0.0018, + "num_input_tokens_seen": 49746456, + "step": 85755 + }, + { + "epoch": 12.773309502532022, + "grad_norm": 0.0009848776971921325, + "learning_rate": 1.7386036913244812e-05, + "loss": 0.4126, + "num_input_tokens_seen": 49749304, + "step": 85760 + }, + { + "epoch": 12.774054215072981, + "grad_norm": 0.0036430826876312494, + "learning_rate": 1.7382941924347443e-05, + "loss": 0.0452, + "num_input_tokens_seen": 49752216, + "step": 85765 + }, + { + "epoch": 12.774798927613942, + "grad_norm": 1.8442713553668e-05, + "learning_rate": 1.737984706413252e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49755096, + "step": 85770 + }, + { + "epoch": 12.7755436401549, + "grad_norm": 0.0016590714221820235, + "learning_rate": 1.737675233265234e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49758072, + "step": 85775 + }, + { + "epoch": 12.77628835269586, + "grad_norm": 12.603066444396973, + "learning_rate": 1.7373657729959176e-05, + "loss": 0.0521, + "num_input_tokens_seen": 49761272, + "step": 85780 + }, + { + "epoch": 12.777033065236818, + "grad_norm": 0.0030424485448747873, + "learning_rate": 1.7370563256105322e-05, + "loss": 0.0, + "num_input_tokens_seen": 49763992, + "step": 85785 + }, + { + "epoch": 12.777777777777779, + "grad_norm": 0.01674521714448929, + "learning_rate": 1.7367468911143036e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49766648, + "step": 85790 + }, + { + "epoch": 12.778522490318737, + "grad_norm": 0.0001492440205765888, + "learning_rate": 1.7364374695124613e-05, + "loss": 0.2908, + "num_input_tokens_seen": 49769496, + "step": 85795 + }, + { + "epoch": 12.779267202859696, + "grad_norm": 0.007452419959008694, + "learning_rate": 1.7361280608102325e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49772408, + "step": 85800 + }, + { + "epoch": 12.780011915400655, + "grad_norm": 0.0018084163311868906, + "learning_rate": 1.7358186650128427e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49775096, + "step": 85805 + }, + { + "epoch": 12.780756627941614, + "grad_norm": 0.001281910459510982, + "learning_rate": 1.7355092821255208e-05, + "loss": 0.0, + "num_input_tokens_seen": 49777848, + "step": 85810 + }, + { + "epoch": 12.781501340482574, + "grad_norm": 0.0027677584439516068, + "learning_rate": 1.7351999121534913e-05, + "loss": 0.0009, + "num_input_tokens_seen": 49780760, + "step": 85815 + }, + { + "epoch": 12.782246053023533, + "grad_norm": 0.0021137818694114685, + "learning_rate": 1.734890555101983e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49783480, + "step": 85820 + }, + { + "epoch": 12.782990765564492, + "grad_norm": 0.06425339728593826, + "learning_rate": 1.7345812109762204e-05, + "loss": 0.0806, + "num_input_tokens_seen": 49786520, + "step": 85825 + }, + { + "epoch": 12.783735478105452, + "grad_norm": 0.009444275870919228, + "learning_rate": 1.734271879781431e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49789400, + "step": 85830 + }, + { + "epoch": 12.784480190646411, + "grad_norm": 47.61064529418945, + "learning_rate": 1.7339625615228396e-05, + "loss": 0.3595, + "num_input_tokens_seen": 49792760, + "step": 85835 + }, + { + "epoch": 12.78522490318737, + "grad_norm": 0.0037584463134407997, + "learning_rate": 1.733653256205673e-05, + "loss": 0.1314, + "num_input_tokens_seen": 49795672, + "step": 85840 + }, + { + "epoch": 12.785969615728328, + "grad_norm": 0.018671473488211632, + "learning_rate": 1.733343963835155e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49798968, + "step": 85845 + }, + { + "epoch": 12.786714328269287, + "grad_norm": 0.0028088847175240517, + "learning_rate": 1.733034684416511e-05, + "loss": 0.0866, + "num_input_tokens_seen": 49801848, + "step": 85850 + }, + { + "epoch": 12.787459040810248, + "grad_norm": 0.0116038853302598, + "learning_rate": 1.7327254179549674e-05, + "loss": 0.0191, + "num_input_tokens_seen": 49804888, + "step": 85855 + }, + { + "epoch": 12.788203753351207, + "grad_norm": 0.038771554827690125, + "learning_rate": 1.7324161644557472e-05, + "loss": 0.0046, + "num_input_tokens_seen": 49808024, + "step": 85860 + }, + { + "epoch": 12.788948465892165, + "grad_norm": 0.0005932456697337329, + "learning_rate": 1.7321069239240767e-05, + "loss": 0.0057, + "num_input_tokens_seen": 49810840, + "step": 85865 + }, + { + "epoch": 12.789693178433124, + "grad_norm": 0.48324888944625854, + "learning_rate": 1.7317976963651783e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49814040, + "step": 85870 + }, + { + "epoch": 12.790437890974085, + "grad_norm": 0.006349166855216026, + "learning_rate": 1.731488481784278e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49817112, + "step": 85875 + }, + { + "epoch": 12.791182603515043, + "grad_norm": 0.024959813803434372, + "learning_rate": 1.731179280186599e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49820408, + "step": 85880 + }, + { + "epoch": 12.791927316056002, + "grad_norm": 0.0033323164097964764, + "learning_rate": 1.730870091577363e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49823640, + "step": 85885 + }, + { + "epoch": 12.79267202859696, + "grad_norm": 0.012747218832373619, + "learning_rate": 1.7305609159617965e-05, + "loss": 0.001, + "num_input_tokens_seen": 49826616, + "step": 85890 + }, + { + "epoch": 12.793416741137921, + "grad_norm": 0.4919515550136566, + "learning_rate": 1.730251753345121e-05, + "loss": 0.0005, + "num_input_tokens_seen": 49829464, + "step": 85895 + }, + { + "epoch": 12.79416145367888, + "grad_norm": 0.015575661323964596, + "learning_rate": 1.72994260373256e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49832248, + "step": 85900 + }, + { + "epoch": 12.794906166219839, + "grad_norm": 11.935733795166016, + "learning_rate": 1.729633467129335e-05, + "loss": 0.0023, + "num_input_tokens_seen": 49835256, + "step": 85905 + }, + { + "epoch": 12.795650878760798, + "grad_norm": 0.004104229621589184, + "learning_rate": 1.7293243435406705e-05, + "loss": 0.0, + "num_input_tokens_seen": 49838264, + "step": 85910 + }, + { + "epoch": 12.796395591301758, + "grad_norm": 0.003840456251055002, + "learning_rate": 1.7290152329717873e-05, + "loss": 0.1274, + "num_input_tokens_seen": 49841176, + "step": 85915 + }, + { + "epoch": 12.797140303842717, + "grad_norm": 0.004501932300627232, + "learning_rate": 1.7287061354279088e-05, + "loss": 0.0945, + "num_input_tokens_seen": 49844184, + "step": 85920 + }, + { + "epoch": 12.797885016383676, + "grad_norm": 0.0008701806073077023, + "learning_rate": 1.7283970509142567e-05, + "loss": 0.0, + "num_input_tokens_seen": 49847000, + "step": 85925 + }, + { + "epoch": 12.798629728924634, + "grad_norm": 31.234416961669922, + "learning_rate": 1.728087979436051e-05, + "loss": 0.0148, + "num_input_tokens_seen": 49850168, + "step": 85930 + }, + { + "epoch": 12.799374441465595, + "grad_norm": 0.00022290690685622394, + "learning_rate": 1.7277789209985155e-05, + "loss": 0.0005, + "num_input_tokens_seen": 49853368, + "step": 85935 + }, + { + "epoch": 12.800119154006554, + "grad_norm": 0.001487645087763667, + "learning_rate": 1.727469875606869e-05, + "loss": 0.0333, + "num_input_tokens_seen": 49856536, + "step": 85940 + }, + { + "epoch": 12.800863866547513, + "grad_norm": 0.0023919357918202877, + "learning_rate": 1.727160843266335e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49859896, + "step": 85945 + }, + { + "epoch": 12.801608579088471, + "grad_norm": 0.002227624412626028, + "learning_rate": 1.7268518239821318e-05, + "loss": 0.0, + "num_input_tokens_seen": 49862776, + "step": 85950 + }, + { + "epoch": 12.802353291629432, + "grad_norm": 0.05178854614496231, + "learning_rate": 1.7265428177594822e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49865592, + "step": 85955 + }, + { + "epoch": 12.80309800417039, + "grad_norm": 0.047200124710798264, + "learning_rate": 1.7262338246036057e-05, + "loss": 0.019, + "num_input_tokens_seen": 49868536, + "step": 85960 + }, + { + "epoch": 12.80384271671135, + "grad_norm": 0.017823413014411926, + "learning_rate": 1.7259248445197217e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49871384, + "step": 85965 + }, + { + "epoch": 12.804587429252308, + "grad_norm": 0.006016066763550043, + "learning_rate": 1.7256158775130517e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49874232, + "step": 85970 + }, + { + "epoch": 12.805332141793269, + "grad_norm": 0.004846783820539713, + "learning_rate": 1.725306923588813e-05, + "loss": 0.1597, + "num_input_tokens_seen": 49877144, + "step": 85975 + }, + { + "epoch": 12.806076854334227, + "grad_norm": 0.4736813008785248, + "learning_rate": 1.7249979827522274e-05, + "loss": 0.146, + "num_input_tokens_seen": 49880344, + "step": 85980 + }, + { + "epoch": 12.806821566875186, + "grad_norm": 0.008291794918477535, + "learning_rate": 1.7246890550085122e-05, + "loss": 0.1688, + "num_input_tokens_seen": 49883256, + "step": 85985 + }, + { + "epoch": 12.807566279416145, + "grad_norm": 0.00025215791538357735, + "learning_rate": 1.7243801403628887e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49886296, + "step": 85990 + }, + { + "epoch": 12.808310991957104, + "grad_norm": 0.11184889823198318, + "learning_rate": 1.724071238820574e-05, + "loss": 0.0006, + "num_input_tokens_seen": 49889080, + "step": 85995 + }, + { + "epoch": 12.809055704498064, + "grad_norm": 0.06088162213563919, + "learning_rate": 1.723762350386787e-05, + "loss": 0.0005, + "num_input_tokens_seen": 49892056, + "step": 86000 + }, + { + "epoch": 12.809800417039023, + "grad_norm": 0.20062902569770813, + "learning_rate": 1.7234534750667468e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49894968, + "step": 86005 + }, + { + "epoch": 12.810545129579982, + "grad_norm": 0.0037692273035645485, + "learning_rate": 1.7231446128656693e-05, + "loss": 0.2345, + "num_input_tokens_seen": 49897784, + "step": 86010 + }, + { + "epoch": 12.811289842120942, + "grad_norm": 0.030806107446551323, + "learning_rate": 1.7228357637887755e-05, + "loss": 0.0057, + "num_input_tokens_seen": 49900600, + "step": 86015 + }, + { + "epoch": 12.812034554661901, + "grad_norm": 0.0006393928197212517, + "learning_rate": 1.7225269278412802e-05, + "loss": 0.0676, + "num_input_tokens_seen": 49903288, + "step": 86020 + }, + { + "epoch": 12.81277926720286, + "grad_norm": 0.007870725356042385, + "learning_rate": 1.7222181050284037e-05, + "loss": 0.0005, + "num_input_tokens_seen": 49906168, + "step": 86025 + }, + { + "epoch": 12.813523979743819, + "grad_norm": 26.213218688964844, + "learning_rate": 1.7219092953553612e-05, + "loss": 0.1535, + "num_input_tokens_seen": 49909272, + "step": 86030 + }, + { + "epoch": 12.814268692284777, + "grad_norm": 0.04391617700457573, + "learning_rate": 1.7216004988273706e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49912088, + "step": 86035 + }, + { + "epoch": 12.815013404825738, + "grad_norm": 0.0008751377463340759, + "learning_rate": 1.7212917154496488e-05, + "loss": 0.0033, + "num_input_tokens_seen": 49914936, + "step": 86040 + }, + { + "epoch": 12.815758117366697, + "grad_norm": 0.005639682058244944, + "learning_rate": 1.7209829452274108e-05, + "loss": 0.0018, + "num_input_tokens_seen": 49917624, + "step": 86045 + }, + { + "epoch": 12.816502829907655, + "grad_norm": 0.014424136839807034, + "learning_rate": 1.720674188165875e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49920312, + "step": 86050 + }, + { + "epoch": 12.817247542448614, + "grad_norm": 0.00830841064453125, + "learning_rate": 1.720365444270256e-05, + "loss": 0.0, + "num_input_tokens_seen": 49922968, + "step": 86055 + }, + { + "epoch": 12.817992254989575, + "grad_norm": 0.05327388644218445, + "learning_rate": 1.720056713545771e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49925912, + "step": 86060 + }, + { + "epoch": 12.818736967530533, + "grad_norm": 50.8437614440918, + "learning_rate": 1.7197479959976353e-05, + "loss": 0.0733, + "num_input_tokens_seen": 49928760, + "step": 86065 + }, + { + "epoch": 12.819481680071492, + "grad_norm": 0.014138366095721722, + "learning_rate": 1.719439291631064e-05, + "loss": 0.041, + "num_input_tokens_seen": 49931672, + "step": 86070 + }, + { + "epoch": 12.820226392612451, + "grad_norm": 0.55284583568573, + "learning_rate": 1.7191306004512723e-05, + "loss": 0.3677, + "num_input_tokens_seen": 49934552, + "step": 86075 + }, + { + "epoch": 12.820971105153411, + "grad_norm": 0.0014423667453229427, + "learning_rate": 1.7188219224634762e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49937848, + "step": 86080 + }, + { + "epoch": 12.82171581769437, + "grad_norm": 0.005591205786913633, + "learning_rate": 1.7185132576728898e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49940696, + "step": 86085 + }, + { + "epoch": 12.822460530235329, + "grad_norm": 0.006248617544770241, + "learning_rate": 1.718204606084726e-05, + "loss": 0.0136, + "num_input_tokens_seen": 49943480, + "step": 86090 + }, + { + "epoch": 12.823205242776288, + "grad_norm": 0.0036710998974740505, + "learning_rate": 1.717895967704202e-05, + "loss": 0.0002, + "num_input_tokens_seen": 49946136, + "step": 86095 + }, + { + "epoch": 12.823949955317248, + "grad_norm": 57.21271514892578, + "learning_rate": 1.7175873425365308e-05, + "loss": 0.0113, + "num_input_tokens_seen": 49949016, + "step": 86100 + }, + { + "epoch": 12.824694667858207, + "grad_norm": 0.004546158015727997, + "learning_rate": 1.7172787305869266e-05, + "loss": 0.0734, + "num_input_tokens_seen": 49951800, + "step": 86105 + }, + { + "epoch": 12.825439380399166, + "grad_norm": 0.0018952632090076804, + "learning_rate": 1.7169701318606014e-05, + "loss": 0.0009, + "num_input_tokens_seen": 49954520, + "step": 86110 + }, + { + "epoch": 12.826184092940125, + "grad_norm": 0.000212125014513731, + "learning_rate": 1.7166615463627712e-05, + "loss": 0.0005, + "num_input_tokens_seen": 49957400, + "step": 86115 + }, + { + "epoch": 12.826928805481085, + "grad_norm": 0.018448257818818092, + "learning_rate": 1.716352974098648e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49960568, + "step": 86120 + }, + { + "epoch": 12.827673518022044, + "grad_norm": 0.1913902759552002, + "learning_rate": 1.716044415073444e-05, + "loss": 0.1659, + "num_input_tokens_seen": 49963256, + "step": 86125 + }, + { + "epoch": 12.828418230563003, + "grad_norm": 0.006168971303850412, + "learning_rate": 1.7157358692923737e-05, + "loss": 0.0076, + "num_input_tokens_seen": 49966040, + "step": 86130 + }, + { + "epoch": 12.829162943103961, + "grad_norm": 0.0020248848013579845, + "learning_rate": 1.7154273367606484e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49968760, + "step": 86135 + }, + { + "epoch": 12.829907655644922, + "grad_norm": 0.0031085822265595198, + "learning_rate": 1.715118817483481e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49971544, + "step": 86140 + }, + { + "epoch": 12.83065236818588, + "grad_norm": 0.004790259059518576, + "learning_rate": 1.7148103114660825e-05, + "loss": 0.0003, + "num_input_tokens_seen": 49974232, + "step": 86145 + }, + { + "epoch": 12.83139708072684, + "grad_norm": 0.3917948305606842, + "learning_rate": 1.7145018187136668e-05, + "loss": 0.001, + "num_input_tokens_seen": 49977016, + "step": 86150 + }, + { + "epoch": 12.832141793267798, + "grad_norm": 0.01672595925629139, + "learning_rate": 1.7141933392314436e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49979928, + "step": 86155 + }, + { + "epoch": 12.832886505808759, + "grad_norm": 0.007926632650196552, + "learning_rate": 1.7138848730246264e-05, + "loss": 0.0011, + "num_input_tokens_seen": 49982904, + "step": 86160 + }, + { + "epoch": 12.833631218349717, + "grad_norm": 0.004228135105222464, + "learning_rate": 1.7135764200984253e-05, + "loss": 0.0004, + "num_input_tokens_seen": 49986008, + "step": 86165 + }, + { + "epoch": 12.834375930890676, + "grad_norm": 0.003537018084898591, + "learning_rate": 1.7132679804580505e-05, + "loss": 0.1346, + "num_input_tokens_seen": 49988856, + "step": 86170 + }, + { + "epoch": 12.835120643431635, + "grad_norm": 1.805498480796814, + "learning_rate": 1.7129595541087146e-05, + "loss": 0.0032, + "num_input_tokens_seen": 49991608, + "step": 86175 + }, + { + "epoch": 12.835865355972594, + "grad_norm": 0.021270744502544403, + "learning_rate": 1.7126511410556256e-05, + "loss": 0.0001, + "num_input_tokens_seen": 49994616, + "step": 86180 + }, + { + "epoch": 12.836610068513554, + "grad_norm": 0.013442466966807842, + "learning_rate": 1.7123427413039967e-05, + "loss": 0.0005, + "num_input_tokens_seen": 49997720, + "step": 86185 + }, + { + "epoch": 12.837354781054513, + "grad_norm": 0.0039382497780025005, + "learning_rate": 1.712034354859036e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50000440, + "step": 86190 + }, + { + "epoch": 12.838099493595472, + "grad_norm": 0.0003815798554569483, + "learning_rate": 1.711725981725954e-05, + "loss": 0.0, + "num_input_tokens_seen": 50003480, + "step": 86195 + }, + { + "epoch": 12.83884420613643, + "grad_norm": 0.0009653366869315505, + "learning_rate": 1.7114176219099607e-05, + "loss": 0.0032, + "num_input_tokens_seen": 50006200, + "step": 86200 + }, + { + "epoch": 12.839588918677391, + "grad_norm": 0.002132205292582512, + "learning_rate": 1.711109275416265e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50009112, + "step": 86205 + }, + { + "epoch": 12.84033363121835, + "grad_norm": 0.0028617915231734514, + "learning_rate": 1.7108009422500767e-05, + "loss": 0.0246, + "num_input_tokens_seen": 50012056, + "step": 86210 + }, + { + "epoch": 12.841078343759309, + "grad_norm": 10.362509727478027, + "learning_rate": 1.7104926224166033e-05, + "loss": 0.0065, + "num_input_tokens_seen": 50014936, + "step": 86215 + }, + { + "epoch": 12.841823056300267, + "grad_norm": 0.04725867137312889, + "learning_rate": 1.7101843159210556e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50017912, + "step": 86220 + }, + { + "epoch": 12.842567768841228, + "grad_norm": 0.027194542810320854, + "learning_rate": 1.709876022768641e-05, + "loss": 0.3519, + "num_input_tokens_seen": 50020824, + "step": 86225 + }, + { + "epoch": 12.843312481382187, + "grad_norm": 0.00665623415261507, + "learning_rate": 1.7095677429645682e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50024056, + "step": 86230 + }, + { + "epoch": 12.844057193923145, + "grad_norm": 3.516364574432373, + "learning_rate": 1.709259476514044e-05, + "loss": 0.0003, + "num_input_tokens_seen": 50026968, + "step": 86235 + }, + { + "epoch": 12.844801906464104, + "grad_norm": 0.012159540317952633, + "learning_rate": 1.7089512234222783e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50030168, + "step": 86240 + }, + { + "epoch": 12.845546619005065, + "grad_norm": 5.47294282913208, + "learning_rate": 1.7086429836944777e-05, + "loss": 0.0141, + "num_input_tokens_seen": 50033144, + "step": 86245 + }, + { + "epoch": 12.846291331546023, + "grad_norm": 2.262056350708008, + "learning_rate": 1.7083347573358484e-05, + "loss": 0.014, + "num_input_tokens_seen": 50036056, + "step": 86250 + }, + { + "epoch": 12.847036044086982, + "grad_norm": 3.165961742401123, + "learning_rate": 1.7080265443516e-05, + "loss": 0.068, + "num_input_tokens_seen": 50038968, + "step": 86255 + }, + { + "epoch": 12.847780756627941, + "grad_norm": 0.008329992182552814, + "learning_rate": 1.7077183447469376e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50041912, + "step": 86260 + }, + { + "epoch": 12.848525469168901, + "grad_norm": 0.038082800805568695, + "learning_rate": 1.7074101585270692e-05, + "loss": 0.0011, + "num_input_tokens_seen": 50044760, + "step": 86265 + }, + { + "epoch": 12.84927018170986, + "grad_norm": 0.006830411031842232, + "learning_rate": 1.7071019856971993e-05, + "loss": 0.0, + "num_input_tokens_seen": 50047256, + "step": 86270 + }, + { + "epoch": 12.850014894250819, + "grad_norm": 0.000621574989054352, + "learning_rate": 1.7067938262625364e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50050104, + "step": 86275 + }, + { + "epoch": 12.850759606791778, + "grad_norm": 0.00042930597555823624, + "learning_rate": 1.7064856802282865e-05, + "loss": 0.024, + "num_input_tokens_seen": 50053176, + "step": 86280 + }, + { + "epoch": 12.851504319332738, + "grad_norm": 0.0005573154194280505, + "learning_rate": 1.706177547599653e-05, + "loss": 0.0, + "num_input_tokens_seen": 50056088, + "step": 86285 + }, + { + "epoch": 12.852249031873697, + "grad_norm": 7.791140079498291, + "learning_rate": 1.7058694283818437e-05, + "loss": 0.0293, + "num_input_tokens_seen": 50059224, + "step": 86290 + }, + { + "epoch": 12.852993744414656, + "grad_norm": 29.481191635131836, + "learning_rate": 1.705561322580063e-05, + "loss": 0.1946, + "num_input_tokens_seen": 50061944, + "step": 86295 + }, + { + "epoch": 12.853738456955615, + "grad_norm": 0.0006435224204324186, + "learning_rate": 1.7052532301995168e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50064824, + "step": 86300 + }, + { + "epoch": 12.854483169496575, + "grad_norm": 0.108548603951931, + "learning_rate": 1.7049451512454085e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50067768, + "step": 86305 + }, + { + "epoch": 12.855227882037534, + "grad_norm": 126.7125244140625, + "learning_rate": 1.704637085722945e-05, + "loss": 0.2449, + "num_input_tokens_seen": 50070488, + "step": 86310 + }, + { + "epoch": 12.855972594578493, + "grad_norm": 28.75945281982422, + "learning_rate": 1.7043290336373286e-05, + "loss": 0.1293, + "num_input_tokens_seen": 50073176, + "step": 86315 + }, + { + "epoch": 12.856717307119451, + "grad_norm": 0.009517334401607513, + "learning_rate": 1.7040209949937653e-05, + "loss": 0.1221, + "num_input_tokens_seen": 50075928, + "step": 86320 + }, + { + "epoch": 12.857462019660412, + "grad_norm": 0.00051517115207389, + "learning_rate": 1.7037129697974585e-05, + "loss": 0.0, + "num_input_tokens_seen": 50078840, + "step": 86325 + }, + { + "epoch": 12.85820673220137, + "grad_norm": 0.0016343597089871764, + "learning_rate": 1.703404958053611e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50082360, + "step": 86330 + }, + { + "epoch": 12.85895144474233, + "grad_norm": 0.00838403683155775, + "learning_rate": 1.703096959767428e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50085176, + "step": 86335 + }, + { + "epoch": 12.859696157283288, + "grad_norm": 0.20560777187347412, + "learning_rate": 1.7027889749441108e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50088184, + "step": 86340 + }, + { + "epoch": 12.860440869824249, + "grad_norm": 7.50889812479727e-05, + "learning_rate": 1.702481003588864e-05, + "loss": 0.0406, + "num_input_tokens_seen": 50091128, + "step": 86345 + }, + { + "epoch": 12.861185582365207, + "grad_norm": 0.0015572767006233335, + "learning_rate": 1.7021730457068898e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50094360, + "step": 86350 + }, + { + "epoch": 12.861930294906166, + "grad_norm": 0.04274682328104973, + "learning_rate": 1.701865101303392e-05, + "loss": 0.0007, + "num_input_tokens_seen": 50097400, + "step": 86355 + }, + { + "epoch": 12.862675007447125, + "grad_norm": 0.006334968842566013, + "learning_rate": 1.701557170383572e-05, + "loss": 0.1721, + "num_input_tokens_seen": 50100248, + "step": 86360 + }, + { + "epoch": 12.863419719988084, + "grad_norm": 0.053290557116270065, + "learning_rate": 1.7012492529526315e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50103032, + "step": 86365 + }, + { + "epoch": 12.864164432529044, + "grad_norm": 0.00454667117446661, + "learning_rate": 1.700941349015774e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50106040, + "step": 86370 + }, + { + "epoch": 12.864909145070003, + "grad_norm": 0.0023605742026120424, + "learning_rate": 1.7006334585781986e-05, + "loss": 0.0, + "num_input_tokens_seen": 50108920, + "step": 86375 + }, + { + "epoch": 12.865653857610962, + "grad_norm": 0.4143233895301819, + "learning_rate": 1.7003255816451098e-05, + "loss": 0.092, + "num_input_tokens_seen": 50111960, + "step": 86380 + }, + { + "epoch": 12.86639857015192, + "grad_norm": 0.004598559346050024, + "learning_rate": 1.7000177182217066e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50114648, + "step": 86385 + }, + { + "epoch": 12.867143282692881, + "grad_norm": 0.05018145591020584, + "learning_rate": 1.6997098683131918e-05, + "loss": 0.1663, + "num_input_tokens_seen": 50117592, + "step": 86390 + }, + { + "epoch": 12.86788799523384, + "grad_norm": 0.013786939904093742, + "learning_rate": 1.699402031924765e-05, + "loss": 0.0, + "num_input_tokens_seen": 50120472, + "step": 86395 + }, + { + "epoch": 12.868632707774799, + "grad_norm": 0.0483006052672863, + "learning_rate": 1.699094209061628e-05, + "loss": 0.2329, + "num_input_tokens_seen": 50123096, + "step": 86400 + }, + { + "epoch": 12.869377420315757, + "grad_norm": 0.0043522948399186134, + "learning_rate": 1.69878639972898e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50125976, + "step": 86405 + }, + { + "epoch": 12.870122132856718, + "grad_norm": 0.0020647775381803513, + "learning_rate": 1.6984786039320207e-05, + "loss": 0.0, + "num_input_tokens_seen": 50128856, + "step": 86410 + }, + { + "epoch": 12.870866845397677, + "grad_norm": 0.0006877705454826355, + "learning_rate": 1.6981708216759515e-05, + "loss": 0.0, + "num_input_tokens_seen": 50131704, + "step": 86415 + }, + { + "epoch": 12.871611557938635, + "grad_norm": 251.8816375732422, + "learning_rate": 1.6978630529659706e-05, + "loss": 0.0692, + "num_input_tokens_seen": 50134680, + "step": 86420 + }, + { + "epoch": 12.872356270479594, + "grad_norm": 0.003554023802280426, + "learning_rate": 1.697555297807279e-05, + "loss": 0.0082, + "num_input_tokens_seen": 50137656, + "step": 86425 + }, + { + "epoch": 12.873100983020555, + "grad_norm": 0.017301447689533234, + "learning_rate": 1.6972475562050744e-05, + "loss": 0.0, + "num_input_tokens_seen": 50140952, + "step": 86430 + }, + { + "epoch": 12.873845695561513, + "grad_norm": 0.009612548165023327, + "learning_rate": 1.6969398281645572e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50143640, + "step": 86435 + }, + { + "epoch": 12.874590408102472, + "grad_norm": 0.00020406300609465688, + "learning_rate": 1.696632113690924e-05, + "loss": 0.0029, + "num_input_tokens_seen": 50146456, + "step": 86440 + }, + { + "epoch": 12.875335120643431, + "grad_norm": 0.003960699308663607, + "learning_rate": 1.6963244127893763e-05, + "loss": 0.1287, + "num_input_tokens_seen": 50149336, + "step": 86445 + }, + { + "epoch": 12.876079833184392, + "grad_norm": 0.006540250964462757, + "learning_rate": 1.6960167254651105e-05, + "loss": 0.1742, + "num_input_tokens_seen": 50151992, + "step": 86450 + }, + { + "epoch": 12.87682454572535, + "grad_norm": 0.0064406925812363625, + "learning_rate": 1.6957090517233242e-05, + "loss": 0.0, + "num_input_tokens_seen": 50155128, + "step": 86455 + }, + { + "epoch": 12.877569258266309, + "grad_norm": 0.00408166041597724, + "learning_rate": 1.6954013915692167e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50158232, + "step": 86460 + }, + { + "epoch": 12.878313970807268, + "grad_norm": 0.024215713143348694, + "learning_rate": 1.695093745007985e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50161080, + "step": 86465 + }, + { + "epoch": 12.879058683348228, + "grad_norm": 0.009610737673938274, + "learning_rate": 1.6947861120448262e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50164376, + "step": 86470 + }, + { + "epoch": 12.879803395889187, + "grad_norm": 0.017495047301054, + "learning_rate": 1.694478492684937e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50167608, + "step": 86475 + }, + { + "epoch": 12.880548108430146, + "grad_norm": 0.0428953692317009, + "learning_rate": 1.694170886933516e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50170488, + "step": 86480 + }, + { + "epoch": 12.881292820971105, + "grad_norm": 0.00047532340977340937, + "learning_rate": 1.693863294795759e-05, + "loss": 0.0038, + "num_input_tokens_seen": 50173624, + "step": 86485 + }, + { + "epoch": 12.882037533512065, + "grad_norm": 0.0013832873664796352, + "learning_rate": 1.6935557162768612e-05, + "loss": 0.11, + "num_input_tokens_seen": 50176792, + "step": 86490 + }, + { + "epoch": 12.882782246053024, + "grad_norm": 0.0007419013418257236, + "learning_rate": 1.693248151382021e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50179704, + "step": 86495 + }, + { + "epoch": 12.883526958593983, + "grad_norm": 0.0045373765751719475, + "learning_rate": 1.6929406001164325e-05, + "loss": 0.0, + "num_input_tokens_seen": 50182744, + "step": 86500 + }, + { + "epoch": 12.884271671134941, + "grad_norm": 0.0021338530350476503, + "learning_rate": 1.6926330624852932e-05, + "loss": 0.0352, + "num_input_tokens_seen": 50185528, + "step": 86505 + }, + { + "epoch": 12.8850163836759, + "grad_norm": 0.0016047859098762274, + "learning_rate": 1.6923255384937963e-05, + "loss": 0.1004, + "num_input_tokens_seen": 50188536, + "step": 86510 + }, + { + "epoch": 12.88576109621686, + "grad_norm": 0.005334163550287485, + "learning_rate": 1.69201802814714e-05, + "loss": 0.0402, + "num_input_tokens_seen": 50191224, + "step": 86515 + }, + { + "epoch": 12.88650580875782, + "grad_norm": 0.001974907238036394, + "learning_rate": 1.691710531450517e-05, + "loss": 0.1228, + "num_input_tokens_seen": 50194072, + "step": 86520 + }, + { + "epoch": 12.887250521298778, + "grad_norm": 0.023885494098067284, + "learning_rate": 1.6914030484091235e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50197144, + "step": 86525 + }, + { + "epoch": 12.887995233839739, + "grad_norm": 0.005337268579751253, + "learning_rate": 1.6910955790281538e-05, + "loss": 0.001, + "num_input_tokens_seen": 50199896, + "step": 86530 + }, + { + "epoch": 12.888739946380698, + "grad_norm": 0.0011761734494939446, + "learning_rate": 1.690788123312802e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50202616, + "step": 86535 + }, + { + "epoch": 12.889484658921656, + "grad_norm": 0.017417822033166885, + "learning_rate": 1.6904806812682628e-05, + "loss": 0.0008, + "num_input_tokens_seen": 50205432, + "step": 86540 + }, + { + "epoch": 12.890229371462615, + "grad_norm": 0.0012965736677870154, + "learning_rate": 1.6901732528997282e-05, + "loss": 0.0, + "num_input_tokens_seen": 50208184, + "step": 86545 + }, + { + "epoch": 12.890974084003574, + "grad_norm": 0.0021399366669356823, + "learning_rate": 1.689865838212395e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50211224, + "step": 86550 + }, + { + "epoch": 12.891718796544534, + "grad_norm": 17.99346160888672, + "learning_rate": 1.6895584372114544e-05, + "loss": 0.0664, + "num_input_tokens_seen": 50214168, + "step": 86555 + }, + { + "epoch": 12.892463509085493, + "grad_norm": 0.00259318295866251, + "learning_rate": 1.689251049902101e-05, + "loss": 0.001, + "num_input_tokens_seen": 50217080, + "step": 86560 + }, + { + "epoch": 12.893208221626452, + "grad_norm": 0.0017322689527645707, + "learning_rate": 1.6889436762895267e-05, + "loss": 0.1439, + "num_input_tokens_seen": 50220344, + "step": 86565 + }, + { + "epoch": 12.89395293416741, + "grad_norm": 0.001178123289719224, + "learning_rate": 1.6886363163789243e-05, + "loss": 0.0011, + "num_input_tokens_seen": 50223352, + "step": 86570 + }, + { + "epoch": 12.894697646708371, + "grad_norm": 0.0021158375311642885, + "learning_rate": 1.6883289701754872e-05, + "loss": 0.0108, + "num_input_tokens_seen": 50226136, + "step": 86575 + }, + { + "epoch": 12.89544235924933, + "grad_norm": 0.01359441690146923, + "learning_rate": 1.6880216376844066e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50229240, + "step": 86580 + }, + { + "epoch": 12.896187071790289, + "grad_norm": 0.0009166504023596644, + "learning_rate": 1.6877143189108758e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50232152, + "step": 86585 + }, + { + "epoch": 12.896931784331247, + "grad_norm": 0.0001080312576959841, + "learning_rate": 1.6874070138600855e-05, + "loss": 0.0, + "num_input_tokens_seen": 50234968, + "step": 86590 + }, + { + "epoch": 12.897676496872208, + "grad_norm": 0.0002656692231539637, + "learning_rate": 1.6870997225372286e-05, + "loss": 0.0011, + "num_input_tokens_seen": 50237880, + "step": 86595 + }, + { + "epoch": 12.898421209413167, + "grad_norm": 0.0009070205851458013, + "learning_rate": 1.686792444947494e-05, + "loss": 0.222, + "num_input_tokens_seen": 50240696, + "step": 86600 + }, + { + "epoch": 12.899165921954125, + "grad_norm": 0.04910476878285408, + "learning_rate": 1.6864851810960763e-05, + "loss": 0.0004, + "num_input_tokens_seen": 50243704, + "step": 86605 + }, + { + "epoch": 12.899910634495084, + "grad_norm": 0.003691155230626464, + "learning_rate": 1.6861779309881648e-05, + "loss": 0.1283, + "num_input_tokens_seen": 50247032, + "step": 86610 + }, + { + "epoch": 12.900655347036045, + "grad_norm": 0.0006251081940717995, + "learning_rate": 1.6858706946289486e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50249912, + "step": 86615 + }, + { + "epoch": 12.901400059577004, + "grad_norm": 0.0028502324130386114, + "learning_rate": 1.6855634720236206e-05, + "loss": 0.0, + "num_input_tokens_seen": 50253176, + "step": 86620 + }, + { + "epoch": 12.902144772117962, + "grad_norm": 0.004704058635979891, + "learning_rate": 1.6852562631773694e-05, + "loss": 0.0114, + "num_input_tokens_seen": 50255896, + "step": 86625 + }, + { + "epoch": 12.902889484658921, + "grad_norm": 0.027564095333218575, + "learning_rate": 1.684949068095386e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50259160, + "step": 86630 + }, + { + "epoch": 12.903634197199882, + "grad_norm": 0.000305664143525064, + "learning_rate": 1.684641886782859e-05, + "loss": 0.0003, + "num_input_tokens_seen": 50262008, + "step": 86635 + }, + { + "epoch": 12.90437890974084, + "grad_norm": 1.44496488571167, + "learning_rate": 1.6843347192449793e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50264760, + "step": 86640 + }, + { + "epoch": 12.905123622281799, + "grad_norm": 0.0001687975600361824, + "learning_rate": 1.6840275654869358e-05, + "loss": 0.0214, + "num_input_tokens_seen": 50267768, + "step": 86645 + }, + { + "epoch": 12.905868334822758, + "grad_norm": 47.538299560546875, + "learning_rate": 1.6837204255139164e-05, + "loss": 0.2065, + "num_input_tokens_seen": 50270744, + "step": 86650 + }, + { + "epoch": 12.906613047363718, + "grad_norm": 0.002748476807028055, + "learning_rate": 1.6834132993311115e-05, + "loss": 0.0124, + "num_input_tokens_seen": 50273880, + "step": 86655 + }, + { + "epoch": 12.907357759904677, + "grad_norm": 0.02056274749338627, + "learning_rate": 1.6831061869437086e-05, + "loss": 0.1408, + "num_input_tokens_seen": 50276760, + "step": 86660 + }, + { + "epoch": 12.908102472445636, + "grad_norm": 0.6319087743759155, + "learning_rate": 1.6827990883568966e-05, + "loss": 0.0534, + "num_input_tokens_seen": 50279736, + "step": 86665 + }, + { + "epoch": 12.908847184986595, + "grad_norm": 0.007732930593192577, + "learning_rate": 1.6824920035758628e-05, + "loss": 0.0138, + "num_input_tokens_seen": 50282392, + "step": 86670 + }, + { + "epoch": 12.909591897527555, + "grad_norm": 0.09403684735298157, + "learning_rate": 1.6821849326057963e-05, + "loss": 0.0003, + "num_input_tokens_seen": 50285016, + "step": 86675 + }, + { + "epoch": 12.910336610068514, + "grad_norm": 0.0005306536331772804, + "learning_rate": 1.681877875451884e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50288088, + "step": 86680 + }, + { + "epoch": 12.911081322609473, + "grad_norm": 0.002908022841438651, + "learning_rate": 1.681570832119314e-05, + "loss": 0.0, + "num_input_tokens_seen": 50291288, + "step": 86685 + }, + { + "epoch": 12.911826035150431, + "grad_norm": 0.0007169267046265304, + "learning_rate": 1.6812638026132728e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50294136, + "step": 86690 + }, + { + "epoch": 12.91257074769139, + "grad_norm": 0.00452067144215107, + "learning_rate": 1.680956786938947e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50297016, + "step": 86695 + }, + { + "epoch": 12.91331546023235, + "grad_norm": 0.002037615282461047, + "learning_rate": 1.6806497851015246e-05, + "loss": 0.1752, + "num_input_tokens_seen": 50299832, + "step": 86700 + }, + { + "epoch": 12.91406017277331, + "grad_norm": 3.85032320022583, + "learning_rate": 1.68034279710619e-05, + "loss": 0.1324, + "num_input_tokens_seen": 50302712, + "step": 86705 + }, + { + "epoch": 12.914804885314268, + "grad_norm": 0.005739120300859213, + "learning_rate": 1.6800358229581326e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50305624, + "step": 86710 + }, + { + "epoch": 12.915549597855229, + "grad_norm": 0.00301033491268754, + "learning_rate": 1.6797288626625345e-05, + "loss": 0.0003, + "num_input_tokens_seen": 50308600, + "step": 86715 + }, + { + "epoch": 12.916294310396188, + "grad_norm": 0.0028737778775393963, + "learning_rate": 1.6794219162245855e-05, + "loss": 0.2, + "num_input_tokens_seen": 50311512, + "step": 86720 + }, + { + "epoch": 12.917039022937146, + "grad_norm": 42.536128997802734, + "learning_rate": 1.679114983649469e-05, + "loss": 0.2876, + "num_input_tokens_seen": 50314424, + "step": 86725 + }, + { + "epoch": 12.917783735478105, + "grad_norm": 0.0050727324560284615, + "learning_rate": 1.6788080649423696e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50317208, + "step": 86730 + }, + { + "epoch": 12.918528448019064, + "grad_norm": 0.004470780026167631, + "learning_rate": 1.678501160108474e-05, + "loss": 0.0675, + "num_input_tokens_seen": 50319896, + "step": 86735 + }, + { + "epoch": 12.919273160560024, + "grad_norm": 0.004396403674036264, + "learning_rate": 1.6781942691529656e-05, + "loss": 0.2198, + "num_input_tokens_seen": 50323288, + "step": 86740 + }, + { + "epoch": 12.920017873100983, + "grad_norm": 0.0016513812588527799, + "learning_rate": 1.6778873920810305e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50326104, + "step": 86745 + }, + { + "epoch": 12.920762585641942, + "grad_norm": 0.00034514954313635826, + "learning_rate": 1.6775805288978517e-05, + "loss": 0.0, + "num_input_tokens_seen": 50328824, + "step": 86750 + }, + { + "epoch": 12.9215072981829, + "grad_norm": 0.0015195629093796015, + "learning_rate": 1.6772736796086146e-05, + "loss": 0.2157, + "num_input_tokens_seen": 50331960, + "step": 86755 + }, + { + "epoch": 12.922252010723861, + "grad_norm": 0.0023241920862346888, + "learning_rate": 1.676966844218502e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50335448, + "step": 86760 + }, + { + "epoch": 12.92299672326482, + "grad_norm": 0.03718443214893341, + "learning_rate": 1.676660022732699e-05, + "loss": 0.0868, + "num_input_tokens_seen": 50338648, + "step": 86765 + }, + { + "epoch": 12.923741435805779, + "grad_norm": 0.07580795884132385, + "learning_rate": 1.6763532151563878e-05, + "loss": 0.0003, + "num_input_tokens_seen": 50341400, + "step": 86770 + }, + { + "epoch": 12.924486148346737, + "grad_norm": 0.013905137777328491, + "learning_rate": 1.676046421494751e-05, + "loss": 0.0004, + "num_input_tokens_seen": 50344120, + "step": 86775 + }, + { + "epoch": 12.925230860887698, + "grad_norm": 0.019777415320277214, + "learning_rate": 1.6757396417529735e-05, + "loss": 0.0591, + "num_input_tokens_seen": 50347096, + "step": 86780 + }, + { + "epoch": 12.925975573428657, + "grad_norm": 0.13293282687664032, + "learning_rate": 1.675432875936236e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50350104, + "step": 86785 + }, + { + "epoch": 12.926720285969616, + "grad_norm": 0.0005378664936870337, + "learning_rate": 1.6751261240497228e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50353080, + "step": 86790 + }, + { + "epoch": 12.927464998510574, + "grad_norm": 0.0002669515961315483, + "learning_rate": 1.6748193860986152e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50355896, + "step": 86795 + }, + { + "epoch": 12.928209711051535, + "grad_norm": 0.004892590455710888, + "learning_rate": 1.674512662088096e-05, + "loss": 0.001, + "num_input_tokens_seen": 50358680, + "step": 86800 + }, + { + "epoch": 12.928954423592494, + "grad_norm": 0.020253529772162437, + "learning_rate": 1.674205952023346e-05, + "loss": 0.3126, + "num_input_tokens_seen": 50361944, + "step": 86805 + }, + { + "epoch": 12.929699136133452, + "grad_norm": 0.005324371624737978, + "learning_rate": 1.6738992559095462e-05, + "loss": 0.1129, + "num_input_tokens_seen": 50365080, + "step": 86810 + }, + { + "epoch": 12.930443848674411, + "grad_norm": 0.02027498558163643, + "learning_rate": 1.67359257375188e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50367864, + "step": 86815 + }, + { + "epoch": 12.931188561215372, + "grad_norm": 5.247127532958984, + "learning_rate": 1.673285905555526e-05, + "loss": 0.0258, + "num_input_tokens_seen": 50370744, + "step": 86820 + }, + { + "epoch": 12.93193327375633, + "grad_norm": 0.0034861492458730936, + "learning_rate": 1.6729792513256682e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50373464, + "step": 86825 + }, + { + "epoch": 12.93267798629729, + "grad_norm": 0.22048792243003845, + "learning_rate": 1.672672611067484e-05, + "loss": 0.0006, + "num_input_tokens_seen": 50376376, + "step": 86830 + }, + { + "epoch": 12.933422698838248, + "grad_norm": 18.44155502319336, + "learning_rate": 1.672365984786156e-05, + "loss": 0.1037, + "num_input_tokens_seen": 50379448, + "step": 86835 + }, + { + "epoch": 12.934167411379208, + "grad_norm": 1.5086859464645386, + "learning_rate": 1.6720593724868626e-05, + "loss": 0.001, + "num_input_tokens_seen": 50382392, + "step": 86840 + }, + { + "epoch": 12.934912123920167, + "grad_norm": 0.01302889734506607, + "learning_rate": 1.6717527741747857e-05, + "loss": 0.0005, + "num_input_tokens_seen": 50385400, + "step": 86845 + }, + { + "epoch": 12.935656836461126, + "grad_norm": 0.1133202463388443, + "learning_rate": 1.6714461898551037e-05, + "loss": 0.0057, + "num_input_tokens_seen": 50388472, + "step": 86850 + }, + { + "epoch": 12.936401549002085, + "grad_norm": 0.020882418379187584, + "learning_rate": 1.6711396195329955e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50391000, + "step": 86855 + }, + { + "epoch": 12.937146261543045, + "grad_norm": 0.009539092890918255, + "learning_rate": 1.670833063213642e-05, + "loss": 0.0, + "num_input_tokens_seen": 50394200, + "step": 86860 + }, + { + "epoch": 12.937890974084004, + "grad_norm": 0.0026948750019073486, + "learning_rate": 1.6705265209022204e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50396888, + "step": 86865 + }, + { + "epoch": 12.938635686624963, + "grad_norm": 0.0007690303027629852, + "learning_rate": 1.6702199926039107e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50399736, + "step": 86870 + }, + { + "epoch": 12.939380399165922, + "grad_norm": 0.009897586889564991, + "learning_rate": 1.66991347832389e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50403192, + "step": 86875 + }, + { + "epoch": 12.94012511170688, + "grad_norm": 0.027142371982336044, + "learning_rate": 1.669606978067338e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50406232, + "step": 86880 + }, + { + "epoch": 12.94086982424784, + "grad_norm": 0.05261221528053284, + "learning_rate": 1.669300491839433e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50409400, + "step": 86885 + }, + { + "epoch": 12.9416145367888, + "grad_norm": 0.0008184011094272137, + "learning_rate": 1.6689940196453507e-05, + "loss": 0.0, + "num_input_tokens_seen": 50412632, + "step": 86890 + }, + { + "epoch": 12.942359249329758, + "grad_norm": 0.00018492553499527276, + "learning_rate": 1.66868756149027e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50415448, + "step": 86895 + }, + { + "epoch": 12.943103961870717, + "grad_norm": 0.006960802711546421, + "learning_rate": 1.668381117379368e-05, + "loss": 0.0, + "num_input_tokens_seen": 50418488, + "step": 86900 + }, + { + "epoch": 12.943848674411678, + "grad_norm": 0.0015094481641426682, + "learning_rate": 1.6680746873178225e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50421656, + "step": 86905 + }, + { + "epoch": 12.944593386952636, + "grad_norm": 0.03018202632665634, + "learning_rate": 1.6677682713108082e-05, + "loss": 0.0004, + "num_input_tokens_seen": 50424760, + "step": 86910 + }, + { + "epoch": 12.945338099493595, + "grad_norm": 0.0008246718207374215, + "learning_rate": 1.6674618693635047e-05, + "loss": 0.0, + "num_input_tokens_seen": 50427640, + "step": 86915 + }, + { + "epoch": 12.946082812034554, + "grad_norm": 2.8014941215515137, + "learning_rate": 1.6671554814810857e-05, + "loss": 0.0007, + "num_input_tokens_seen": 50430616, + "step": 86920 + }, + { + "epoch": 12.946827524575514, + "grad_norm": 149.14492797851562, + "learning_rate": 1.6668491076687294e-05, + "loss": 0.2501, + "num_input_tokens_seen": 50433368, + "step": 86925 + }, + { + "epoch": 12.947572237116473, + "grad_norm": 0.09325306117534637, + "learning_rate": 1.66654274793161e-05, + "loss": 0.0533, + "num_input_tokens_seen": 50436024, + "step": 86930 + }, + { + "epoch": 12.948316949657432, + "grad_norm": 0.0029249214567244053, + "learning_rate": 1.6662364022749035e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50438904, + "step": 86935 + }, + { + "epoch": 12.94906166219839, + "grad_norm": 0.001648407313041389, + "learning_rate": 1.6659300707037864e-05, + "loss": 0.0007, + "num_input_tokens_seen": 50441880, + "step": 86940 + }, + { + "epoch": 12.949806374739351, + "grad_norm": 0.010837684385478497, + "learning_rate": 1.665623753223432e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50445208, + "step": 86945 + }, + { + "epoch": 12.95055108728031, + "grad_norm": 0.0012530346866697073, + "learning_rate": 1.6653174498390172e-05, + "loss": 0.0088, + "num_input_tokens_seen": 50448248, + "step": 86950 + }, + { + "epoch": 12.951295799821269, + "grad_norm": 0.012232079170644283, + "learning_rate": 1.665011160555715e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50451160, + "step": 86955 + }, + { + "epoch": 12.952040512362228, + "grad_norm": 0.0005931376363150775, + "learning_rate": 1.6647048853787018e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50453912, + "step": 86960 + }, + { + "epoch": 12.952785224903188, + "grad_norm": 0.15605100989341736, + "learning_rate": 1.6643986243131497e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50456696, + "step": 86965 + }, + { + "epoch": 12.953529937444147, + "grad_norm": 0.0053127543069422245, + "learning_rate": 1.664092377364233e-05, + "loss": 0.1566, + "num_input_tokens_seen": 50459256, + "step": 86970 + }, + { + "epoch": 12.954274649985106, + "grad_norm": 0.006871982477605343, + "learning_rate": 1.663786144537127e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50461976, + "step": 86975 + }, + { + "epoch": 12.955019362526064, + "grad_norm": 0.0012772228801622987, + "learning_rate": 1.6634799258370036e-05, + "loss": 0.0, + "num_input_tokens_seen": 50464856, + "step": 86980 + }, + { + "epoch": 12.955764075067025, + "grad_norm": 0.004697101190686226, + "learning_rate": 1.6631737212690373e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50467704, + "step": 86985 + }, + { + "epoch": 12.956508787607984, + "grad_norm": 0.09939850121736526, + "learning_rate": 1.6628675308384e-05, + "loss": 0.0003, + "num_input_tokens_seen": 50470232, + "step": 86990 + }, + { + "epoch": 12.957253500148942, + "grad_norm": 0.007014528848230839, + "learning_rate": 1.6625613545502653e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50473080, + "step": 86995 + }, + { + "epoch": 12.957998212689901, + "grad_norm": 0.0014583561569452286, + "learning_rate": 1.6622551924098046e-05, + "loss": 0.1534, + "num_input_tokens_seen": 50475896, + "step": 87000 + }, + { + "epoch": 12.958742925230862, + "grad_norm": 0.06573514640331268, + "learning_rate": 1.6619490444221918e-05, + "loss": 0.0469, + "num_input_tokens_seen": 50478968, + "step": 87005 + }, + { + "epoch": 12.95948763777182, + "grad_norm": 0.00022695737425237894, + "learning_rate": 1.6616429105925978e-05, + "loss": 0.0005, + "num_input_tokens_seen": 50481816, + "step": 87010 + }, + { + "epoch": 12.96023235031278, + "grad_norm": 0.00037276087095960975, + "learning_rate": 1.6613367909261946e-05, + "loss": 0.0036, + "num_input_tokens_seen": 50484568, + "step": 87015 + }, + { + "epoch": 12.960977062853738, + "grad_norm": 0.0021404840517789125, + "learning_rate": 1.6610306854281542e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50487416, + "step": 87020 + }, + { + "epoch": 12.961721775394697, + "grad_norm": 0.0009988384554162621, + "learning_rate": 1.6607245941036476e-05, + "loss": 0.0041, + "num_input_tokens_seen": 50490456, + "step": 87025 + }, + { + "epoch": 12.962466487935657, + "grad_norm": 0.009208780713379383, + "learning_rate": 1.660418516957846e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50493336, + "step": 87030 + }, + { + "epoch": 12.963211200476616, + "grad_norm": 0.011213668622076511, + "learning_rate": 1.66011245399592e-05, + "loss": 0.0088, + "num_input_tokens_seen": 50496408, + "step": 87035 + }, + { + "epoch": 12.963955913017575, + "grad_norm": 30.224584579467773, + "learning_rate": 1.6598064052230407e-05, + "loss": 0.1284, + "num_input_tokens_seen": 50499128, + "step": 87040 + }, + { + "epoch": 12.964700625558535, + "grad_norm": 1.2530465126037598, + "learning_rate": 1.659500370644378e-05, + "loss": 0.0011, + "num_input_tokens_seen": 50501816, + "step": 87045 + }, + { + "epoch": 12.965445338099494, + "grad_norm": 0.09653742611408234, + "learning_rate": 1.6591943502651025e-05, + "loss": 0.1319, + "num_input_tokens_seen": 50504984, + "step": 87050 + }, + { + "epoch": 12.966190050640453, + "grad_norm": 0.00021213600120972842, + "learning_rate": 1.6588883440903847e-05, + "loss": 0.1563, + "num_input_tokens_seen": 50507672, + "step": 87055 + }, + { + "epoch": 12.966934763181412, + "grad_norm": 0.001043919357471168, + "learning_rate": 1.6585823521253924e-05, + "loss": 0.0425, + "num_input_tokens_seen": 50510744, + "step": 87060 + }, + { + "epoch": 12.96767947572237, + "grad_norm": 0.003659999929368496, + "learning_rate": 1.6582763743752965e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50513368, + "step": 87065 + }, + { + "epoch": 12.96842418826333, + "grad_norm": 3.337789535522461, + "learning_rate": 1.6579704108452653e-05, + "loss": 0.0012, + "num_input_tokens_seen": 50516504, + "step": 87070 + }, + { + "epoch": 12.96916890080429, + "grad_norm": 0.001714933430776, + "learning_rate": 1.6576644615404686e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50519352, + "step": 87075 + }, + { + "epoch": 12.969913613345248, + "grad_norm": 0.0009438499691896141, + "learning_rate": 1.657358526466074e-05, + "loss": 0.1339, + "num_input_tokens_seen": 50522104, + "step": 87080 + }, + { + "epoch": 12.970658325886207, + "grad_norm": 21.401145935058594, + "learning_rate": 1.6570526056272516e-05, + "loss": 0.1379, + "num_input_tokens_seen": 50525080, + "step": 87085 + }, + { + "epoch": 12.971403038427168, + "grad_norm": 0.6839497685432434, + "learning_rate": 1.656746699029169e-05, + "loss": 0.0011, + "num_input_tokens_seen": 50527736, + "step": 87090 + }, + { + "epoch": 12.972147750968126, + "grad_norm": 0.05420093610882759, + "learning_rate": 1.6564408066769932e-05, + "loss": 0.0006, + "num_input_tokens_seen": 50530776, + "step": 87095 + }, + { + "epoch": 12.972892463509085, + "grad_norm": 0.0025460561737418175, + "learning_rate": 1.6561349285758932e-05, + "loss": 0.0631, + "num_input_tokens_seen": 50533944, + "step": 87100 + }, + { + "epoch": 12.973637176050044, + "grad_norm": 0.014695725403726101, + "learning_rate": 1.6558290647310347e-05, + "loss": 0.1025, + "num_input_tokens_seen": 50536728, + "step": 87105 + }, + { + "epoch": 12.974381888591004, + "grad_norm": 0.016145214438438416, + "learning_rate": 1.6555232151475872e-05, + "loss": 0.209, + "num_input_tokens_seen": 50539512, + "step": 87110 + }, + { + "epoch": 12.975126601131963, + "grad_norm": 71.54926300048828, + "learning_rate": 1.6552173798307157e-05, + "loss": 0.0084, + "num_input_tokens_seen": 50542392, + "step": 87115 + }, + { + "epoch": 12.975871313672922, + "grad_norm": 0.01563318818807602, + "learning_rate": 1.654911558785589e-05, + "loss": 0.3534, + "num_input_tokens_seen": 50545208, + "step": 87120 + }, + { + "epoch": 12.97661602621388, + "grad_norm": 15.241663932800293, + "learning_rate": 1.654605752017372e-05, + "loss": 0.0462, + "num_input_tokens_seen": 50547992, + "step": 87125 + }, + { + "epoch": 12.977360738754841, + "grad_norm": 0.004873394034802914, + "learning_rate": 1.6542999595312324e-05, + "loss": 0.1627, + "num_input_tokens_seen": 50550808, + "step": 87130 + }, + { + "epoch": 12.9781054512958, + "grad_norm": 0.0014380763750523329, + "learning_rate": 1.6539941813323353e-05, + "loss": 0.015, + "num_input_tokens_seen": 50553848, + "step": 87135 + }, + { + "epoch": 12.978850163836759, + "grad_norm": 0.0006199234630912542, + "learning_rate": 1.653688417425846e-05, + "loss": 0.0033, + "num_input_tokens_seen": 50556664, + "step": 87140 + }, + { + "epoch": 12.979594876377718, + "grad_norm": 0.07827471196651459, + "learning_rate": 1.653382667816931e-05, + "loss": 0.0004, + "num_input_tokens_seen": 50559352, + "step": 87145 + }, + { + "epoch": 12.980339588918678, + "grad_norm": 0.025798389688134193, + "learning_rate": 1.653076932510755e-05, + "loss": 0.0004, + "num_input_tokens_seen": 50562072, + "step": 87150 + }, + { + "epoch": 12.981084301459637, + "grad_norm": 0.004229824058711529, + "learning_rate": 1.652771211512484e-05, + "loss": 0.0882, + "num_input_tokens_seen": 50564984, + "step": 87155 + }, + { + "epoch": 12.981829014000596, + "grad_norm": 0.0003371595812495798, + "learning_rate": 1.652465504827282e-05, + "loss": 0.0839, + "num_input_tokens_seen": 50567864, + "step": 87160 + }, + { + "epoch": 12.982573726541554, + "grad_norm": 0.0007829719106666744, + "learning_rate": 1.6521598124603143e-05, + "loss": 0.0003, + "num_input_tokens_seen": 50571032, + "step": 87165 + }, + { + "epoch": 12.983318439082515, + "grad_norm": 0.01312517374753952, + "learning_rate": 1.651854134416745e-05, + "loss": 0.0029, + "num_input_tokens_seen": 50573880, + "step": 87170 + }, + { + "epoch": 12.984063151623474, + "grad_norm": 66.55436706542969, + "learning_rate": 1.651548470701737e-05, + "loss": 0.0373, + "num_input_tokens_seen": 50576792, + "step": 87175 + }, + { + "epoch": 12.984807864164432, + "grad_norm": 0.04701361060142517, + "learning_rate": 1.6512428213204564e-05, + "loss": 0.0089, + "num_input_tokens_seen": 50579768, + "step": 87180 + }, + { + "epoch": 12.985552576705391, + "grad_norm": 0.000801806862000376, + "learning_rate": 1.6509371862780644e-05, + "loss": 0.0005, + "num_input_tokens_seen": 50582712, + "step": 87185 + }, + { + "epoch": 12.986297289246352, + "grad_norm": 0.005224470980465412, + "learning_rate": 1.650631565579727e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50585784, + "step": 87190 + }, + { + "epoch": 12.98704200178731, + "grad_norm": 0.04782705008983612, + "learning_rate": 1.6503259592306053e-05, + "loss": 0.0083, + "num_input_tokens_seen": 50588472, + "step": 87195 + }, + { + "epoch": 12.98778671432827, + "grad_norm": 0.0012275540502741933, + "learning_rate": 1.650020367235864e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50591352, + "step": 87200 + }, + { + "epoch": 12.988531426869228, + "grad_norm": 0.008970639668405056, + "learning_rate": 1.649714789600663e-05, + "loss": 0.0233, + "num_input_tokens_seen": 50594200, + "step": 87205 + }, + { + "epoch": 12.989276139410187, + "grad_norm": 0.0010354385012760758, + "learning_rate": 1.649409226330168e-05, + "loss": 0.092, + "num_input_tokens_seen": 50596984, + "step": 87210 + }, + { + "epoch": 12.990020851951147, + "grad_norm": 0.001275761635042727, + "learning_rate": 1.6491036774295393e-05, + "loss": 0.0003, + "num_input_tokens_seen": 50600184, + "step": 87215 + }, + { + "epoch": 12.990765564492106, + "grad_norm": 0.004896854516118765, + "learning_rate": 1.6487981429039383e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50603192, + "step": 87220 + }, + { + "epoch": 12.991510277033065, + "grad_norm": 0.007399050984531641, + "learning_rate": 1.648492622758528e-05, + "loss": 0.0, + "num_input_tokens_seen": 50606232, + "step": 87225 + }, + { + "epoch": 12.992254989574025, + "grad_norm": 0.004505633842200041, + "learning_rate": 1.6481871169984696e-05, + "loss": 0.0, + "num_input_tokens_seen": 50608984, + "step": 87230 + }, + { + "epoch": 12.992999702114984, + "grad_norm": 0.009454218670725822, + "learning_rate": 1.647881625628924e-05, + "loss": 0.0005, + "num_input_tokens_seen": 50611928, + "step": 87235 + }, + { + "epoch": 12.993744414655943, + "grad_norm": 1.0144299268722534, + "learning_rate": 1.6475761486550516e-05, + "loss": 0.0027, + "num_input_tokens_seen": 50614968, + "step": 87240 + }, + { + "epoch": 12.994489127196902, + "grad_norm": 0.0010331503581255674, + "learning_rate": 1.6472706860820152e-05, + "loss": 0.0, + "num_input_tokens_seen": 50617624, + "step": 87245 + }, + { + "epoch": 12.99523383973786, + "grad_norm": 0.0005691754049621522, + "learning_rate": 1.6469652379149736e-05, + "loss": 0.0, + "num_input_tokens_seen": 50620344, + "step": 87250 + }, + { + "epoch": 12.995978552278821, + "grad_norm": 0.00017965608276426792, + "learning_rate": 1.6466598041590866e-05, + "loss": 0.0976, + "num_input_tokens_seen": 50623064, + "step": 87255 + }, + { + "epoch": 12.99672326481978, + "grad_norm": 0.018700011074543, + "learning_rate": 1.646354384819515e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50625912, + "step": 87260 + }, + { + "epoch": 12.997467977360738, + "grad_norm": 0.010241085663437843, + "learning_rate": 1.6460489799014188e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50628664, + "step": 87265 + }, + { + "epoch": 12.998212689901697, + "grad_norm": 33.2099494934082, + "learning_rate": 1.6457435894099575e-05, + "loss": 0.144, + "num_input_tokens_seen": 50631896, + "step": 87270 + }, + { + "epoch": 12.998957402442658, + "grad_norm": 0.00439877063035965, + "learning_rate": 1.645438213350289e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50634872, + "step": 87275 + }, + { + "epoch": 12.999702114983616, + "grad_norm": 0.003132509533315897, + "learning_rate": 1.645132851727574e-05, + "loss": 0.0, + "num_input_tokens_seen": 50637720, + "step": 87280 + }, + { + "epoch": 13.0, + "eval_loss": 2.076596736907959, + "eval_runtime": 51.2577, + "eval_samples_per_second": 58.216, + "eval_steps_per_second": 14.554, + "num_input_tokens_seen": 50638456, + "step": 87282 + }, + { + "epoch": 13.000446827524575, + "grad_norm": 0.0034419901203364134, + "learning_rate": 1.6448275045469702e-05, + "loss": 0.0, + "num_input_tokens_seen": 50640152, + "step": 87285 + }, + { + "epoch": 13.001191540065534, + "grad_norm": 0.00022647184960078448, + "learning_rate": 1.6445221718136376e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50643064, + "step": 87290 + }, + { + "epoch": 13.001936252606495, + "grad_norm": 0.0001880809577414766, + "learning_rate": 1.644216853532733e-05, + "loss": 0.0, + "num_input_tokens_seen": 50645688, + "step": 87295 + }, + { + "epoch": 13.002680965147453, + "grad_norm": 0.008154895156621933, + "learning_rate": 1.6439115497094137e-05, + "loss": 0.0, + "num_input_tokens_seen": 50648728, + "step": 87300 + }, + { + "epoch": 13.003425677688412, + "grad_norm": 0.0030214188154786825, + "learning_rate": 1.64360626034884e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50651576, + "step": 87305 + }, + { + "epoch": 13.00417039022937, + "grad_norm": 0.00027357606450095773, + "learning_rate": 1.6433009854561672e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50654360, + "step": 87310 + }, + { + "epoch": 13.004915102770331, + "grad_norm": 0.010692211799323559, + "learning_rate": 1.6429957250365547e-05, + "loss": 0.0012, + "num_input_tokens_seen": 50657176, + "step": 87315 + }, + { + "epoch": 13.00565981531129, + "grad_norm": 0.0006303550326265395, + "learning_rate": 1.6426904790951575e-05, + "loss": 0.0494, + "num_input_tokens_seen": 50659992, + "step": 87320 + }, + { + "epoch": 13.006404527852249, + "grad_norm": 0.004333070944994688, + "learning_rate": 1.642385247637134e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50663064, + "step": 87325 + }, + { + "epoch": 13.007149240393208, + "grad_norm": 0.0008513805805705488, + "learning_rate": 1.6420800306676397e-05, + "loss": 0.1345, + "num_input_tokens_seen": 50665816, + "step": 87330 + }, + { + "epoch": 13.007893952934168, + "grad_norm": 0.1528186798095703, + "learning_rate": 1.641774828191831e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50668792, + "step": 87335 + }, + { + "epoch": 13.008638665475127, + "grad_norm": 0.002160955686122179, + "learning_rate": 1.641469640214865e-05, + "loss": 0.0, + "num_input_tokens_seen": 50671576, + "step": 87340 + }, + { + "epoch": 13.009383378016086, + "grad_norm": 0.0019360944861546159, + "learning_rate": 1.6411644667418958e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50674616, + "step": 87345 + }, + { + "epoch": 13.010128090557044, + "grad_norm": 0.009804164990782738, + "learning_rate": 1.6408593077780808e-05, + "loss": 0.0012, + "num_input_tokens_seen": 50677752, + "step": 87350 + }, + { + "epoch": 13.010872803098005, + "grad_norm": 0.003853457048535347, + "learning_rate": 1.6405541633285748e-05, + "loss": 0.0835, + "num_input_tokens_seen": 50680664, + "step": 87355 + }, + { + "epoch": 13.011617515638964, + "grad_norm": 0.0031479394529014826, + "learning_rate": 1.6402490333985325e-05, + "loss": 0.0, + "num_input_tokens_seen": 50683224, + "step": 87360 + }, + { + "epoch": 13.012362228179922, + "grad_norm": 0.022073576226830482, + "learning_rate": 1.6399439179931087e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50686104, + "step": 87365 + }, + { + "epoch": 13.013106940720881, + "grad_norm": 0.005577464587986469, + "learning_rate": 1.6396388171174586e-05, + "loss": 0.0, + "num_input_tokens_seen": 50689144, + "step": 87370 + }, + { + "epoch": 13.013851653261842, + "grad_norm": 0.001061313203535974, + "learning_rate": 1.6393337307767364e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50692184, + "step": 87375 + }, + { + "epoch": 13.0145963658028, + "grad_norm": 0.0004044607048854232, + "learning_rate": 1.6390286589760957e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50695000, + "step": 87380 + }, + { + "epoch": 13.01534107834376, + "grad_norm": 0.0018743897089734674, + "learning_rate": 1.6387236017206908e-05, + "loss": 0.0228, + "num_input_tokens_seen": 50698072, + "step": 87385 + }, + { + "epoch": 13.016085790884718, + "grad_norm": 0.002689376240596175, + "learning_rate": 1.6384185590156752e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50700888, + "step": 87390 + }, + { + "epoch": 13.016830503425677, + "grad_norm": 0.005161574576050043, + "learning_rate": 1.6381135308662032e-05, + "loss": 0.0, + "num_input_tokens_seen": 50703672, + "step": 87395 + }, + { + "epoch": 13.017575215966637, + "grad_norm": 0.009474766440689564, + "learning_rate": 1.6378085172774258e-05, + "loss": 0.0588, + "num_input_tokens_seen": 50706776, + "step": 87400 + }, + { + "epoch": 13.018319928507596, + "grad_norm": 0.013057068921625614, + "learning_rate": 1.6375035182544983e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50709400, + "step": 87405 + }, + { + "epoch": 13.019064641048555, + "grad_norm": 0.0015409747138619423, + "learning_rate": 1.637198533802572e-05, + "loss": 0.0, + "num_input_tokens_seen": 50712248, + "step": 87410 + }, + { + "epoch": 13.019809353589514, + "grad_norm": 0.253399521112442, + "learning_rate": 1.636893563926799e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50715000, + "step": 87415 + }, + { + "epoch": 13.020554066130474, + "grad_norm": 0.14702974259853363, + "learning_rate": 1.6365886086323327e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50717624, + "step": 87420 + }, + { + "epoch": 13.021298778671433, + "grad_norm": 0.01018522772938013, + "learning_rate": 1.636283667924324e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50720472, + "step": 87425 + }, + { + "epoch": 13.022043491212392, + "grad_norm": 0.0009752410114742815, + "learning_rate": 1.6359787418079254e-05, + "loss": 0.0, + "num_input_tokens_seen": 50723160, + "step": 87430 + }, + { + "epoch": 13.02278820375335, + "grad_norm": 19.35694694519043, + "learning_rate": 1.6356738302882864e-05, + "loss": 0.0917, + "num_input_tokens_seen": 50725912, + "step": 87435 + }, + { + "epoch": 13.023532916294311, + "grad_norm": 0.01142184343189001, + "learning_rate": 1.6353689333705606e-05, + "loss": 0.0401, + "num_input_tokens_seen": 50728952, + "step": 87440 + }, + { + "epoch": 13.02427762883527, + "grad_norm": 0.0044629438780248165, + "learning_rate": 1.6350640510598974e-05, + "loss": 0.0, + "num_input_tokens_seen": 50731800, + "step": 87445 + }, + { + "epoch": 13.025022341376228, + "grad_norm": 0.0001285846665268764, + "learning_rate": 1.634759183361449e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50734776, + "step": 87450 + }, + { + "epoch": 13.025767053917187, + "grad_norm": 0.013380159623920918, + "learning_rate": 1.6344543302803643e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50737432, + "step": 87455 + }, + { + "epoch": 13.026511766458148, + "grad_norm": 0.0900954082608223, + "learning_rate": 1.6341494918217938e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50740920, + "step": 87460 + }, + { + "epoch": 13.027256478999107, + "grad_norm": 0.012016816064715385, + "learning_rate": 1.633844667990888e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50744152, + "step": 87465 + }, + { + "epoch": 13.028001191540065, + "grad_norm": 0.022397536784410477, + "learning_rate": 1.633539858792795e-05, + "loss": 0.0355, + "num_input_tokens_seen": 50747000, + "step": 87470 + }, + { + "epoch": 13.028745904081024, + "grad_norm": 0.0006425056490115821, + "learning_rate": 1.6332350642326673e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50750104, + "step": 87475 + }, + { + "epoch": 13.029490616621985, + "grad_norm": 1.1255583763122559, + "learning_rate": 1.6329302843156503e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50753400, + "step": 87480 + }, + { + "epoch": 13.030235329162943, + "grad_norm": 0.001955547137185931, + "learning_rate": 1.6326255190468965e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50756376, + "step": 87485 + }, + { + "epoch": 13.030980041703902, + "grad_norm": 0.00038363353814929724, + "learning_rate": 1.632320768431553e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50759864, + "step": 87490 + }, + { + "epoch": 13.03172475424486, + "grad_norm": 0.0010963677195832133, + "learning_rate": 1.6320160324747672e-05, + "loss": 0.039, + "num_input_tokens_seen": 50762552, + "step": 87495 + }, + { + "epoch": 13.032469466785821, + "grad_norm": 0.0006010960787534714, + "learning_rate": 1.631711311181689e-05, + "loss": 0.0035, + "num_input_tokens_seen": 50765304, + "step": 87500 + }, + { + "epoch": 13.03321417932678, + "grad_norm": 0.0026318407617509365, + "learning_rate": 1.631406604557465e-05, + "loss": 0.0, + "num_input_tokens_seen": 50768344, + "step": 87505 + }, + { + "epoch": 13.033958891867739, + "grad_norm": 0.0048803361132740974, + "learning_rate": 1.6311019126072447e-05, + "loss": 0.1082, + "num_input_tokens_seen": 50771256, + "step": 87510 + }, + { + "epoch": 13.034703604408698, + "grad_norm": 0.018001487478613853, + "learning_rate": 1.630797235336173e-05, + "loss": 0.1541, + "num_input_tokens_seen": 50774200, + "step": 87515 + }, + { + "epoch": 13.035448316949658, + "grad_norm": 0.05091017484664917, + "learning_rate": 1.6304925727493998e-05, + "loss": 0.1814, + "num_input_tokens_seen": 50777176, + "step": 87520 + }, + { + "epoch": 13.036193029490617, + "grad_norm": 0.0010593910701572895, + "learning_rate": 1.6301879248520707e-05, + "loss": 0.0, + "num_input_tokens_seen": 50780088, + "step": 87525 + }, + { + "epoch": 13.036937742031576, + "grad_norm": 0.0003127987147308886, + "learning_rate": 1.629883291649333e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50782936, + "step": 87530 + }, + { + "epoch": 13.037682454572534, + "grad_norm": 0.015288139693439007, + "learning_rate": 1.6295786731463324e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50785752, + "step": 87535 + }, + { + "epoch": 13.038427167113495, + "grad_norm": 0.21112418174743652, + "learning_rate": 1.6292740693482144e-05, + "loss": 0.0454, + "num_input_tokens_seen": 50788472, + "step": 87540 + }, + { + "epoch": 13.039171879654454, + "grad_norm": 0.0007011136622168124, + "learning_rate": 1.6289694802601273e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50791064, + "step": 87545 + }, + { + "epoch": 13.039916592195413, + "grad_norm": 6.243204593658447, + "learning_rate": 1.628664905887215e-05, + "loss": 0.0496, + "num_input_tokens_seen": 50793816, + "step": 87550 + }, + { + "epoch": 13.040661304736371, + "grad_norm": 0.002418211428448558, + "learning_rate": 1.6283603462346235e-05, + "loss": 0.0003, + "num_input_tokens_seen": 50796856, + "step": 87555 + }, + { + "epoch": 13.041406017277332, + "grad_norm": 0.009532691910862923, + "learning_rate": 1.628055801307498e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50799512, + "step": 87560 + }, + { + "epoch": 13.04215072981829, + "grad_norm": 0.7202498316764832, + "learning_rate": 1.6277512711109842e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50802360, + "step": 87565 + }, + { + "epoch": 13.04289544235925, + "grad_norm": 0.0008933661738410592, + "learning_rate": 1.627446755650226e-05, + "loss": 0.0, + "num_input_tokens_seen": 50805048, + "step": 87570 + }, + { + "epoch": 13.043640154900208, + "grad_norm": 0.01471196860074997, + "learning_rate": 1.627142254930367e-05, + "loss": 0.0, + "num_input_tokens_seen": 50807704, + "step": 87575 + }, + { + "epoch": 13.044384867441167, + "grad_norm": 0.011758171953260899, + "learning_rate": 1.6268377689565533e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50810328, + "step": 87580 + }, + { + "epoch": 13.045129579982127, + "grad_norm": 0.053944386541843414, + "learning_rate": 1.6265332977339282e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50813336, + "step": 87585 + }, + { + "epoch": 13.045874292523086, + "grad_norm": 0.025621602311730385, + "learning_rate": 1.6262288412676345e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50815992, + "step": 87590 + }, + { + "epoch": 13.046619005064045, + "grad_norm": 0.020800398662686348, + "learning_rate": 1.625924399562817e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50818936, + "step": 87595 + }, + { + "epoch": 13.047363717605004, + "grad_norm": 0.011361988261342049, + "learning_rate": 1.625619972624619e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50821496, + "step": 87600 + }, + { + "epoch": 13.048108430145964, + "grad_norm": 0.009082137607038021, + "learning_rate": 1.6253155604581817e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50824408, + "step": 87605 + }, + { + "epoch": 13.048853142686923, + "grad_norm": 0.0005735268350690603, + "learning_rate": 1.6250111630686498e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50827256, + "step": 87610 + }, + { + "epoch": 13.049597855227882, + "grad_norm": 0.012703288346529007, + "learning_rate": 1.6247067804611652e-05, + "loss": 0.0108, + "num_input_tokens_seen": 50830328, + "step": 87615 + }, + { + "epoch": 13.05034256776884, + "grad_norm": 0.09513884782791138, + "learning_rate": 1.6244024126408695e-05, + "loss": 0.0513, + "num_input_tokens_seen": 50833080, + "step": 87620 + }, + { + "epoch": 13.051087280309801, + "grad_norm": 0.05616231635212898, + "learning_rate": 1.6240980596129053e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50835960, + "step": 87625 + }, + { + "epoch": 13.05183199285076, + "grad_norm": 0.09428738802671432, + "learning_rate": 1.6237937213824134e-05, + "loss": 0.0416, + "num_input_tokens_seen": 50838808, + "step": 87630 + }, + { + "epoch": 13.052576705391719, + "grad_norm": 0.0007723908056505024, + "learning_rate": 1.623489397954537e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50841656, + "step": 87635 + }, + { + "epoch": 13.053321417932677, + "grad_norm": 0.0008910991018638015, + "learning_rate": 1.623185089334415e-05, + "loss": 0.0017, + "num_input_tokens_seen": 50844472, + "step": 87640 + }, + { + "epoch": 13.054066130473638, + "grad_norm": 6.654095341218635e-05, + "learning_rate": 1.6228807955271915e-05, + "loss": 0.2125, + "num_input_tokens_seen": 50847416, + "step": 87645 + }, + { + "epoch": 13.054810843014597, + "grad_norm": 0.0006758523522876203, + "learning_rate": 1.6225765165380046e-05, + "loss": 0.0, + "num_input_tokens_seen": 50850328, + "step": 87650 + }, + { + "epoch": 13.055555555555555, + "grad_norm": 0.0006048714276403189, + "learning_rate": 1.6222722523719963e-05, + "loss": 0.0, + "num_input_tokens_seen": 50853176, + "step": 87655 + }, + { + "epoch": 13.056300268096514, + "grad_norm": 0.0002710174594540149, + "learning_rate": 1.6219680030343063e-05, + "loss": 0.0057, + "num_input_tokens_seen": 50856184, + "step": 87660 + }, + { + "epoch": 13.057044980637475, + "grad_norm": 0.00012660115316975862, + "learning_rate": 1.6216637685300735e-05, + "loss": 0.0007, + "num_input_tokens_seen": 50859224, + "step": 87665 + }, + { + "epoch": 13.057789693178433, + "grad_norm": 0.000485716707771644, + "learning_rate": 1.6213595488644393e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50861848, + "step": 87670 + }, + { + "epoch": 13.058534405719392, + "grad_norm": 0.0018238615011796355, + "learning_rate": 1.6210553440425415e-05, + "loss": 0.0, + "num_input_tokens_seen": 50864856, + "step": 87675 + }, + { + "epoch": 13.059279118260351, + "grad_norm": 0.0003919219016097486, + "learning_rate": 1.6207511540695215e-05, + "loss": 0.0003, + "num_input_tokens_seen": 50868088, + "step": 87680 + }, + { + "epoch": 13.060023830801311, + "grad_norm": 0.0027708052657544613, + "learning_rate": 1.6204469789505165e-05, + "loss": 0.0, + "num_input_tokens_seen": 50870872, + "step": 87685 + }, + { + "epoch": 13.06076854334227, + "grad_norm": 0.008131559938192368, + "learning_rate": 1.620142818690667e-05, + "loss": 0.0, + "num_input_tokens_seen": 50873976, + "step": 87690 + }, + { + "epoch": 13.061513255883229, + "grad_norm": 0.0002980472054332495, + "learning_rate": 1.61983867329511e-05, + "loss": 0.0, + "num_input_tokens_seen": 50876888, + "step": 87695 + }, + { + "epoch": 13.062257968424188, + "grad_norm": 0.000489094469230622, + "learning_rate": 1.6195345427689826e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50879992, + "step": 87700 + }, + { + "epoch": 13.063002680965148, + "grad_norm": 0.015136078000068665, + "learning_rate": 1.6192304271174256e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50882776, + "step": 87705 + }, + { + "epoch": 13.063747393506107, + "grad_norm": 0.009688328020274639, + "learning_rate": 1.618926326345574e-05, + "loss": 0.4385, + "num_input_tokens_seen": 50885400, + "step": 87710 + }, + { + "epoch": 13.064492106047066, + "grad_norm": 0.0015170591650530696, + "learning_rate": 1.618622240458568e-05, + "loss": 0.0002, + "num_input_tokens_seen": 50888120, + "step": 87715 + }, + { + "epoch": 13.065236818588025, + "grad_norm": 0.0006535944412462413, + "learning_rate": 1.618318169461543e-05, + "loss": 0.0009, + "num_input_tokens_seen": 50891128, + "step": 87720 + }, + { + "epoch": 13.065981531128985, + "grad_norm": 40.01628875732422, + "learning_rate": 1.6180141133596367e-05, + "loss": 0.1882, + "num_input_tokens_seen": 50894040, + "step": 87725 + }, + { + "epoch": 13.066726243669944, + "grad_norm": 0.00017156281683128327, + "learning_rate": 1.6177100721579847e-05, + "loss": 0.006, + "num_input_tokens_seen": 50896664, + "step": 87730 + }, + { + "epoch": 13.067470956210903, + "grad_norm": 0.0043132109567523, + "learning_rate": 1.617406045861725e-05, + "loss": 0.0, + "num_input_tokens_seen": 50899800, + "step": 87735 + }, + { + "epoch": 13.068215668751861, + "grad_norm": 0.0004276877152733505, + "learning_rate": 1.6171020344759936e-05, + "loss": 0.0, + "num_input_tokens_seen": 50902776, + "step": 87740 + }, + { + "epoch": 13.06896038129282, + "grad_norm": 0.0002691273402888328, + "learning_rate": 1.616798038005925e-05, + "loss": 0.0, + "num_input_tokens_seen": 50905528, + "step": 87745 + }, + { + "epoch": 13.06970509383378, + "grad_norm": 0.0008610355434939265, + "learning_rate": 1.6164940564566566e-05, + "loss": 0.2637, + "num_input_tokens_seen": 50908408, + "step": 87750 + }, + { + "epoch": 13.07044980637474, + "grad_norm": 0.005566601641476154, + "learning_rate": 1.6161900898333225e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50911192, + "step": 87755 + }, + { + "epoch": 13.071194518915698, + "grad_norm": 0.021396776661276817, + "learning_rate": 1.6158861381410593e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50914232, + "step": 87760 + }, + { + "epoch": 13.071939231456657, + "grad_norm": 0.0005182484746910632, + "learning_rate": 1.6155822013850004e-05, + "loss": 0.0, + "num_input_tokens_seen": 50916952, + "step": 87765 + }, + { + "epoch": 13.072683943997617, + "grad_norm": 0.00017219400615431368, + "learning_rate": 1.615278279570282e-05, + "loss": 0.0033, + "num_input_tokens_seen": 50920056, + "step": 87770 + }, + { + "epoch": 13.073428656538576, + "grad_norm": 0.003707321360707283, + "learning_rate": 1.614974372702038e-05, + "loss": 0.1101, + "num_input_tokens_seen": 50923128, + "step": 87775 + }, + { + "epoch": 13.074173369079535, + "grad_norm": 0.01167973130941391, + "learning_rate": 1.6146704807854014e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50926136, + "step": 87780 + }, + { + "epoch": 13.074918081620494, + "grad_norm": 0.001475233118981123, + "learning_rate": 1.6143666038255084e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50929080, + "step": 87785 + }, + { + "epoch": 13.075662794161454, + "grad_norm": 0.016604246571660042, + "learning_rate": 1.614062741827491e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50932088, + "step": 87790 + }, + { + "epoch": 13.076407506702413, + "grad_norm": 5.3562107495963573e-05, + "learning_rate": 1.6137588947964838e-05, + "loss": 0.0, + "num_input_tokens_seen": 50935064, + "step": 87795 + }, + { + "epoch": 13.077152219243372, + "grad_norm": 145.61981201171875, + "learning_rate": 1.613455062737618e-05, + "loss": 0.1595, + "num_input_tokens_seen": 50938392, + "step": 87800 + }, + { + "epoch": 13.07789693178433, + "grad_norm": 0.00720706582069397, + "learning_rate": 1.613151245656029e-05, + "loss": 0.0004, + "num_input_tokens_seen": 50941208, + "step": 87805 + }, + { + "epoch": 13.078641644325291, + "grad_norm": 0.0004957220517098904, + "learning_rate": 1.612847443556847e-05, + "loss": 0.0, + "num_input_tokens_seen": 50943928, + "step": 87810 + }, + { + "epoch": 13.07938635686625, + "grad_norm": 0.011525270529091358, + "learning_rate": 1.6125436564452075e-05, + "loss": 0.0, + "num_input_tokens_seen": 50947032, + "step": 87815 + }, + { + "epoch": 13.080131069407209, + "grad_norm": 0.0008557035471312702, + "learning_rate": 1.6122398843262405e-05, + "loss": 0.0, + "num_input_tokens_seen": 50949816, + "step": 87820 + }, + { + "epoch": 13.080875781948167, + "grad_norm": 0.0006939615122973919, + "learning_rate": 1.6119361272050777e-05, + "loss": 0.0025, + "num_input_tokens_seen": 50952792, + "step": 87825 + }, + { + "epoch": 13.081620494489128, + "grad_norm": 0.0003587512474041432, + "learning_rate": 1.6116323850868526e-05, + "loss": 0.0, + "num_input_tokens_seen": 50955992, + "step": 87830 + }, + { + "epoch": 13.082365207030087, + "grad_norm": 0.0013234963407739997, + "learning_rate": 1.611328657976694e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50958840, + "step": 87835 + }, + { + "epoch": 13.083109919571045, + "grad_norm": 0.0007129675941541791, + "learning_rate": 1.6110249458797355e-05, + "loss": 0.0327, + "num_input_tokens_seen": 50961688, + "step": 87840 + }, + { + "epoch": 13.083854632112004, + "grad_norm": 0.00325764250010252, + "learning_rate": 1.610721248801106e-05, + "loss": 0.0, + "num_input_tokens_seen": 50964696, + "step": 87845 + }, + { + "epoch": 13.084599344652965, + "grad_norm": 0.014201907441020012, + "learning_rate": 1.610417566745938e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50967576, + "step": 87850 + }, + { + "epoch": 13.085344057193923, + "grad_norm": 0.009792809374630451, + "learning_rate": 1.6101138997193615e-05, + "loss": 0.0006, + "num_input_tokens_seen": 50970488, + "step": 87855 + }, + { + "epoch": 13.086088769734882, + "grad_norm": 0.002347995527088642, + "learning_rate": 1.6098102477265057e-05, + "loss": 0.0, + "num_input_tokens_seen": 50974040, + "step": 87860 + }, + { + "epoch": 13.086833482275841, + "grad_norm": 0.0015521359164267778, + "learning_rate": 1.6095066107725015e-05, + "loss": 0.0, + "num_input_tokens_seen": 50976952, + "step": 87865 + }, + { + "epoch": 13.087578194816802, + "grad_norm": 0.00027229878469370306, + "learning_rate": 1.609202988862477e-05, + "loss": 0.0, + "num_input_tokens_seen": 50980216, + "step": 87870 + }, + { + "epoch": 13.08832290735776, + "grad_norm": 0.0005564957391470671, + "learning_rate": 1.6088993820015634e-05, + "loss": 0.0, + "num_input_tokens_seen": 50983096, + "step": 87875 + }, + { + "epoch": 13.089067619898719, + "grad_norm": 0.002557294210419059, + "learning_rate": 1.608595790194889e-05, + "loss": 0.0, + "num_input_tokens_seen": 50986040, + "step": 87880 + }, + { + "epoch": 13.089812332439678, + "grad_norm": 0.005081031005829573, + "learning_rate": 1.6082922134475823e-05, + "loss": 0.0001, + "num_input_tokens_seen": 50988920, + "step": 87885 + }, + { + "epoch": 13.090557044980638, + "grad_norm": 0.01040416769683361, + "learning_rate": 1.6079886517647723e-05, + "loss": 0.0, + "num_input_tokens_seen": 50991832, + "step": 87890 + }, + { + "epoch": 13.091301757521597, + "grad_norm": 7.972842693328857, + "learning_rate": 1.6076851051515884e-05, + "loss": 0.0305, + "num_input_tokens_seen": 50995000, + "step": 87895 + }, + { + "epoch": 13.092046470062556, + "grad_norm": 0.0006742797559127212, + "learning_rate": 1.6073815736131577e-05, + "loss": 0.0, + "num_input_tokens_seen": 50997784, + "step": 87900 + }, + { + "epoch": 13.092791182603515, + "grad_norm": 13.559967041015625, + "learning_rate": 1.6070780571546066e-05, + "loss": 0.0479, + "num_input_tokens_seen": 51000632, + "step": 87905 + }, + { + "epoch": 13.093535895144473, + "grad_norm": 6.978231430053711, + "learning_rate": 1.6067745557810656e-05, + "loss": 0.0012, + "num_input_tokens_seen": 51003320, + "step": 87910 + }, + { + "epoch": 13.094280607685434, + "grad_norm": 0.000232565522310324, + "learning_rate": 1.60647106949766e-05, + "loss": 0.0, + "num_input_tokens_seen": 51006232, + "step": 87915 + }, + { + "epoch": 13.095025320226393, + "grad_norm": 0.015687955543398857, + "learning_rate": 1.6061675983095177e-05, + "loss": 0.004, + "num_input_tokens_seen": 51009080, + "step": 87920 + }, + { + "epoch": 13.095770032767351, + "grad_norm": 0.0017024055123329163, + "learning_rate": 1.605864142221765e-05, + "loss": 0.0, + "num_input_tokens_seen": 51011864, + "step": 87925 + }, + { + "epoch": 13.09651474530831, + "grad_norm": 0.004482814110815525, + "learning_rate": 1.60556070123953e-05, + "loss": 0.3233, + "num_input_tokens_seen": 51014808, + "step": 87930 + }, + { + "epoch": 13.09725945784927, + "grad_norm": 0.0014238397125154734, + "learning_rate": 1.6052572753679372e-05, + "loss": 0.0, + "num_input_tokens_seen": 51017880, + "step": 87935 + }, + { + "epoch": 13.09800417039023, + "grad_norm": 0.0013995793415233493, + "learning_rate": 1.604953864612113e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51020696, + "step": 87940 + }, + { + "epoch": 13.098748882931188, + "grad_norm": 0.0034664892591536045, + "learning_rate": 1.6046504689771842e-05, + "loss": 0.0, + "num_input_tokens_seen": 51023608, + "step": 87945 + }, + { + "epoch": 13.099493595472147, + "grad_norm": 0.019763099029660225, + "learning_rate": 1.6043470884682753e-05, + "loss": 0.0, + "num_input_tokens_seen": 51026232, + "step": 87950 + }, + { + "epoch": 13.100238308013108, + "grad_norm": 0.001389911980368197, + "learning_rate": 1.6040437230905126e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51029496, + "step": 87955 + }, + { + "epoch": 13.100983020554066, + "grad_norm": 0.0020251371897757053, + "learning_rate": 1.6037403728490193e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51032408, + "step": 87960 + }, + { + "epoch": 13.101727733095025, + "grad_norm": 0.0009458607528358698, + "learning_rate": 1.6034370377489227e-05, + "loss": 0.0005, + "num_input_tokens_seen": 51035224, + "step": 87965 + }, + { + "epoch": 13.102472445635984, + "grad_norm": 0.0011286772787570953, + "learning_rate": 1.6031337177953455e-05, + "loss": 0.0004, + "num_input_tokens_seen": 51038264, + "step": 87970 + }, + { + "epoch": 13.103217158176944, + "grad_norm": 0.021847886964678764, + "learning_rate": 1.602830412993413e-05, + "loss": 0.021, + "num_input_tokens_seen": 51041176, + "step": 87975 + }, + { + "epoch": 13.103961870717903, + "grad_norm": 0.0011715545551851392, + "learning_rate": 1.6025271233482492e-05, + "loss": 0.0, + "num_input_tokens_seen": 51044088, + "step": 87980 + }, + { + "epoch": 13.104706583258862, + "grad_norm": 0.004420379642397165, + "learning_rate": 1.6022238488649764e-05, + "loss": 0.3469, + "num_input_tokens_seen": 51047224, + "step": 87985 + }, + { + "epoch": 13.10545129579982, + "grad_norm": 0.005878714844584465, + "learning_rate": 1.6019205895487204e-05, + "loss": 0.0, + "num_input_tokens_seen": 51050040, + "step": 87990 + }, + { + "epoch": 13.106196008340781, + "grad_norm": 0.006313197780400515, + "learning_rate": 1.6016173454046018e-05, + "loss": 0.0, + "num_input_tokens_seen": 51053048, + "step": 87995 + }, + { + "epoch": 13.10694072088174, + "grad_norm": 0.003938291221857071, + "learning_rate": 1.6013141164377467e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51055864, + "step": 88000 + }, + { + "epoch": 13.107685433422699, + "grad_norm": 0.0026675157714635134, + "learning_rate": 1.6010109026532747e-05, + "loss": 0.0, + "num_input_tokens_seen": 51058776, + "step": 88005 + }, + { + "epoch": 13.108430145963657, + "grad_norm": 0.0030458965338766575, + "learning_rate": 1.600707704056311e-05, + "loss": 0.0977, + "num_input_tokens_seen": 51061752, + "step": 88010 + }, + { + "epoch": 13.109174858504618, + "grad_norm": 0.04168876260519028, + "learning_rate": 1.6004045206519763e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51064664, + "step": 88015 + }, + { + "epoch": 13.109919571045577, + "grad_norm": 0.00025873235426843166, + "learning_rate": 1.6001013524453928e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51067608, + "step": 88020 + }, + { + "epoch": 13.110664283586535, + "grad_norm": 1.0647810697555542, + "learning_rate": 1.599798199441683e-05, + "loss": 0.0092, + "num_input_tokens_seen": 51070200, + "step": 88025 + }, + { + "epoch": 13.111408996127494, + "grad_norm": 0.005368134006857872, + "learning_rate": 1.5994950616459664e-05, + "loss": 0.0034, + "num_input_tokens_seen": 51073080, + "step": 88030 + }, + { + "epoch": 13.112153708668455, + "grad_norm": 0.008685760200023651, + "learning_rate": 1.5991919390633662e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51076152, + "step": 88035 + }, + { + "epoch": 13.112898421209414, + "grad_norm": 0.007943988777697086, + "learning_rate": 1.5988888316990018e-05, + "loss": 0.0, + "num_input_tokens_seen": 51078872, + "step": 88040 + }, + { + "epoch": 13.113643133750372, + "grad_norm": 0.005955028813332319, + "learning_rate": 1.5985857395579963e-05, + "loss": 0.0003, + "num_input_tokens_seen": 51081624, + "step": 88045 + }, + { + "epoch": 13.114387846291331, + "grad_norm": 0.0005669934907928109, + "learning_rate": 1.5982826626454678e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51084440, + "step": 88050 + }, + { + "epoch": 13.115132558832292, + "grad_norm": 0.0153341768309474, + "learning_rate": 1.5979796009665376e-05, + "loss": 0.0424, + "num_input_tokens_seen": 51087224, + "step": 88055 + }, + { + "epoch": 13.11587727137325, + "grad_norm": 0.000544234411790967, + "learning_rate": 1.5976765545263254e-05, + "loss": 0.0, + "num_input_tokens_seen": 51090104, + "step": 88060 + }, + { + "epoch": 13.116621983914209, + "grad_norm": 0.004453164990991354, + "learning_rate": 1.5973735233299496e-05, + "loss": 0.0, + "num_input_tokens_seen": 51092984, + "step": 88065 + }, + { + "epoch": 13.117366696455168, + "grad_norm": 0.0011829586001113057, + "learning_rate": 1.5970705073825315e-05, + "loss": 0.0, + "num_input_tokens_seen": 51095704, + "step": 88070 + }, + { + "epoch": 13.118111408996128, + "grad_norm": 0.00023129994224291295, + "learning_rate": 1.596767506689189e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51098520, + "step": 88075 + }, + { + "epoch": 13.118856121537087, + "grad_norm": 0.007247552275657654, + "learning_rate": 1.5964645212550422e-05, + "loss": 0.0793, + "num_input_tokens_seen": 51101400, + "step": 88080 + }, + { + "epoch": 13.119600834078046, + "grad_norm": 0.0014377450570464134, + "learning_rate": 1.5961615510852083e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51104728, + "step": 88085 + }, + { + "epoch": 13.120345546619005, + "grad_norm": 0.013366551138460636, + "learning_rate": 1.5958585961848072e-05, + "loss": 0.0, + "num_input_tokens_seen": 51107576, + "step": 88090 + }, + { + "epoch": 13.121090259159963, + "grad_norm": 0.001038416987285018, + "learning_rate": 1.5955556565589564e-05, + "loss": 0.0, + "num_input_tokens_seen": 51110584, + "step": 88095 + }, + { + "epoch": 13.121834971700924, + "grad_norm": 103.04186248779297, + "learning_rate": 1.5952527322127718e-05, + "loss": 0.0287, + "num_input_tokens_seen": 51113432, + "step": 88100 + }, + { + "epoch": 13.122579684241883, + "grad_norm": 0.013356535695493221, + "learning_rate": 1.5949498231513744e-05, + "loss": 0.0, + "num_input_tokens_seen": 51116312, + "step": 88105 + }, + { + "epoch": 13.123324396782841, + "grad_norm": 3.561663834261708e-05, + "learning_rate": 1.5946469293798788e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51119288, + "step": 88110 + }, + { + "epoch": 13.1240691093238, + "grad_norm": 5.896623611450195, + "learning_rate": 1.5943440509034038e-05, + "loss": 0.0528, + "num_input_tokens_seen": 51122616, + "step": 88115 + }, + { + "epoch": 13.12481382186476, + "grad_norm": 0.0011887018335983157, + "learning_rate": 1.5940411877270655e-05, + "loss": 0.0036, + "num_input_tokens_seen": 51125272, + "step": 88120 + }, + { + "epoch": 13.12555853440572, + "grad_norm": 0.0047907233238220215, + "learning_rate": 1.5937383398559808e-05, + "loss": 0.0, + "num_input_tokens_seen": 51128248, + "step": 88125 + }, + { + "epoch": 13.126303246946678, + "grad_norm": 0.001060485141351819, + "learning_rate": 1.593435507295265e-05, + "loss": 0.0, + "num_input_tokens_seen": 51131384, + "step": 88130 + }, + { + "epoch": 13.127047959487637, + "grad_norm": 0.008775829337537289, + "learning_rate": 1.5931326900500353e-05, + "loss": 0.001, + "num_input_tokens_seen": 51134232, + "step": 88135 + }, + { + "epoch": 13.127792672028598, + "grad_norm": 0.0034378543496131897, + "learning_rate": 1.5928298881254077e-05, + "loss": 0.0005, + "num_input_tokens_seen": 51137080, + "step": 88140 + }, + { + "epoch": 13.128537384569556, + "grad_norm": 0.0029084125999361277, + "learning_rate": 1.5925271015264962e-05, + "loss": 0.0003, + "num_input_tokens_seen": 51140120, + "step": 88145 + }, + { + "epoch": 13.129282097110515, + "grad_norm": 0.007665906101465225, + "learning_rate": 1.5922243302584176e-05, + "loss": 0.0, + "num_input_tokens_seen": 51142904, + "step": 88150 + }, + { + "epoch": 13.130026809651474, + "grad_norm": 0.016668984666466713, + "learning_rate": 1.5919215743262862e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51145528, + "step": 88155 + }, + { + "epoch": 13.130771522192434, + "grad_norm": 0.0019902712665498257, + "learning_rate": 1.591618833735217e-05, + "loss": 0.0, + "num_input_tokens_seen": 51148440, + "step": 88160 + }, + { + "epoch": 13.131516234733393, + "grad_norm": 0.00026282324688509107, + "learning_rate": 1.5913161084903237e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51151512, + "step": 88165 + }, + { + "epoch": 13.132260947274352, + "grad_norm": 2.8984719392610714e-05, + "learning_rate": 1.591013398596722e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51154776, + "step": 88170 + }, + { + "epoch": 13.13300565981531, + "grad_norm": 0.0002840996312443167, + "learning_rate": 1.5907107040595255e-05, + "loss": 0.0, + "num_input_tokens_seen": 51157656, + "step": 88175 + }, + { + "epoch": 13.133750372356271, + "grad_norm": 0.0007368390797637403, + "learning_rate": 1.590408024883846e-05, + "loss": 0.0, + "num_input_tokens_seen": 51160664, + "step": 88180 + }, + { + "epoch": 13.13449508489723, + "grad_norm": 0.0028093024156987667, + "learning_rate": 1.5901053610747995e-05, + "loss": 0.0, + "num_input_tokens_seen": 51163384, + "step": 88185 + }, + { + "epoch": 13.135239797438189, + "grad_norm": 0.0018641515634953976, + "learning_rate": 1.5898027126374974e-05, + "loss": 0.0, + "num_input_tokens_seen": 51166200, + "step": 88190 + }, + { + "epoch": 13.135984509979147, + "grad_norm": 0.005197590682655573, + "learning_rate": 1.5895000795770547e-05, + "loss": 0.0, + "num_input_tokens_seen": 51169304, + "step": 88195 + }, + { + "epoch": 13.136729222520108, + "grad_norm": 0.0010024916846305132, + "learning_rate": 1.589197461898581e-05, + "loss": 0.0, + "num_input_tokens_seen": 51172408, + "step": 88200 + }, + { + "epoch": 13.137473935061067, + "grad_norm": 0.00500329677015543, + "learning_rate": 1.588894859607192e-05, + "loss": 0.0, + "num_input_tokens_seen": 51175224, + "step": 88205 + }, + { + "epoch": 13.138218647602026, + "grad_norm": 0.0013203835114836693, + "learning_rate": 1.5885922727079977e-05, + "loss": 0.0, + "num_input_tokens_seen": 51178296, + "step": 88210 + }, + { + "epoch": 13.138963360142984, + "grad_norm": 0.011472834274172783, + "learning_rate": 1.5882897012061104e-05, + "loss": 0.0, + "num_input_tokens_seen": 51181208, + "step": 88215 + }, + { + "epoch": 13.139708072683945, + "grad_norm": 0.0011723614297807217, + "learning_rate": 1.5879871451066424e-05, + "loss": 0.0, + "num_input_tokens_seen": 51184088, + "step": 88220 + }, + { + "epoch": 13.140452785224904, + "grad_norm": 0.0009264565305784345, + "learning_rate": 1.5876846044147048e-05, + "loss": 0.0, + "num_input_tokens_seen": 51186776, + "step": 88225 + }, + { + "epoch": 13.141197497765862, + "grad_norm": 0.0005572803202085197, + "learning_rate": 1.5873820791354085e-05, + "loss": 0.0, + "num_input_tokens_seen": 51189656, + "step": 88230 + }, + { + "epoch": 13.141942210306821, + "grad_norm": 3.9161495806183666e-05, + "learning_rate": 1.5870795692738635e-05, + "loss": 0.0, + "num_input_tokens_seen": 51192472, + "step": 88235 + }, + { + "epoch": 13.142686922847782, + "grad_norm": 1.1875524520874023, + "learning_rate": 1.5867770748351822e-05, + "loss": 0.0006, + "num_input_tokens_seen": 51195576, + "step": 88240 + }, + { + "epoch": 13.14343163538874, + "grad_norm": 0.009340121410787106, + "learning_rate": 1.586474595824474e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51198296, + "step": 88245 + }, + { + "epoch": 13.1441763479297, + "grad_norm": 0.0018177080200985074, + "learning_rate": 1.5861721322468487e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51201304, + "step": 88250 + }, + { + "epoch": 13.144921060470658, + "grad_norm": 0.0034595944453030825, + "learning_rate": 1.5858696841074166e-05, + "loss": 0.0, + "num_input_tokens_seen": 51204280, + "step": 88255 + }, + { + "epoch": 13.145665773011617, + "grad_norm": 0.0018776338547468185, + "learning_rate": 1.5855672514112876e-05, + "loss": 0.0, + "num_input_tokens_seen": 51207000, + "step": 88260 + }, + { + "epoch": 13.146410485552577, + "grad_norm": 0.0018867867765948176, + "learning_rate": 1.5852648341635705e-05, + "loss": 0.0085, + "num_input_tokens_seen": 51209848, + "step": 88265 + }, + { + "epoch": 13.147155198093536, + "grad_norm": 0.003170256968587637, + "learning_rate": 1.5849624323693736e-05, + "loss": 0.0, + "num_input_tokens_seen": 51212792, + "step": 88270 + }, + { + "epoch": 13.147899910634495, + "grad_norm": 0.0105271702632308, + "learning_rate": 1.5846600460338068e-05, + "loss": 0.0405, + "num_input_tokens_seen": 51215544, + "step": 88275 + }, + { + "epoch": 13.148644623175453, + "grad_norm": 0.02086811140179634, + "learning_rate": 1.584357675161978e-05, + "loss": 0.0017, + "num_input_tokens_seen": 51218360, + "step": 88280 + }, + { + "epoch": 13.149389335716414, + "grad_norm": 0.00025368345086462796, + "learning_rate": 1.5840553197589964e-05, + "loss": 0.0423, + "num_input_tokens_seen": 51221464, + "step": 88285 + }, + { + "epoch": 13.150134048257373, + "grad_norm": 0.0031836689449846745, + "learning_rate": 1.5837529798299677e-05, + "loss": 0.0306, + "num_input_tokens_seen": 51224152, + "step": 88290 + }, + { + "epoch": 13.150878760798332, + "grad_norm": 0.002131577581167221, + "learning_rate": 1.5834506553800026e-05, + "loss": 0.0097, + "num_input_tokens_seen": 51226904, + "step": 88295 + }, + { + "epoch": 13.15162347333929, + "grad_norm": 0.001064126379787922, + "learning_rate": 1.583148346414207e-05, + "loss": 0.0199, + "num_input_tokens_seen": 51229976, + "step": 88300 + }, + { + "epoch": 13.15236818588025, + "grad_norm": 0.034256964921951294, + "learning_rate": 1.5828460529376876e-05, + "loss": 0.0, + "num_input_tokens_seen": 51233016, + "step": 88305 + }, + { + "epoch": 13.15311289842121, + "grad_norm": 0.0008094434160739183, + "learning_rate": 1.5825437749555525e-05, + "loss": 0.0, + "num_input_tokens_seen": 51236024, + "step": 88310 + }, + { + "epoch": 13.153857610962168, + "grad_norm": 0.000830553995911032, + "learning_rate": 1.582241512472907e-05, + "loss": 0.0, + "num_input_tokens_seen": 51238680, + "step": 88315 + }, + { + "epoch": 13.154602323503127, + "grad_norm": 0.00022894804715178907, + "learning_rate": 1.58193926549486e-05, + "loss": 0.0004, + "num_input_tokens_seen": 51241624, + "step": 88320 + }, + { + "epoch": 13.155347036044088, + "grad_norm": 0.00010717545228544623, + "learning_rate": 1.5816370340265144e-05, + "loss": 0.0, + "num_input_tokens_seen": 51244760, + "step": 88325 + }, + { + "epoch": 13.156091748585046, + "grad_norm": 0.0909232497215271, + "learning_rate": 1.5813348180729788e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51247448, + "step": 88330 + }, + { + "epoch": 13.156836461126005, + "grad_norm": 0.01660744473338127, + "learning_rate": 1.5810326176393566e-05, + "loss": 0.0, + "num_input_tokens_seen": 51250520, + "step": 88335 + }, + { + "epoch": 13.157581173666964, + "grad_norm": 0.0007398162852041423, + "learning_rate": 1.5807304327307556e-05, + "loss": 0.1023, + "num_input_tokens_seen": 51253560, + "step": 88340 + }, + { + "epoch": 13.158325886207924, + "grad_norm": 0.0001000834017759189, + "learning_rate": 1.5804282633522793e-05, + "loss": 0.0, + "num_input_tokens_seen": 51256312, + "step": 88345 + }, + { + "epoch": 13.159070598748883, + "grad_norm": 0.00030075214453972876, + "learning_rate": 1.5801261095090325e-05, + "loss": 0.0, + "num_input_tokens_seen": 51259160, + "step": 88350 + }, + { + "epoch": 13.159815311289842, + "grad_norm": 0.0004373049596324563, + "learning_rate": 1.579823971206121e-05, + "loss": 0.0, + "num_input_tokens_seen": 51262008, + "step": 88355 + }, + { + "epoch": 13.1605600238308, + "grad_norm": 0.0002860345703084022, + "learning_rate": 1.5795218484486468e-05, + "loss": 0.0, + "num_input_tokens_seen": 51264728, + "step": 88360 + }, + { + "epoch": 13.161304736371761, + "grad_norm": 0.006238278932869434, + "learning_rate": 1.5792197412417167e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51267928, + "step": 88365 + }, + { + "epoch": 13.16204944891272, + "grad_norm": 0.004010607022792101, + "learning_rate": 1.578917649590432e-05, + "loss": 0.0, + "num_input_tokens_seen": 51270680, + "step": 88370 + }, + { + "epoch": 13.162794161453679, + "grad_norm": 7.420337200164795, + "learning_rate": 1.5786155734998988e-05, + "loss": 0.0023, + "num_input_tokens_seen": 51273688, + "step": 88375 + }, + { + "epoch": 13.163538873994638, + "grad_norm": 0.0005827685235999525, + "learning_rate": 1.578313512975219e-05, + "loss": 0.0, + "num_input_tokens_seen": 51276984, + "step": 88380 + }, + { + "epoch": 13.164283586535598, + "grad_norm": 0.00320300180464983, + "learning_rate": 1.5780114680214948e-05, + "loss": 0.2289, + "num_input_tokens_seen": 51279928, + "step": 88385 + }, + { + "epoch": 13.165028299076557, + "grad_norm": 0.0012589002726599574, + "learning_rate": 1.5777094386438306e-05, + "loss": 0.0, + "num_input_tokens_seen": 51283000, + "step": 88390 + }, + { + "epoch": 13.165773011617516, + "grad_norm": 0.016306860372424126, + "learning_rate": 1.577407424847327e-05, + "loss": 0.0, + "num_input_tokens_seen": 51285752, + "step": 88395 + }, + { + "epoch": 13.166517724158474, + "grad_norm": 0.0013724465388804674, + "learning_rate": 1.5771054266370882e-05, + "loss": 0.0, + "num_input_tokens_seen": 51288856, + "step": 88400 + }, + { + "epoch": 13.167262436699435, + "grad_norm": 0.0012747648870572448, + "learning_rate": 1.5768034440182143e-05, + "loss": 0.0, + "num_input_tokens_seen": 51291672, + "step": 88405 + }, + { + "epoch": 13.168007149240394, + "grad_norm": 0.00021003125584684312, + "learning_rate": 1.576501476995809e-05, + "loss": 0.0, + "num_input_tokens_seen": 51294360, + "step": 88410 + }, + { + "epoch": 13.168751861781352, + "grad_norm": 0.0002070987829938531, + "learning_rate": 1.576199525574972e-05, + "loss": 0.1319, + "num_input_tokens_seen": 51297176, + "step": 88415 + }, + { + "epoch": 13.169496574322311, + "grad_norm": 0.0008700246107764542, + "learning_rate": 1.5758975897608056e-05, + "loss": 0.0, + "num_input_tokens_seen": 51299736, + "step": 88420 + }, + { + "epoch": 13.17024128686327, + "grad_norm": 0.0002398190990788862, + "learning_rate": 1.57559566955841e-05, + "loss": 0.0, + "num_input_tokens_seen": 51302616, + "step": 88425 + }, + { + "epoch": 13.17098599940423, + "grad_norm": 0.002079608151689172, + "learning_rate": 1.5752937649728854e-05, + "loss": 0.0, + "num_input_tokens_seen": 51305304, + "step": 88430 + }, + { + "epoch": 13.17173071194519, + "grad_norm": 0.0003426706825848669, + "learning_rate": 1.574991876009334e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51308216, + "step": 88435 + }, + { + "epoch": 13.172475424486148, + "grad_norm": 0.00611090799793601, + "learning_rate": 1.574690002672853e-05, + "loss": 0.0, + "num_input_tokens_seen": 51311000, + "step": 88440 + }, + { + "epoch": 13.173220137027107, + "grad_norm": 0.0006197705515660346, + "learning_rate": 1.5743881449685456e-05, + "loss": 0.0, + "num_input_tokens_seen": 51314328, + "step": 88445 + }, + { + "epoch": 13.173964849568067, + "grad_norm": 0.003948068246245384, + "learning_rate": 1.574086302901509e-05, + "loss": 0.0, + "num_input_tokens_seen": 51317304, + "step": 88450 + }, + { + "epoch": 13.174709562109026, + "grad_norm": 0.00011256422294536605, + "learning_rate": 1.5737844764768437e-05, + "loss": 0.0036, + "num_input_tokens_seen": 51320536, + "step": 88455 + }, + { + "epoch": 13.175454274649985, + "grad_norm": 0.0008482892299070954, + "learning_rate": 1.5734826656996482e-05, + "loss": 0.341, + "num_input_tokens_seen": 51323352, + "step": 88460 + }, + { + "epoch": 13.176198987190944, + "grad_norm": 3.0818283557891846, + "learning_rate": 1.5731808705750206e-05, + "loss": 0.0003, + "num_input_tokens_seen": 51326072, + "step": 88465 + }, + { + "epoch": 13.176943699731904, + "grad_norm": 0.0035824698861688375, + "learning_rate": 1.5728790911080612e-05, + "loss": 0.0, + "num_input_tokens_seen": 51328952, + "step": 88470 + }, + { + "epoch": 13.177688412272863, + "grad_norm": 17.593381881713867, + "learning_rate": 1.5725773273038662e-05, + "loss": 0.1226, + "num_input_tokens_seen": 51331832, + "step": 88475 + }, + { + "epoch": 13.178433124813822, + "grad_norm": 5.541128120967187e-05, + "learning_rate": 1.5722755791675358e-05, + "loss": 0.1346, + "num_input_tokens_seen": 51334776, + "step": 88480 + }, + { + "epoch": 13.17917783735478, + "grad_norm": 0.0028655652422457933, + "learning_rate": 1.571973846704166e-05, + "loss": 0.0, + "num_input_tokens_seen": 51337432, + "step": 88485 + }, + { + "epoch": 13.17992254989574, + "grad_norm": 0.00013480008055921644, + "learning_rate": 1.5716721299188553e-05, + "loss": 0.0018, + "num_input_tokens_seen": 51340472, + "step": 88490 + }, + { + "epoch": 13.1806672624367, + "grad_norm": 0.0010091107105836272, + "learning_rate": 1.5713704288166998e-05, + "loss": 0.0, + "num_input_tokens_seen": 51343160, + "step": 88495 + }, + { + "epoch": 13.181411974977658, + "grad_norm": 0.0011378360213711858, + "learning_rate": 1.5710687434027976e-05, + "loss": 0.0, + "num_input_tokens_seen": 51345976, + "step": 88500 + }, + { + "epoch": 13.182156687518617, + "grad_norm": 0.0032726626377552748, + "learning_rate": 1.5707670736822448e-05, + "loss": 0.0051, + "num_input_tokens_seen": 51348600, + "step": 88505 + }, + { + "epoch": 13.182901400059578, + "grad_norm": 0.002615702338516712, + "learning_rate": 1.5704654196601375e-05, + "loss": 0.1037, + "num_input_tokens_seen": 51351512, + "step": 88510 + }, + { + "epoch": 13.183646112600536, + "grad_norm": 0.010397507809102535, + "learning_rate": 1.5701637813415728e-05, + "loss": 0.0004, + "num_input_tokens_seen": 51354456, + "step": 88515 + }, + { + "epoch": 13.184390825141495, + "grad_norm": 26.602706909179688, + "learning_rate": 1.5698621587316454e-05, + "loss": 0.1751, + "num_input_tokens_seen": 51357496, + "step": 88520 + }, + { + "epoch": 13.185135537682454, + "grad_norm": 0.0012213349109515548, + "learning_rate": 1.5695605518354524e-05, + "loss": 0.0647, + "num_input_tokens_seen": 51360312, + "step": 88525 + }, + { + "epoch": 13.185880250223414, + "grad_norm": 0.4737376272678375, + "learning_rate": 1.5692589606580866e-05, + "loss": 0.0015, + "num_input_tokens_seen": 51363064, + "step": 88530 + }, + { + "epoch": 13.186624962764373, + "grad_norm": 0.0444888211786747, + "learning_rate": 1.568957385204646e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51366072, + "step": 88535 + }, + { + "epoch": 13.187369675305332, + "grad_norm": 0.0009395650122314692, + "learning_rate": 1.568655825480224e-05, + "loss": 0.0, + "num_input_tokens_seen": 51368856, + "step": 88540 + }, + { + "epoch": 13.18811438784629, + "grad_norm": 201.31024169921875, + "learning_rate": 1.5683542814899144e-05, + "loss": 0.0387, + "num_input_tokens_seen": 51371544, + "step": 88545 + }, + { + "epoch": 13.188859100387251, + "grad_norm": 0.0013583260588347912, + "learning_rate": 1.5680527532388133e-05, + "loss": 0.0, + "num_input_tokens_seen": 51374360, + "step": 88550 + }, + { + "epoch": 13.18960381292821, + "grad_norm": 0.0009055964765138924, + "learning_rate": 1.5677512407320133e-05, + "loss": 0.0006, + "num_input_tokens_seen": 51377240, + "step": 88555 + }, + { + "epoch": 13.190348525469169, + "grad_norm": 0.0008772570872679353, + "learning_rate": 1.5674497439746088e-05, + "loss": 0.0, + "num_input_tokens_seen": 51379896, + "step": 88560 + }, + { + "epoch": 13.191093238010128, + "grad_norm": 0.0012011545477434993, + "learning_rate": 1.5671482629716926e-05, + "loss": 0.0018, + "num_input_tokens_seen": 51382968, + "step": 88565 + }, + { + "epoch": 13.191837950551088, + "grad_norm": 0.0005605078185908496, + "learning_rate": 1.566846797728359e-05, + "loss": 0.0355, + "num_input_tokens_seen": 51385976, + "step": 88570 + }, + { + "epoch": 13.192582663092047, + "grad_norm": 0.0012537803268060088, + "learning_rate": 1.5665453482497e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51388920, + "step": 88575 + }, + { + "epoch": 13.193327375633006, + "grad_norm": 0.001006527803838253, + "learning_rate": 1.5662439145408084e-05, + "loss": 0.0, + "num_input_tokens_seen": 51391640, + "step": 88580 + }, + { + "epoch": 13.194072088173964, + "grad_norm": 492.5390319824219, + "learning_rate": 1.5659424966067775e-05, + "loss": 0.0883, + "num_input_tokens_seen": 51394488, + "step": 88585 + }, + { + "epoch": 13.194816800714925, + "grad_norm": 0.0020044269040226936, + "learning_rate": 1.5656410944526984e-05, + "loss": 0.0, + "num_input_tokens_seen": 51397304, + "step": 88590 + }, + { + "epoch": 13.195561513255884, + "grad_norm": 0.00014562287833541632, + "learning_rate": 1.5653397080836633e-05, + "loss": 0.0006, + "num_input_tokens_seen": 51400344, + "step": 88595 + }, + { + "epoch": 13.196306225796842, + "grad_norm": 23.311613082885742, + "learning_rate": 1.5650383375047634e-05, + "loss": 0.0029, + "num_input_tokens_seen": 51403096, + "step": 88600 + }, + { + "epoch": 13.197050938337801, + "grad_norm": 1.1172902304679155e-05, + "learning_rate": 1.5647369827210917e-05, + "loss": 0.0, + "num_input_tokens_seen": 51405976, + "step": 88605 + }, + { + "epoch": 13.19779565087876, + "grad_norm": 0.00014037889195606112, + "learning_rate": 1.5644356437377373e-05, + "loss": 0.1725, + "num_input_tokens_seen": 51408792, + "step": 88610 + }, + { + "epoch": 13.19854036341972, + "grad_norm": 0.00015193165745586157, + "learning_rate": 1.5641343205597925e-05, + "loss": 0.0, + "num_input_tokens_seen": 51411800, + "step": 88615 + }, + { + "epoch": 13.19928507596068, + "grad_norm": 0.000421926291892305, + "learning_rate": 1.5638330131923465e-05, + "loss": 0.0376, + "num_input_tokens_seen": 51414712, + "step": 88620 + }, + { + "epoch": 13.200029788501638, + "grad_norm": 6.536224100273103e-05, + "learning_rate": 1.5635317216404906e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51417592, + "step": 88625 + }, + { + "epoch": 13.200774501042597, + "grad_norm": 0.010747872292995453, + "learning_rate": 1.5632304459093145e-05, + "loss": 0.0029, + "num_input_tokens_seen": 51420568, + "step": 88630 + }, + { + "epoch": 13.201519213583557, + "grad_norm": 0.05341038107872009, + "learning_rate": 1.562929186003907e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51423096, + "step": 88635 + }, + { + "epoch": 13.202263926124516, + "grad_norm": 2.1065301552880555e-05, + "learning_rate": 1.56262794192936e-05, + "loss": 0.0, + "num_input_tokens_seen": 51425816, + "step": 88640 + }, + { + "epoch": 13.203008638665475, + "grad_norm": 0.0005094577791169286, + "learning_rate": 1.5623267136907602e-05, + "loss": 0.0034, + "num_input_tokens_seen": 51428760, + "step": 88645 + }, + { + "epoch": 13.203753351206434, + "grad_norm": 0.00021063339954707772, + "learning_rate": 1.5620255012931984e-05, + "loss": 0.0, + "num_input_tokens_seen": 51431832, + "step": 88650 + }, + { + "epoch": 13.204498063747394, + "grad_norm": 0.00040014288970269263, + "learning_rate": 1.5617243047417614e-05, + "loss": 0.0, + "num_input_tokens_seen": 51434872, + "step": 88655 + }, + { + "epoch": 13.205242776288353, + "grad_norm": 0.0007388432277366519, + "learning_rate": 1.5614231240415393e-05, + "loss": 0.0616, + "num_input_tokens_seen": 51437752, + "step": 88660 + }, + { + "epoch": 13.205987488829312, + "grad_norm": 2.1809170246124268, + "learning_rate": 1.5611219591976198e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51440536, + "step": 88665 + }, + { + "epoch": 13.20673220137027, + "grad_norm": 0.0012843937147408724, + "learning_rate": 1.5608208102150895e-05, + "loss": 0.0, + "num_input_tokens_seen": 51443736, + "step": 88670 + }, + { + "epoch": 13.207476913911231, + "grad_norm": 0.0027780216187238693, + "learning_rate": 1.560519677099038e-05, + "loss": 0.0, + "num_input_tokens_seen": 51446808, + "step": 88675 + }, + { + "epoch": 13.20822162645219, + "grad_norm": 0.015643253922462463, + "learning_rate": 1.5602185598545515e-05, + "loss": 0.0016, + "num_input_tokens_seen": 51449880, + "step": 88680 + }, + { + "epoch": 13.208966338993148, + "grad_norm": 0.00024034401576500386, + "learning_rate": 1.5599174584867177e-05, + "loss": 0.0, + "num_input_tokens_seen": 51452696, + "step": 88685 + }, + { + "epoch": 13.209711051534107, + "grad_norm": 0.0003973853017669171, + "learning_rate": 1.5596163730006218e-05, + "loss": 0.0, + "num_input_tokens_seen": 51455672, + "step": 88690 + }, + { + "epoch": 13.210455764075068, + "grad_norm": 0.0063739558681845665, + "learning_rate": 1.5593153034013526e-05, + "loss": 0.0, + "num_input_tokens_seen": 51458424, + "step": 88695 + }, + { + "epoch": 13.211200476616026, + "grad_norm": 0.0012507332721725106, + "learning_rate": 1.5590142496939956e-05, + "loss": 0.0, + "num_input_tokens_seen": 51461528, + "step": 88700 + }, + { + "epoch": 13.211945189156985, + "grad_norm": 6.154894799692556e-05, + "learning_rate": 1.558713211883635e-05, + "loss": 0.0, + "num_input_tokens_seen": 51464248, + "step": 88705 + }, + { + "epoch": 13.212689901697944, + "grad_norm": 2.3266075004357845e-05, + "learning_rate": 1.5584121899753595e-05, + "loss": 0.0, + "num_input_tokens_seen": 51467128, + "step": 88710 + }, + { + "epoch": 13.213434614238905, + "grad_norm": 0.0001900256611406803, + "learning_rate": 1.558111183974252e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51470200, + "step": 88715 + }, + { + "epoch": 13.214179326779863, + "grad_norm": 0.00013027693785261363, + "learning_rate": 1.5578101938853994e-05, + "loss": 0.0122, + "num_input_tokens_seen": 51473208, + "step": 88720 + }, + { + "epoch": 13.214924039320822, + "grad_norm": 0.0001341466704616323, + "learning_rate": 1.5575092197138852e-05, + "loss": 0.0004, + "num_input_tokens_seen": 51475864, + "step": 88725 + }, + { + "epoch": 13.21566875186178, + "grad_norm": 0.0005498474929481745, + "learning_rate": 1.557208261464796e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51478552, + "step": 88730 + }, + { + "epoch": 13.216413464402741, + "grad_norm": 0.0008970362250693142, + "learning_rate": 1.556907319143214e-05, + "loss": 0.0014, + "num_input_tokens_seen": 51481400, + "step": 88735 + }, + { + "epoch": 13.2171581769437, + "grad_norm": 0.0017974823713302612, + "learning_rate": 1.5566063927542245e-05, + "loss": 0.0, + "num_input_tokens_seen": 51483960, + "step": 88740 + }, + { + "epoch": 13.217902889484659, + "grad_norm": 0.0019671383779495955, + "learning_rate": 1.5563054823029122e-05, + "loss": 0.0, + "num_input_tokens_seen": 51486904, + "step": 88745 + }, + { + "epoch": 13.218647602025618, + "grad_norm": 0.0002087063912767917, + "learning_rate": 1.5560045877943585e-05, + "loss": 0.0, + "num_input_tokens_seen": 51489592, + "step": 88750 + }, + { + "epoch": 13.219392314566578, + "grad_norm": 80.55364227294922, + "learning_rate": 1.5557037092336486e-05, + "loss": 0.4188, + "num_input_tokens_seen": 51492632, + "step": 88755 + }, + { + "epoch": 13.220137027107537, + "grad_norm": 0.002358797937631607, + "learning_rate": 1.5554028466258634e-05, + "loss": 0.0, + "num_input_tokens_seen": 51495576, + "step": 88760 + }, + { + "epoch": 13.220881739648496, + "grad_norm": 0.0017798064509406686, + "learning_rate": 1.5551019999760885e-05, + "loss": 0.0, + "num_input_tokens_seen": 51498424, + "step": 88765 + }, + { + "epoch": 13.221626452189454, + "grad_norm": 0.0003207075933460146, + "learning_rate": 1.554801169289404e-05, + "loss": 0.0, + "num_input_tokens_seen": 51501656, + "step": 88770 + }, + { + "epoch": 13.222371164730415, + "grad_norm": 0.000907443929463625, + "learning_rate": 1.554500354570894e-05, + "loss": 0.0, + "num_input_tokens_seen": 51504728, + "step": 88775 + }, + { + "epoch": 13.223115877271374, + "grad_norm": 0.0004762191674672067, + "learning_rate": 1.5541995558256394e-05, + "loss": 0.0, + "num_input_tokens_seen": 51507992, + "step": 88780 + }, + { + "epoch": 13.223860589812332, + "grad_norm": 0.0007307286141440272, + "learning_rate": 1.5538987730587217e-05, + "loss": 0.0, + "num_input_tokens_seen": 51510808, + "step": 88785 + }, + { + "epoch": 13.224605302353291, + "grad_norm": 5.118368790135719e-05, + "learning_rate": 1.553598006275223e-05, + "loss": 0.0, + "num_input_tokens_seen": 51513848, + "step": 88790 + }, + { + "epoch": 13.22535001489425, + "grad_norm": 0.0002087106986436993, + "learning_rate": 1.5532972554802232e-05, + "loss": 0.0, + "num_input_tokens_seen": 51516568, + "step": 88795 + }, + { + "epoch": 13.22609472743521, + "grad_norm": 0.0002907028829213232, + "learning_rate": 1.552996520678805e-05, + "loss": 0.0, + "num_input_tokens_seen": 51519448, + "step": 88800 + }, + { + "epoch": 13.22683943997617, + "grad_norm": 0.004552000667899847, + "learning_rate": 1.5526958018760473e-05, + "loss": 0.0, + "num_input_tokens_seen": 51522328, + "step": 88805 + }, + { + "epoch": 13.227584152517128, + "grad_norm": 6.964908970985562e-05, + "learning_rate": 1.552395099077032e-05, + "loss": 0.0117, + "num_input_tokens_seen": 51525464, + "step": 88810 + }, + { + "epoch": 13.228328865058087, + "grad_norm": 0.00023367426183540374, + "learning_rate": 1.552094412286838e-05, + "loss": 0.0174, + "num_input_tokens_seen": 51528344, + "step": 88815 + }, + { + "epoch": 13.229073577599047, + "grad_norm": 7.66784287407063e-05, + "learning_rate": 1.551793741510546e-05, + "loss": 0.0, + "num_input_tokens_seen": 51531384, + "step": 88820 + }, + { + "epoch": 13.229818290140006, + "grad_norm": 0.0029521831311285496, + "learning_rate": 1.5514930867532352e-05, + "loss": 0.0, + "num_input_tokens_seen": 51534264, + "step": 88825 + }, + { + "epoch": 13.230563002680965, + "grad_norm": 0.4266112446784973, + "learning_rate": 1.5511924480199836e-05, + "loss": 0.1262, + "num_input_tokens_seen": 51537048, + "step": 88830 + }, + { + "epoch": 13.231307715221924, + "grad_norm": 0.0034780940040946007, + "learning_rate": 1.550891825315872e-05, + "loss": 0.0, + "num_input_tokens_seen": 51539992, + "step": 88835 + }, + { + "epoch": 13.232052427762884, + "grad_norm": 8.532864740118384e-05, + "learning_rate": 1.5505912186459775e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51543032, + "step": 88840 + }, + { + "epoch": 13.232797140303843, + "grad_norm": 0.002773078391328454, + "learning_rate": 1.5502906280153806e-05, + "loss": 0.0, + "num_input_tokens_seen": 51546104, + "step": 88845 + }, + { + "epoch": 13.233541852844802, + "grad_norm": 0.000134805086418055, + "learning_rate": 1.5499900534291575e-05, + "loss": 0.0, + "num_input_tokens_seen": 51549144, + "step": 88850 + }, + { + "epoch": 13.23428656538576, + "grad_norm": 21.52448272705078, + "learning_rate": 1.5496894948923873e-05, + "loss": 0.0108, + "num_input_tokens_seen": 51552056, + "step": 88855 + }, + { + "epoch": 13.235031277926721, + "grad_norm": 8.686516957823187e-05, + "learning_rate": 1.5493889524101467e-05, + "loss": 0.0, + "num_input_tokens_seen": 51554872, + "step": 88860 + }, + { + "epoch": 13.23577599046768, + "grad_norm": 1.042164921760559, + "learning_rate": 1.5490884259875143e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51557912, + "step": 88865 + }, + { + "epoch": 13.236520703008638, + "grad_norm": 0.00045483410940505564, + "learning_rate": 1.5487879156295665e-05, + "loss": 0.0, + "num_input_tokens_seen": 51561048, + "step": 88870 + }, + { + "epoch": 13.237265415549597, + "grad_norm": 0.00010585832933429629, + "learning_rate": 1.548487421341379e-05, + "loss": 0.0, + "num_input_tokens_seen": 51564152, + "step": 88875 + }, + { + "epoch": 13.238010128090558, + "grad_norm": 0.0029542469419538975, + "learning_rate": 1.548186943128031e-05, + "loss": 0.2031, + "num_input_tokens_seen": 51566872, + "step": 88880 + }, + { + "epoch": 13.238754840631517, + "grad_norm": 0.0007636593072675169, + "learning_rate": 1.5478864809945965e-05, + "loss": 0.1407, + "num_input_tokens_seen": 51569624, + "step": 88885 + }, + { + "epoch": 13.239499553172475, + "grad_norm": 0.0013591375900432467, + "learning_rate": 1.5475860349461524e-05, + "loss": 0.0, + "num_input_tokens_seen": 51572568, + "step": 88890 + }, + { + "epoch": 13.240244265713434, + "grad_norm": 0.0005628740764223039, + "learning_rate": 1.5472856049877733e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51575416, + "step": 88895 + }, + { + "epoch": 13.240988978254395, + "grad_norm": 9.241462248610333e-05, + "learning_rate": 1.5469851911245368e-05, + "loss": 0.2095, + "num_input_tokens_seen": 51578328, + "step": 88900 + }, + { + "epoch": 13.241733690795353, + "grad_norm": 0.19352787733078003, + "learning_rate": 1.5466847933615165e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51581048, + "step": 88905 + }, + { + "epoch": 13.242478403336312, + "grad_norm": 0.00035447656409814954, + "learning_rate": 1.5463844117037872e-05, + "loss": 0.0, + "num_input_tokens_seen": 51583672, + "step": 88910 + }, + { + "epoch": 13.24322311587727, + "grad_norm": 0.0008591762743890285, + "learning_rate": 1.5460840461564247e-05, + "loss": 0.0, + "num_input_tokens_seen": 51586744, + "step": 88915 + }, + { + "epoch": 13.243967828418231, + "grad_norm": 1.642347160668578e-05, + "learning_rate": 1.5457836967245027e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51589368, + "step": 88920 + }, + { + "epoch": 13.24471254095919, + "grad_norm": 0.0018403829308226705, + "learning_rate": 1.5454833634130955e-05, + "loss": 0.0, + "num_input_tokens_seen": 51592472, + "step": 88925 + }, + { + "epoch": 13.245457253500149, + "grad_norm": 0.00011080219701398164, + "learning_rate": 1.5451830462272753e-05, + "loss": 0.0005, + "num_input_tokens_seen": 51595192, + "step": 88930 + }, + { + "epoch": 13.246201966041108, + "grad_norm": 0.005105331540107727, + "learning_rate": 1.5448827451721188e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51598616, + "step": 88935 + }, + { + "epoch": 13.246946678582066, + "grad_norm": 0.00046988253598101437, + "learning_rate": 1.5445824602526966e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51601528, + "step": 88940 + }, + { + "epoch": 13.247691391123027, + "grad_norm": 0.0028898310847580433, + "learning_rate": 1.5442821914740836e-05, + "loss": 0.0883, + "num_input_tokens_seen": 51604728, + "step": 88945 + }, + { + "epoch": 13.248436103663986, + "grad_norm": 2.489116907119751, + "learning_rate": 1.543981938841351e-05, + "loss": 0.3048, + "num_input_tokens_seen": 51607640, + "step": 88950 + }, + { + "epoch": 13.249180816204944, + "grad_norm": 0.0003970162651967257, + "learning_rate": 1.5436817023595716e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51610776, + "step": 88955 + }, + { + "epoch": 13.249925528745903, + "grad_norm": 0.0007587976870127022, + "learning_rate": 1.543381482033819e-05, + "loss": 0.0, + "num_input_tokens_seen": 51613912, + "step": 88960 + }, + { + "epoch": 13.250670241286864, + "grad_norm": 0.01498606987297535, + "learning_rate": 1.5430812778691626e-05, + "loss": 0.0, + "num_input_tokens_seen": 51616568, + "step": 88965 + }, + { + "epoch": 13.251414953827823, + "grad_norm": 0.005003286991268396, + "learning_rate": 1.5427810898706764e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51619480, + "step": 88970 + }, + { + "epoch": 13.252159666368781, + "grad_norm": 0.0006842021830379963, + "learning_rate": 1.54248091804343e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51622520, + "step": 88975 + }, + { + "epoch": 13.25290437890974, + "grad_norm": 0.0002713638823479414, + "learning_rate": 1.5421807623924968e-05, + "loss": 0.0, + "num_input_tokens_seen": 51625656, + "step": 88980 + }, + { + "epoch": 13.2536490914507, + "grad_norm": 0.10789591073989868, + "learning_rate": 1.5418806229229452e-05, + "loss": 0.0051, + "num_input_tokens_seen": 51628344, + "step": 88985 + }, + { + "epoch": 13.25439380399166, + "grad_norm": 0.0009329282329417765, + "learning_rate": 1.541580499639846e-05, + "loss": 0.0, + "num_input_tokens_seen": 51631480, + "step": 88990 + }, + { + "epoch": 13.255138516532618, + "grad_norm": 0.0001121704772231169, + "learning_rate": 1.541280392548271e-05, + "loss": 0.2531, + "num_input_tokens_seen": 51634424, + "step": 88995 + }, + { + "epoch": 13.255883229073577, + "grad_norm": 0.011326493695378304, + "learning_rate": 1.5409803016532888e-05, + "loss": 0.0, + "num_input_tokens_seen": 51637176, + "step": 89000 + }, + { + "epoch": 13.256627941614537, + "grad_norm": 0.005158075597137213, + "learning_rate": 1.5406802269599703e-05, + "loss": 0.0, + "num_input_tokens_seen": 51640088, + "step": 89005 + }, + { + "epoch": 13.257372654155496, + "grad_norm": 0.30574682354927063, + "learning_rate": 1.540380168473384e-05, + "loss": 0.0006, + "num_input_tokens_seen": 51642872, + "step": 89010 + }, + { + "epoch": 13.258117366696455, + "grad_norm": 9.459244756726548e-05, + "learning_rate": 1.5400801261986e-05, + "loss": 0.0, + "num_input_tokens_seen": 51645560, + "step": 89015 + }, + { + "epoch": 13.258862079237414, + "grad_norm": 0.0009745617862790823, + "learning_rate": 1.5397801001406857e-05, + "loss": 0.0, + "num_input_tokens_seen": 51648216, + "step": 89020 + }, + { + "epoch": 13.259606791778374, + "grad_norm": 0.00010441967606311664, + "learning_rate": 1.5394800903047114e-05, + "loss": 0.0003, + "num_input_tokens_seen": 51651320, + "step": 89025 + }, + { + "epoch": 13.260351504319333, + "grad_norm": 0.0021384379360824823, + "learning_rate": 1.5391800966957448e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51654072, + "step": 89030 + }, + { + "epoch": 13.261096216860292, + "grad_norm": 34.11234664916992, + "learning_rate": 1.538880119318853e-05, + "loss": 0.0983, + "num_input_tokens_seen": 51657336, + "step": 89035 + }, + { + "epoch": 13.26184092940125, + "grad_norm": 1.9073346265940927e-05, + "learning_rate": 1.538580158179106e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51660248, + "step": 89040 + }, + { + "epoch": 13.262585641942211, + "grad_norm": 0.00012497259012889117, + "learning_rate": 1.5382802132815694e-05, + "loss": 0.0057, + "num_input_tokens_seen": 51663160, + "step": 89045 + }, + { + "epoch": 13.26333035448317, + "grad_norm": 0.0016500004567205906, + "learning_rate": 1.5379802846313115e-05, + "loss": 0.0, + "num_input_tokens_seen": 51665944, + "step": 89050 + }, + { + "epoch": 13.264075067024129, + "grad_norm": 0.0005693643470294774, + "learning_rate": 1.5376803722333983e-05, + "loss": 0.0, + "num_input_tokens_seen": 51669016, + "step": 89055 + }, + { + "epoch": 13.264819779565087, + "grad_norm": 52.50261306762695, + "learning_rate": 1.5373804760928978e-05, + "loss": 0.0388, + "num_input_tokens_seen": 51671960, + "step": 89060 + }, + { + "epoch": 13.265564492106048, + "grad_norm": 3.8431822758866474e-05, + "learning_rate": 1.537080596214876e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51674776, + "step": 89065 + }, + { + "epoch": 13.266309204647007, + "grad_norm": 5.575897693634033, + "learning_rate": 1.5367807326043976e-05, + "loss": 0.1534, + "num_input_tokens_seen": 51677624, + "step": 89070 + }, + { + "epoch": 13.267053917187965, + "grad_norm": 0.0004939649952575564, + "learning_rate": 1.5364808852665307e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51680376, + "step": 89075 + }, + { + "epoch": 13.267798629728924, + "grad_norm": 0.0020848147105425596, + "learning_rate": 1.53618105420634e-05, + "loss": 0.0, + "num_input_tokens_seen": 51683128, + "step": 89080 + }, + { + "epoch": 13.268543342269885, + "grad_norm": 0.009378244169056416, + "learning_rate": 1.5358812394288906e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51686040, + "step": 89085 + }, + { + "epoch": 13.269288054810843, + "grad_norm": 0.0013470043195411563, + "learning_rate": 1.5355814409392475e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51688920, + "step": 89090 + }, + { + "epoch": 13.270032767351802, + "grad_norm": 0.005225462839007378, + "learning_rate": 1.5352816587424762e-05, + "loss": 0.0, + "num_input_tokens_seen": 51691896, + "step": 89095 + }, + { + "epoch": 13.270777479892761, + "grad_norm": 0.00432712584733963, + "learning_rate": 1.53498189284364e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51694776, + "step": 89100 + }, + { + "epoch": 13.271522192433721, + "grad_norm": 0.0008840931695885956, + "learning_rate": 1.534682143247805e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51697560, + "step": 89105 + }, + { + "epoch": 13.27226690497468, + "grad_norm": 0.026457639411091805, + "learning_rate": 1.534382409960034e-05, + "loss": 0.1292, + "num_input_tokens_seen": 51700344, + "step": 89110 + }, + { + "epoch": 13.273011617515639, + "grad_norm": 1.243341088294983, + "learning_rate": 1.5340826929853903e-05, + "loss": 0.0011, + "num_input_tokens_seen": 51702904, + "step": 89115 + }, + { + "epoch": 13.273756330056598, + "grad_norm": 0.0012577935121953487, + "learning_rate": 1.5337829923289382e-05, + "loss": 0.2407, + "num_input_tokens_seen": 51705496, + "step": 89120 + }, + { + "epoch": 13.274501042597556, + "grad_norm": 0.16763180494308472, + "learning_rate": 1.5334833079957394e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51708344, + "step": 89125 + }, + { + "epoch": 13.275245755138517, + "grad_norm": 0.028446413576602936, + "learning_rate": 1.5331836399908588e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51711032, + "step": 89130 + }, + { + "epoch": 13.275990467679476, + "grad_norm": 0.00012101225729566067, + "learning_rate": 1.5328839883193575e-05, + "loss": 0.0, + "num_input_tokens_seen": 51713976, + "step": 89135 + }, + { + "epoch": 13.276735180220435, + "grad_norm": 0.00024903524899855256, + "learning_rate": 1.5325843529862987e-05, + "loss": 0.0, + "num_input_tokens_seen": 51716824, + "step": 89140 + }, + { + "epoch": 13.277479892761393, + "grad_norm": 0.002596293343231082, + "learning_rate": 1.532284733996744e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51719736, + "step": 89145 + }, + { + "epoch": 13.278224605302354, + "grad_norm": 0.0030758713837713003, + "learning_rate": 1.5319851313557548e-05, + "loss": 0.0, + "num_input_tokens_seen": 51722712, + "step": 89150 + }, + { + "epoch": 13.278969317843313, + "grad_norm": 0.015510699711740017, + "learning_rate": 1.5316855450683937e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51725784, + "step": 89155 + }, + { + "epoch": 13.279714030384271, + "grad_norm": 0.027826962992548943, + "learning_rate": 1.53138597513972e-05, + "loss": 0.0, + "num_input_tokens_seen": 51728728, + "step": 89160 + }, + { + "epoch": 13.28045874292523, + "grad_norm": 0.03713950887322426, + "learning_rate": 1.5310864215747966e-05, + "loss": 0.0015, + "num_input_tokens_seen": 51731576, + "step": 89165 + }, + { + "epoch": 13.28120345546619, + "grad_norm": 0.004477124195545912, + "learning_rate": 1.5307868843786828e-05, + "loss": 0.0004, + "num_input_tokens_seen": 51734616, + "step": 89170 + }, + { + "epoch": 13.28194816800715, + "grad_norm": 3.9671530723571777, + "learning_rate": 1.53048736355644e-05, + "loss": 0.0008, + "num_input_tokens_seen": 51737624, + "step": 89175 + }, + { + "epoch": 13.282692880548108, + "grad_norm": 0.010364017449319363, + "learning_rate": 1.5301878591131273e-05, + "loss": 0.0975, + "num_input_tokens_seen": 51740568, + "step": 89180 + }, + { + "epoch": 13.283437593089067, + "grad_norm": 0.0009146661614067852, + "learning_rate": 1.529888371053806e-05, + "loss": 0.0, + "num_input_tokens_seen": 51743384, + "step": 89185 + }, + { + "epoch": 13.284182305630027, + "grad_norm": 0.0010068862466141582, + "learning_rate": 1.5295888993835345e-05, + "loss": 0.0, + "num_input_tokens_seen": 51746328, + "step": 89190 + }, + { + "epoch": 13.284927018170986, + "grad_norm": 0.005353680346161127, + "learning_rate": 1.5292894441073712e-05, + "loss": 0.0, + "num_input_tokens_seen": 51749720, + "step": 89195 + }, + { + "epoch": 13.285671730711945, + "grad_norm": 0.005316535942256451, + "learning_rate": 1.5289900052303774e-05, + "loss": 0.0, + "num_input_tokens_seen": 51752344, + "step": 89200 + }, + { + "epoch": 13.286416443252904, + "grad_norm": 0.0001003161451080814, + "learning_rate": 1.5286905827576094e-05, + "loss": 0.1719, + "num_input_tokens_seen": 51755160, + "step": 89205 + }, + { + "epoch": 13.287161155793864, + "grad_norm": 0.001388954813592136, + "learning_rate": 1.5283911766941277e-05, + "loss": 0.0, + "num_input_tokens_seen": 51757912, + "step": 89210 + }, + { + "epoch": 13.287905868334823, + "grad_norm": 0.19723735749721527, + "learning_rate": 1.528091787044989e-05, + "loss": 0.0006, + "num_input_tokens_seen": 51760920, + "step": 89215 + }, + { + "epoch": 13.288650580875782, + "grad_norm": 5.6664323806762695, + "learning_rate": 1.5277924138152528e-05, + "loss": 0.0047, + "num_input_tokens_seen": 51763928, + "step": 89220 + }, + { + "epoch": 13.28939529341674, + "grad_norm": 0.015412078239023685, + "learning_rate": 1.527493057009975e-05, + "loss": 0.0, + "num_input_tokens_seen": 51767032, + "step": 89225 + }, + { + "epoch": 13.290140005957701, + "grad_norm": 9.493506513535976e-05, + "learning_rate": 1.5271937166342132e-05, + "loss": 0.0, + "num_input_tokens_seen": 51769432, + "step": 89230 + }, + { + "epoch": 13.29088471849866, + "grad_norm": 0.0014366073301061988, + "learning_rate": 1.526894392693025e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51772664, + "step": 89235 + }, + { + "epoch": 13.291629431039619, + "grad_norm": 0.0011291987029835582, + "learning_rate": 1.5265950851914668e-05, + "loss": 0.0003, + "num_input_tokens_seen": 51775576, + "step": 89240 + }, + { + "epoch": 13.292374143580577, + "grad_norm": 0.003348050406202674, + "learning_rate": 1.526295794134596e-05, + "loss": 0.0, + "num_input_tokens_seen": 51778552, + "step": 89245 + }, + { + "epoch": 13.293118856121538, + "grad_norm": 0.001027395948767662, + "learning_rate": 1.5259965195274678e-05, + "loss": 0.0645, + "num_input_tokens_seen": 51781336, + "step": 89250 + }, + { + "epoch": 13.293863568662497, + "grad_norm": 0.0007579508237540722, + "learning_rate": 1.5256972613751386e-05, + "loss": 0.0, + "num_input_tokens_seen": 51784056, + "step": 89255 + }, + { + "epoch": 13.294608281203455, + "grad_norm": 69.09147644042969, + "learning_rate": 1.5253980196826634e-05, + "loss": 0.1512, + "num_input_tokens_seen": 51787000, + "step": 89260 + }, + { + "epoch": 13.295352993744414, + "grad_norm": 0.009284496307373047, + "learning_rate": 1.5250987944550988e-05, + "loss": 0.0, + "num_input_tokens_seen": 51789880, + "step": 89265 + }, + { + "epoch": 13.296097706285375, + "grad_norm": 0.0019228027667850256, + "learning_rate": 1.5247995856974995e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51792984, + "step": 89270 + }, + { + "epoch": 13.296842418826333, + "grad_norm": 0.0006102696643210948, + "learning_rate": 1.5245003934149194e-05, + "loss": 0.0, + "num_input_tokens_seen": 51795960, + "step": 89275 + }, + { + "epoch": 13.297587131367292, + "grad_norm": 0.00043701441609300673, + "learning_rate": 1.5242012176124135e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51798872, + "step": 89280 + }, + { + "epoch": 13.298331843908251, + "grad_norm": 0.004503579344600439, + "learning_rate": 1.5239020582950364e-05, + "loss": 0.0, + "num_input_tokens_seen": 51801752, + "step": 89285 + }, + { + "epoch": 13.299076556449211, + "grad_norm": 0.00019570766016840935, + "learning_rate": 1.5236029154678425e-05, + "loss": 0.0, + "num_input_tokens_seen": 51804568, + "step": 89290 + }, + { + "epoch": 13.29982126899017, + "grad_norm": 0.00012226143735460937, + "learning_rate": 1.523303789135884e-05, + "loss": 0.0, + "num_input_tokens_seen": 51807224, + "step": 89295 + }, + { + "epoch": 13.300565981531129, + "grad_norm": 0.0003792842908296734, + "learning_rate": 1.5230046793042163e-05, + "loss": 0.0, + "num_input_tokens_seen": 51810328, + "step": 89300 + }, + { + "epoch": 13.301310694072088, + "grad_norm": 0.0013332064263522625, + "learning_rate": 1.5227055859778917e-05, + "loss": 0.0, + "num_input_tokens_seen": 51813016, + "step": 89305 + }, + { + "epoch": 13.302055406613047, + "grad_norm": 0.0003888688515871763, + "learning_rate": 1.5224065091619622e-05, + "loss": 0.0827, + "num_input_tokens_seen": 51816056, + "step": 89310 + }, + { + "epoch": 13.302800119154007, + "grad_norm": 0.00024814336211420596, + "learning_rate": 1.5221074488614818e-05, + "loss": 0.0006, + "num_input_tokens_seen": 51819192, + "step": 89315 + }, + { + "epoch": 13.303544831694966, + "grad_norm": 7.856563024688512e-05, + "learning_rate": 1.521808405081501e-05, + "loss": 0.0, + "num_input_tokens_seen": 51821848, + "step": 89320 + }, + { + "epoch": 13.304289544235925, + "grad_norm": 0.0050266501493752, + "learning_rate": 1.521509377827074e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51824504, + "step": 89325 + }, + { + "epoch": 13.305034256776883, + "grad_norm": 0.0007202732376754284, + "learning_rate": 1.5212103671032507e-05, + "loss": 0.0, + "num_input_tokens_seen": 51827384, + "step": 89330 + }, + { + "epoch": 13.305778969317844, + "grad_norm": 0.003124336013570428, + "learning_rate": 1.5209113729150845e-05, + "loss": 0.0, + "num_input_tokens_seen": 51830008, + "step": 89335 + }, + { + "epoch": 13.306523681858803, + "grad_norm": 0.0007283605518750846, + "learning_rate": 1.520612395267625e-05, + "loss": 0.0, + "num_input_tokens_seen": 51832888, + "step": 89340 + }, + { + "epoch": 13.307268394399761, + "grad_norm": 0.00019101263023912907, + "learning_rate": 1.5203134341659242e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51836056, + "step": 89345 + }, + { + "epoch": 13.30801310694072, + "grad_norm": 0.002369494177401066, + "learning_rate": 1.520014489615032e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51838744, + "step": 89350 + }, + { + "epoch": 13.30875781948168, + "grad_norm": 0.0010380041785538197, + "learning_rate": 1.5197155616199982e-05, + "loss": 0.0, + "num_input_tokens_seen": 51841528, + "step": 89355 + }, + { + "epoch": 13.30950253202264, + "grad_norm": 0.00025794937391765416, + "learning_rate": 1.5194166501858747e-05, + "loss": 0.0, + "num_input_tokens_seen": 51844568, + "step": 89360 + }, + { + "epoch": 13.310247244563598, + "grad_norm": 0.000979748205281794, + "learning_rate": 1.5191177553177094e-05, + "loss": 0.0003, + "num_input_tokens_seen": 51847832, + "step": 89365 + }, + { + "epoch": 13.310991957104557, + "grad_norm": 0.002708369167521596, + "learning_rate": 1.5188188770205533e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51851992, + "step": 89370 + }, + { + "epoch": 13.311736669645517, + "grad_norm": 0.015024461783468723, + "learning_rate": 1.518520015299455e-05, + "loss": 0.0, + "num_input_tokens_seen": 51855192, + "step": 89375 + }, + { + "epoch": 13.312481382186476, + "grad_norm": 0.018653422594070435, + "learning_rate": 1.5182211701594634e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51857880, + "step": 89380 + }, + { + "epoch": 13.313226094727435, + "grad_norm": 0.005925075151026249, + "learning_rate": 1.5179223416056268e-05, + "loss": 0.0, + "num_input_tokens_seen": 51861080, + "step": 89385 + }, + { + "epoch": 13.313970807268394, + "grad_norm": 0.0042539737187325954, + "learning_rate": 1.517623529642995e-05, + "loss": 0.0, + "num_input_tokens_seen": 51863928, + "step": 89390 + }, + { + "epoch": 13.314715519809354, + "grad_norm": 0.0003390984202269465, + "learning_rate": 1.517324734276615e-05, + "loss": 0.0, + "num_input_tokens_seen": 51866904, + "step": 89395 + }, + { + "epoch": 13.315460232350313, + "grad_norm": 0.0014302660711109638, + "learning_rate": 1.5170259555115343e-05, + "loss": 0.0, + "num_input_tokens_seen": 51869784, + "step": 89400 + }, + { + "epoch": 13.316204944891272, + "grad_norm": 0.007185786962509155, + "learning_rate": 1.5167271933528015e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51872664, + "step": 89405 + }, + { + "epoch": 13.31694965743223, + "grad_norm": 0.008743546903133392, + "learning_rate": 1.5164284478054636e-05, + "loss": 0.0, + "num_input_tokens_seen": 51875640, + "step": 89410 + }, + { + "epoch": 13.317694369973191, + "grad_norm": 0.0009545486536808312, + "learning_rate": 1.5161297188745673e-05, + "loss": 0.0, + "num_input_tokens_seen": 51878392, + "step": 89415 + }, + { + "epoch": 13.31843908251415, + "grad_norm": 0.003071622457355261, + "learning_rate": 1.5158310065651588e-05, + "loss": 0.0012, + "num_input_tokens_seen": 51881496, + "step": 89420 + }, + { + "epoch": 13.319183795055109, + "grad_norm": 0.003690878627821803, + "learning_rate": 1.5155323108822861e-05, + "loss": 0.0, + "num_input_tokens_seen": 51884440, + "step": 89425 + }, + { + "epoch": 13.319928507596067, + "grad_norm": 0.0016549137653782964, + "learning_rate": 1.5152336318309942e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51887256, + "step": 89430 + }, + { + "epoch": 13.320673220137028, + "grad_norm": 0.01601378247141838, + "learning_rate": 1.5149349694163283e-05, + "loss": 0.0, + "num_input_tokens_seen": 51890136, + "step": 89435 + }, + { + "epoch": 13.321417932677987, + "grad_norm": 0.000479917973279953, + "learning_rate": 1.5146363236433362e-05, + "loss": 0.0, + "num_input_tokens_seen": 51893208, + "step": 89440 + }, + { + "epoch": 13.322162645218945, + "grad_norm": 0.00022249843459576368, + "learning_rate": 1.5143376945170612e-05, + "loss": 0.0, + "num_input_tokens_seen": 51896056, + "step": 89445 + }, + { + "epoch": 13.322907357759904, + "grad_norm": 0.0012679595965892076, + "learning_rate": 1.5140390820425495e-05, + "loss": 0.0, + "num_input_tokens_seen": 51899096, + "step": 89450 + }, + { + "epoch": 13.323652070300863, + "grad_norm": 0.006992999464273453, + "learning_rate": 1.5137404862248447e-05, + "loss": 0.0, + "num_input_tokens_seen": 51901784, + "step": 89455 + }, + { + "epoch": 13.324396782841823, + "grad_norm": 0.0008838145877234638, + "learning_rate": 1.5134419070689926e-05, + "loss": 0.0, + "num_input_tokens_seen": 51904440, + "step": 89460 + }, + { + "epoch": 13.325141495382782, + "grad_norm": 0.0020775406155735254, + "learning_rate": 1.5131433445800363e-05, + "loss": 0.0, + "num_input_tokens_seen": 51907192, + "step": 89465 + }, + { + "epoch": 13.325886207923741, + "grad_norm": 0.0005420511006377637, + "learning_rate": 1.5128447987630207e-05, + "loss": 0.04, + "num_input_tokens_seen": 51910040, + "step": 89470 + }, + { + "epoch": 13.3266309204647, + "grad_norm": 0.11186076700687408, + "learning_rate": 1.5125462696229892e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51912824, + "step": 89475 + }, + { + "epoch": 13.32737563300566, + "grad_norm": 0.014256076887249947, + "learning_rate": 1.5122477571649846e-05, + "loss": 0.0, + "num_input_tokens_seen": 51915672, + "step": 89480 + }, + { + "epoch": 13.328120345546619, + "grad_norm": 0.000993377878330648, + "learning_rate": 1.5119492613940503e-05, + "loss": 0.0002, + "num_input_tokens_seen": 51918744, + "step": 89485 + }, + { + "epoch": 13.328865058087578, + "grad_norm": 0.0001341797033092007, + "learning_rate": 1.5116507823152282e-05, + "loss": 0.0, + "num_input_tokens_seen": 51921432, + "step": 89490 + }, + { + "epoch": 13.329609770628537, + "grad_norm": 0.0005150517099536955, + "learning_rate": 1.5113523199335624e-05, + "loss": 0.0, + "num_input_tokens_seen": 51924248, + "step": 89495 + }, + { + "epoch": 13.330354483169497, + "grad_norm": 21.49188232421875, + "learning_rate": 1.5110538742540936e-05, + "loss": 0.1355, + "num_input_tokens_seen": 51927032, + "step": 89500 + }, + { + "epoch": 13.331099195710456, + "grad_norm": 0.0002306381647940725, + "learning_rate": 1.5107554452818653e-05, + "loss": 0.0, + "num_input_tokens_seen": 51929656, + "step": 89505 + }, + { + "epoch": 13.331843908251415, + "grad_norm": 0.0011158434208482504, + "learning_rate": 1.5104570330219187e-05, + "loss": 0.0, + "num_input_tokens_seen": 51932408, + "step": 89510 + }, + { + "epoch": 13.332588620792373, + "grad_norm": 0.0009719476802274585, + "learning_rate": 1.510158637479294e-05, + "loss": 0.0, + "num_input_tokens_seen": 51935224, + "step": 89515 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 0.004247117787599564, + "learning_rate": 1.5098602586590335e-05, + "loss": 0.0, + "num_input_tokens_seen": 51938168, + "step": 89520 + }, + { + "epoch": 13.334078045874293, + "grad_norm": 0.3357067406177521, + "learning_rate": 1.5095618965661767e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51941112, + "step": 89525 + }, + { + "epoch": 13.334822758415251, + "grad_norm": 0.008575805462896824, + "learning_rate": 1.5092635512057662e-05, + "loss": 0.0947, + "num_input_tokens_seen": 51944024, + "step": 89530 + }, + { + "epoch": 13.33556747095621, + "grad_norm": 0.00042892168858088553, + "learning_rate": 1.5089652225828399e-05, + "loss": 0.0097, + "num_input_tokens_seen": 51946904, + "step": 89535 + }, + { + "epoch": 13.33631218349717, + "grad_norm": 0.07323016971349716, + "learning_rate": 1.5086669107024398e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51949592, + "step": 89540 + }, + { + "epoch": 13.33705689603813, + "grad_norm": 0.0008644880726933479, + "learning_rate": 1.5083686155696043e-05, + "loss": 0.0, + "num_input_tokens_seen": 51952344, + "step": 89545 + }, + { + "epoch": 13.337801608579088, + "grad_norm": 0.007307155057787895, + "learning_rate": 1.5080703371893737e-05, + "loss": 0.0, + "num_input_tokens_seen": 51955256, + "step": 89550 + }, + { + "epoch": 13.338546321120047, + "grad_norm": 7.140624802559614e-05, + "learning_rate": 1.5077720755667868e-05, + "loss": 0.0, + "num_input_tokens_seen": 51958264, + "step": 89555 + }, + { + "epoch": 13.339291033661008, + "grad_norm": 0.0010484574595466256, + "learning_rate": 1.5074738307068809e-05, + "loss": 0.0, + "num_input_tokens_seen": 51961112, + "step": 89560 + }, + { + "epoch": 13.340035746201966, + "grad_norm": 3.36372759193182e-05, + "learning_rate": 1.5071756026146972e-05, + "loss": 0.0, + "num_input_tokens_seen": 51964152, + "step": 89565 + }, + { + "epoch": 13.340780458742925, + "grad_norm": 5.9502421208890155e-05, + "learning_rate": 1.506877391295271e-05, + "loss": 0.197, + "num_input_tokens_seen": 51967128, + "step": 89570 + }, + { + "epoch": 13.341525171283884, + "grad_norm": 0.07523191720247269, + "learning_rate": 1.5065791967536436e-05, + "loss": 0.0, + "num_input_tokens_seen": 51970104, + "step": 89575 + }, + { + "epoch": 13.342269883824844, + "grad_norm": 0.07399286329746246, + "learning_rate": 1.50628101899485e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51973304, + "step": 89580 + }, + { + "epoch": 13.343014596365803, + "grad_norm": 0.00216754712164402, + "learning_rate": 1.5059828580239296e-05, + "loss": 0.2389, + "num_input_tokens_seen": 51976024, + "step": 89585 + }, + { + "epoch": 13.343759308906762, + "grad_norm": 0.004493166692554951, + "learning_rate": 1.5056847138459185e-05, + "loss": 0.0155, + "num_input_tokens_seen": 51979128, + "step": 89590 + }, + { + "epoch": 13.34450402144772, + "grad_norm": 0.0006935963756404817, + "learning_rate": 1.5053865864658523e-05, + "loss": 0.0, + "num_input_tokens_seen": 51981816, + "step": 89595 + }, + { + "epoch": 13.345248733988681, + "grad_norm": 0.00034500050242058933, + "learning_rate": 1.5050884758887698e-05, + "loss": 0.2875, + "num_input_tokens_seen": 51984984, + "step": 89600 + }, + { + "epoch": 13.34599344652964, + "grad_norm": 0.016527066007256508, + "learning_rate": 1.504790382119706e-05, + "loss": 0.0003, + "num_input_tokens_seen": 51987768, + "step": 89605 + }, + { + "epoch": 13.346738159070599, + "grad_norm": 0.0025201248936355114, + "learning_rate": 1.5044923051636972e-05, + "loss": 0.0, + "num_input_tokens_seen": 51990712, + "step": 89610 + }, + { + "epoch": 13.347482871611557, + "grad_norm": 0.0036631047260016203, + "learning_rate": 1.504194245025779e-05, + "loss": 0.0, + "num_input_tokens_seen": 51993784, + "step": 89615 + }, + { + "epoch": 13.348227584152518, + "grad_norm": 0.001003822311758995, + "learning_rate": 1.5038962017109875e-05, + "loss": 0.0, + "num_input_tokens_seen": 51996536, + "step": 89620 + }, + { + "epoch": 13.348972296693477, + "grad_norm": 0.0022892942652106285, + "learning_rate": 1.5035981752243561e-05, + "loss": 0.0001, + "num_input_tokens_seen": 51999512, + "step": 89625 + }, + { + "epoch": 13.349717009234435, + "grad_norm": 0.006559832487255335, + "learning_rate": 1.5033001655709222e-05, + "loss": 0.0, + "num_input_tokens_seen": 52002136, + "step": 89630 + }, + { + "epoch": 13.350461721775394, + "grad_norm": 0.00023535318905487657, + "learning_rate": 1.5030021727557189e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52004824, + "step": 89635 + }, + { + "epoch": 13.351206434316353, + "grad_norm": 0.009734287858009338, + "learning_rate": 1.5027041967837802e-05, + "loss": 0.0, + "num_input_tokens_seen": 52007768, + "step": 89640 + }, + { + "epoch": 13.351951146857314, + "grad_norm": 0.0006461309967562556, + "learning_rate": 1.5024062376601406e-05, + "loss": 0.0, + "num_input_tokens_seen": 52010520, + "step": 89645 + }, + { + "epoch": 13.352695859398272, + "grad_norm": 0.004863432142883539, + "learning_rate": 1.502108295389833e-05, + "loss": 0.0, + "num_input_tokens_seen": 52013560, + "step": 89650 + }, + { + "epoch": 13.353440571939231, + "grad_norm": 0.00019871532276738435, + "learning_rate": 1.5018103699778923e-05, + "loss": 0.0, + "num_input_tokens_seen": 52016728, + "step": 89655 + }, + { + "epoch": 13.35418528448019, + "grad_norm": 0.0021332837641239166, + "learning_rate": 1.5015124614293501e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52019768, + "step": 89660 + }, + { + "epoch": 13.35492999702115, + "grad_norm": 0.03285318985581398, + "learning_rate": 1.5012145697492407e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52022392, + "step": 89665 + }, + { + "epoch": 13.35567470956211, + "grad_norm": 0.0012683249078691006, + "learning_rate": 1.5009166949425965e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52025560, + "step": 89670 + }, + { + "epoch": 13.356419422103068, + "grad_norm": 0.005877806805074215, + "learning_rate": 1.5006188370144486e-05, + "loss": 0.0, + "num_input_tokens_seen": 52028344, + "step": 89675 + }, + { + "epoch": 13.357164134644027, + "grad_norm": 0.02797473780810833, + "learning_rate": 1.5003209959698302e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52031000, + "step": 89680 + }, + { + "epoch": 13.357908847184987, + "grad_norm": 6.167461833683774e-05, + "learning_rate": 1.5000231718137717e-05, + "loss": 0.0, + "num_input_tokens_seen": 52033944, + "step": 89685 + }, + { + "epoch": 13.358653559725946, + "grad_norm": 0.00012358446838334203, + "learning_rate": 1.4997253645513063e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52036920, + "step": 89690 + }, + { + "epoch": 13.359398272266905, + "grad_norm": 0.0003271369496360421, + "learning_rate": 1.499427574187463e-05, + "loss": 0.0023, + "num_input_tokens_seen": 52039864, + "step": 89695 + }, + { + "epoch": 13.360142984807863, + "grad_norm": 0.0052860877476632595, + "learning_rate": 1.4991298007272753e-05, + "loss": 0.0, + "num_input_tokens_seen": 52042712, + "step": 89700 + }, + { + "epoch": 13.360887697348824, + "grad_norm": 51.687950134277344, + "learning_rate": 1.4988320441757714e-05, + "loss": 0.0988, + "num_input_tokens_seen": 52045688, + "step": 89705 + }, + { + "epoch": 13.361632409889783, + "grad_norm": 0.0003491007664706558, + "learning_rate": 1.4985343045379836e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52048696, + "step": 89710 + }, + { + "epoch": 13.362377122430741, + "grad_norm": 0.0005592597299255431, + "learning_rate": 1.4982365818189407e-05, + "loss": 0.0, + "num_input_tokens_seen": 52051896, + "step": 89715 + }, + { + "epoch": 13.3631218349717, + "grad_norm": 0.0004893157747574151, + "learning_rate": 1.4979388760236712e-05, + "loss": 0.0, + "num_input_tokens_seen": 52055128, + "step": 89720 + }, + { + "epoch": 13.36386654751266, + "grad_norm": 0.004298990126699209, + "learning_rate": 1.4976411871572074e-05, + "loss": 0.0, + "num_input_tokens_seen": 52057912, + "step": 89725 + }, + { + "epoch": 13.36461126005362, + "grad_norm": 0.0012330315075814724, + "learning_rate": 1.4973435152245757e-05, + "loss": 0.0, + "num_input_tokens_seen": 52060696, + "step": 89730 + }, + { + "epoch": 13.365355972594578, + "grad_norm": 0.0006833979859948158, + "learning_rate": 1.4970458602308077e-05, + "loss": 0.0, + "num_input_tokens_seen": 52063672, + "step": 89735 + }, + { + "epoch": 13.366100685135537, + "grad_norm": 5.616966154775582e-05, + "learning_rate": 1.4967482221809299e-05, + "loss": 0.2386, + "num_input_tokens_seen": 52066616, + "step": 89740 + }, + { + "epoch": 13.366845397676498, + "grad_norm": 0.005297776777297258, + "learning_rate": 1.4964506010799711e-05, + "loss": 0.0, + "num_input_tokens_seen": 52069368, + "step": 89745 + }, + { + "epoch": 13.367590110217456, + "grad_norm": 0.0009112043771892786, + "learning_rate": 1.4961529969329602e-05, + "loss": 0.0011, + "num_input_tokens_seen": 52072088, + "step": 89750 + }, + { + "epoch": 13.368334822758415, + "grad_norm": 0.0025034542195498943, + "learning_rate": 1.4958554097449228e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52075192, + "step": 89755 + }, + { + "epoch": 13.369079535299374, + "grad_norm": 0.0005582289304584265, + "learning_rate": 1.4955578395208886e-05, + "loss": 0.0577, + "num_input_tokens_seen": 52077976, + "step": 89760 + }, + { + "epoch": 13.369824247840334, + "grad_norm": 0.0009502055472694337, + "learning_rate": 1.4952602862658832e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52080824, + "step": 89765 + }, + { + "epoch": 13.370568960381293, + "grad_norm": 0.0001428972464054823, + "learning_rate": 1.494962749984935e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52083928, + "step": 89770 + }, + { + "epoch": 13.371313672922252, + "grad_norm": 0.0026809561531990767, + "learning_rate": 1.4946652306830688e-05, + "loss": 0.0, + "num_input_tokens_seen": 52086808, + "step": 89775 + }, + { + "epoch": 13.37205838546321, + "grad_norm": 0.0007656597881577909, + "learning_rate": 1.4943677283653124e-05, + "loss": 0.1782, + "num_input_tokens_seen": 52089624, + "step": 89780 + }, + { + "epoch": 13.372803098004171, + "grad_norm": 0.00428394740447402, + "learning_rate": 1.4940702430366905e-05, + "loss": 0.0014, + "num_input_tokens_seen": 52092344, + "step": 89785 + }, + { + "epoch": 13.37354781054513, + "grad_norm": 0.21422260999679565, + "learning_rate": 1.4937727747022302e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52095448, + "step": 89790 + }, + { + "epoch": 13.374292523086089, + "grad_norm": 0.0011083815479651093, + "learning_rate": 1.493475323366956e-05, + "loss": 0.0026, + "num_input_tokens_seen": 52098520, + "step": 89795 + }, + { + "epoch": 13.375037235627047, + "grad_norm": 0.0038379975594580173, + "learning_rate": 1.4931778890358924e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52101304, + "step": 89800 + }, + { + "epoch": 13.375781948168008, + "grad_norm": 0.0008488956955261528, + "learning_rate": 1.492880471714066e-05, + "loss": 0.1345, + "num_input_tokens_seen": 52104216, + "step": 89805 + }, + { + "epoch": 13.376526660708967, + "grad_norm": 0.0033426249865442514, + "learning_rate": 1.4925830714065003e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52107192, + "step": 89810 + }, + { + "epoch": 13.377271373249926, + "grad_norm": 0.0011283090570941567, + "learning_rate": 1.4922856881182199e-05, + "loss": 0.0, + "num_input_tokens_seen": 52109944, + "step": 89815 + }, + { + "epoch": 13.378016085790884, + "grad_norm": 17.474735260009766, + "learning_rate": 1.4919883218542474e-05, + "loss": 0.0025, + "num_input_tokens_seen": 52112824, + "step": 89820 + }, + { + "epoch": 13.378760798331843, + "grad_norm": 0.0029545703437179327, + "learning_rate": 1.4916909726196093e-05, + "loss": 0.0, + "num_input_tokens_seen": 52115736, + "step": 89825 + }, + { + "epoch": 13.379505510872804, + "grad_norm": 119.69184875488281, + "learning_rate": 1.4913936404193268e-05, + "loss": 0.1093, + "num_input_tokens_seen": 52118808, + "step": 89830 + }, + { + "epoch": 13.380250223413762, + "grad_norm": 0.05794348195195198, + "learning_rate": 1.4910963252584231e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52121560, + "step": 89835 + }, + { + "epoch": 13.380994935954721, + "grad_norm": 0.0005081310519017279, + "learning_rate": 1.4907990271419222e-05, + "loss": 0.0439, + "num_input_tokens_seen": 52124536, + "step": 89840 + }, + { + "epoch": 13.38173964849568, + "grad_norm": 0.0013693938963115215, + "learning_rate": 1.4905017460748458e-05, + "loss": 0.0, + "num_input_tokens_seen": 52127128, + "step": 89845 + }, + { + "epoch": 13.38248436103664, + "grad_norm": 9.837785182753578e-05, + "learning_rate": 1.4902044820622168e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52130296, + "step": 89850 + }, + { + "epoch": 13.3832290735776, + "grad_norm": 0.0034068457316607237, + "learning_rate": 1.489907235109056e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52133080, + "step": 89855 + }, + { + "epoch": 13.383973786118558, + "grad_norm": 0.008050752803683281, + "learning_rate": 1.4896100052203865e-05, + "loss": 0.0016, + "num_input_tokens_seen": 52135896, + "step": 89860 + }, + { + "epoch": 13.384718498659517, + "grad_norm": 4.886210918426514, + "learning_rate": 1.4893127924012281e-05, + "loss": 0.005, + "num_input_tokens_seen": 52138808, + "step": 89865 + }, + { + "epoch": 13.385463211200477, + "grad_norm": 0.00021087129425723106, + "learning_rate": 1.4890155966566039e-05, + "loss": 0.1346, + "num_input_tokens_seen": 52141592, + "step": 89870 + }, + { + "epoch": 13.386207923741436, + "grad_norm": 0.0027708816342055798, + "learning_rate": 1.4887184179915336e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52144248, + "step": 89875 + }, + { + "epoch": 13.386952636282395, + "grad_norm": 0.004737548064440489, + "learning_rate": 1.4884212564110379e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52146904, + "step": 89880 + }, + { + "epoch": 13.387697348823353, + "grad_norm": 0.005644996650516987, + "learning_rate": 1.4881241119201367e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52149688, + "step": 89885 + }, + { + "epoch": 13.388442061364314, + "grad_norm": 0.001760921673849225, + "learning_rate": 1.4878269845238496e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52152600, + "step": 89890 + }, + { + "epoch": 13.389186773905273, + "grad_norm": 0.06923316419124603, + "learning_rate": 1.4875298742271976e-05, + "loss": 0.0004, + "num_input_tokens_seen": 52155480, + "step": 89895 + }, + { + "epoch": 13.389931486446232, + "grad_norm": 0.0007611755863763392, + "learning_rate": 1.4872327810351986e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52158328, + "step": 89900 + }, + { + "epoch": 13.39067619898719, + "grad_norm": 0.006108501460403204, + "learning_rate": 1.4869357049528731e-05, + "loss": 0.0051, + "num_input_tokens_seen": 52161176, + "step": 89905 + }, + { + "epoch": 13.39142091152815, + "grad_norm": 0.0011040611425414681, + "learning_rate": 1.4866386459852394e-05, + "loss": 0.005, + "num_input_tokens_seen": 52164216, + "step": 89910 + }, + { + "epoch": 13.39216562406911, + "grad_norm": 0.000917410827241838, + "learning_rate": 1.4863416041373158e-05, + "loss": 0.0, + "num_input_tokens_seen": 52167160, + "step": 89915 + }, + { + "epoch": 13.392910336610068, + "grad_norm": 0.004479973576962948, + "learning_rate": 1.4860445794141204e-05, + "loss": 0.0762, + "num_input_tokens_seen": 52169848, + "step": 89920 + }, + { + "epoch": 13.393655049151027, + "grad_norm": 0.007262878119945526, + "learning_rate": 1.4857475718206706e-05, + "loss": 0.0, + "num_input_tokens_seen": 52172760, + "step": 89925 + }, + { + "epoch": 13.394399761691988, + "grad_norm": 0.004838470369577408, + "learning_rate": 1.4854505813619857e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52175672, + "step": 89930 + }, + { + "epoch": 13.395144474232946, + "grad_norm": 0.0032147704623639584, + "learning_rate": 1.4851536080430817e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52178968, + "step": 89935 + }, + { + "epoch": 13.395889186773905, + "grad_norm": 0.025277014821767807, + "learning_rate": 1.4848566518689757e-05, + "loss": 0.0, + "num_input_tokens_seen": 52181592, + "step": 89940 + }, + { + "epoch": 13.396633899314864, + "grad_norm": 0.0007910836138762534, + "learning_rate": 1.4845597128446853e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52184408, + "step": 89945 + }, + { + "epoch": 13.397378611855824, + "grad_norm": 6.985613435972482e-05, + "learning_rate": 1.4842627909752266e-05, + "loss": 0.0, + "num_input_tokens_seen": 52187352, + "step": 89950 + }, + { + "epoch": 13.398123324396783, + "grad_norm": 0.0009516527061350644, + "learning_rate": 1.4839658862656158e-05, + "loss": 0.1515, + "num_input_tokens_seen": 52190264, + "step": 89955 + }, + { + "epoch": 13.398868036937742, + "grad_norm": 0.039569683372974396, + "learning_rate": 1.4836689987208677e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52192952, + "step": 89960 + }, + { + "epoch": 13.3996127494787, + "grad_norm": 0.0004381113685667515, + "learning_rate": 1.483372128346e-05, + "loss": 0.0, + "num_input_tokens_seen": 52196024, + "step": 89965 + }, + { + "epoch": 13.400357462019661, + "grad_norm": 0.0021063571330159903, + "learning_rate": 1.4830752751460264e-05, + "loss": 0.0, + "num_input_tokens_seen": 52198616, + "step": 89970 + }, + { + "epoch": 13.40110217456062, + "grad_norm": 0.0003634906024672091, + "learning_rate": 1.482778439125963e-05, + "loss": 0.0307, + "num_input_tokens_seen": 52201560, + "step": 89975 + }, + { + "epoch": 13.401846887101579, + "grad_norm": 0.0005657581496052444, + "learning_rate": 1.482481620290823e-05, + "loss": 0.1875, + "num_input_tokens_seen": 52204248, + "step": 89980 + }, + { + "epoch": 13.402591599642538, + "grad_norm": 0.0006497553549706936, + "learning_rate": 1.4821848186456228e-05, + "loss": 0.0, + "num_input_tokens_seen": 52207320, + "step": 89985 + }, + { + "epoch": 13.403336312183498, + "grad_norm": 0.008065985515713692, + "learning_rate": 1.4818880341953745e-05, + "loss": 0.0025, + "num_input_tokens_seen": 52210232, + "step": 89990 + }, + { + "epoch": 13.404081024724457, + "grad_norm": 0.0005440423847176135, + "learning_rate": 1.4815912669450943e-05, + "loss": 0.0403, + "num_input_tokens_seen": 52213848, + "step": 89995 + }, + { + "epoch": 13.404825737265416, + "grad_norm": 0.005459535401314497, + "learning_rate": 1.4812945168997947e-05, + "loss": 0.0092, + "num_input_tokens_seen": 52217144, + "step": 90000 + }, + { + "epoch": 13.405570449806374, + "grad_norm": 0.0001575800561113283, + "learning_rate": 1.4809977840644881e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52219992, + "step": 90005 + }, + { + "epoch": 13.406315162347333, + "grad_norm": 0.003030756488442421, + "learning_rate": 1.4807010684441891e-05, + "loss": 0.0, + "num_input_tokens_seen": 52222872, + "step": 90010 + }, + { + "epoch": 13.407059874888294, + "grad_norm": 0.00047250138595700264, + "learning_rate": 1.4804043700439083e-05, + "loss": 0.1906, + "num_input_tokens_seen": 52225816, + "step": 90015 + }, + { + "epoch": 13.407804587429252, + "grad_norm": 0.0036068493500351906, + "learning_rate": 1.4801076888686605e-05, + "loss": 0.0796, + "num_input_tokens_seen": 52228952, + "step": 90020 + }, + { + "epoch": 13.408549299970211, + "grad_norm": 0.00041674674139358103, + "learning_rate": 1.4798110249234556e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52231864, + "step": 90025 + }, + { + "epoch": 13.40929401251117, + "grad_norm": 0.28847530484199524, + "learning_rate": 1.4795143782133075e-05, + "loss": 0.0627, + "num_input_tokens_seen": 52234808, + "step": 90030 + }, + { + "epoch": 13.41003872505213, + "grad_norm": 0.005335016641765833, + "learning_rate": 1.4792177487432271e-05, + "loss": 0.0536, + "num_input_tokens_seen": 52237784, + "step": 90035 + }, + { + "epoch": 13.41078343759309, + "grad_norm": 0.0003383853763807565, + "learning_rate": 1.4789211365182249e-05, + "loss": 0.0, + "num_input_tokens_seen": 52240856, + "step": 90040 + }, + { + "epoch": 13.411528150134048, + "grad_norm": 15.344229698181152, + "learning_rate": 1.4786245415433125e-05, + "loss": 0.0012, + "num_input_tokens_seen": 52243736, + "step": 90045 + }, + { + "epoch": 13.412272862675007, + "grad_norm": 97.10061645507812, + "learning_rate": 1.4783279638234994e-05, + "loss": 0.1191, + "num_input_tokens_seen": 52246488, + "step": 90050 + }, + { + "epoch": 13.413017575215967, + "grad_norm": 0.0005202136817388237, + "learning_rate": 1.4780314033637982e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52249400, + "step": 90055 + }, + { + "epoch": 13.413762287756926, + "grad_norm": 0.01246673334389925, + "learning_rate": 1.4777348601692168e-05, + "loss": 0.0, + "num_input_tokens_seen": 52252312, + "step": 90060 + }, + { + "epoch": 13.414507000297885, + "grad_norm": 143.28024291992188, + "learning_rate": 1.4774383342447667e-05, + "loss": 0.0146, + "num_input_tokens_seen": 52255224, + "step": 90065 + }, + { + "epoch": 13.415251712838844, + "grad_norm": 0.0009076157584786415, + "learning_rate": 1.477141825595456e-05, + "loss": 0.2188, + "num_input_tokens_seen": 52258008, + "step": 90070 + }, + { + "epoch": 13.415996425379804, + "grad_norm": 0.0021733357571065426, + "learning_rate": 1.4768453342262955e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52261080, + "step": 90075 + }, + { + "epoch": 13.416741137920763, + "grad_norm": 0.0008569390629418194, + "learning_rate": 1.4765488601422934e-05, + "loss": 0.005, + "num_input_tokens_seen": 52264312, + "step": 90080 + }, + { + "epoch": 13.417485850461722, + "grad_norm": 0.0004933393793180585, + "learning_rate": 1.4762524033484565e-05, + "loss": 0.0, + "num_input_tokens_seen": 52266840, + "step": 90085 + }, + { + "epoch": 13.41823056300268, + "grad_norm": 0.00031581794610247016, + "learning_rate": 1.475955963849796e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52269848, + "step": 90090 + }, + { + "epoch": 13.418975275543641, + "grad_norm": 0.010399503633379936, + "learning_rate": 1.4756595416513175e-05, + "loss": 0.0, + "num_input_tokens_seen": 52272760, + "step": 90095 + }, + { + "epoch": 13.4197199880846, + "grad_norm": 0.017755165696144104, + "learning_rate": 1.4753631367580312e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52275640, + "step": 90100 + }, + { + "epoch": 13.420464700625558, + "grad_norm": 6.104887870606035e-05, + "learning_rate": 1.4750667491749428e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52278680, + "step": 90105 + }, + { + "epoch": 13.421209413166517, + "grad_norm": 0.004224953707307577, + "learning_rate": 1.4747703789070606e-05, + "loss": 0.0, + "num_input_tokens_seen": 52281336, + "step": 90110 + }, + { + "epoch": 13.421954125707478, + "grad_norm": 0.02384316362440586, + "learning_rate": 1.4744740259593907e-05, + "loss": 0.0, + "num_input_tokens_seen": 52284184, + "step": 90115 + }, + { + "epoch": 13.422698838248436, + "grad_norm": 0.021760838106274605, + "learning_rate": 1.4741776903369386e-05, + "loss": 0.0, + "num_input_tokens_seen": 52287448, + "step": 90120 + }, + { + "epoch": 13.423443550789395, + "grad_norm": 0.0009223510860465467, + "learning_rate": 1.4738813720447132e-05, + "loss": 0.0007, + "num_input_tokens_seen": 52290264, + "step": 90125 + }, + { + "epoch": 13.424188263330354, + "grad_norm": 0.00231354427523911, + "learning_rate": 1.473585071087718e-05, + "loss": 0.0234, + "num_input_tokens_seen": 52292984, + "step": 90130 + }, + { + "epoch": 13.424932975871315, + "grad_norm": 6.9160691964498255e-06, + "learning_rate": 1.4732887874709605e-05, + "loss": 0.1256, + "num_input_tokens_seen": 52295800, + "step": 90135 + }, + { + "epoch": 13.425677688412273, + "grad_norm": 0.0009043971658684313, + "learning_rate": 1.4729925211994455e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52298776, + "step": 90140 + }, + { + "epoch": 13.426422400953232, + "grad_norm": 0.0005601105513051152, + "learning_rate": 1.4726962722781783e-05, + "loss": 0.0253, + "num_input_tokens_seen": 52301944, + "step": 90145 + }, + { + "epoch": 13.42716711349419, + "grad_norm": 0.0012229450512677431, + "learning_rate": 1.4724000407121624e-05, + "loss": 0.0, + "num_input_tokens_seen": 52305176, + "step": 90150 + }, + { + "epoch": 13.42791182603515, + "grad_norm": 0.12539000809192657, + "learning_rate": 1.4721038265064044e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52308056, + "step": 90155 + }, + { + "epoch": 13.42865653857611, + "grad_norm": 17.468164443969727, + "learning_rate": 1.4718076296659078e-05, + "loss": 0.2126, + "num_input_tokens_seen": 52310776, + "step": 90160 + }, + { + "epoch": 13.429401251117069, + "grad_norm": 0.001715418417006731, + "learning_rate": 1.471511450195675e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52313624, + "step": 90165 + }, + { + "epoch": 13.430145963658028, + "grad_norm": 0.0005864565609954298, + "learning_rate": 1.4712152881007118e-05, + "loss": 0.0041, + "num_input_tokens_seen": 52316472, + "step": 90170 + }, + { + "epoch": 13.430890676198986, + "grad_norm": 0.00013698435213882476, + "learning_rate": 1.4709191433860206e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52319288, + "step": 90175 + }, + { + "epoch": 13.431635388739947, + "grad_norm": 5.8261015510652214e-05, + "learning_rate": 1.4706230160566048e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52322360, + "step": 90180 + }, + { + "epoch": 13.432380101280906, + "grad_norm": 0.006779439281672239, + "learning_rate": 1.4703269061174663e-05, + "loss": 0.0452, + "num_input_tokens_seen": 52325240, + "step": 90185 + }, + { + "epoch": 13.433124813821864, + "grad_norm": 0.00039142186869867146, + "learning_rate": 1.4700308135736085e-05, + "loss": 0.1005, + "num_input_tokens_seen": 52328312, + "step": 90190 + }, + { + "epoch": 13.433869526362823, + "grad_norm": 0.0030172262340784073, + "learning_rate": 1.4697347384300338e-05, + "loss": 0.0, + "num_input_tokens_seen": 52331032, + "step": 90195 + }, + { + "epoch": 13.434614238903784, + "grad_norm": 0.004988531116396189, + "learning_rate": 1.4694386806917426e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52333944, + "step": 90200 + }, + { + "epoch": 13.435358951444742, + "grad_norm": 0.005024812649935484, + "learning_rate": 1.4691426403637382e-05, + "loss": 0.0, + "num_input_tokens_seen": 52337048, + "step": 90205 + }, + { + "epoch": 13.436103663985701, + "grad_norm": 0.23730508983135223, + "learning_rate": 1.4688466174510209e-05, + "loss": 0.1692, + "num_input_tokens_seen": 52340088, + "step": 90210 + }, + { + "epoch": 13.43684837652666, + "grad_norm": 0.020978767424821854, + "learning_rate": 1.4685506119585924e-05, + "loss": 0.0, + "num_input_tokens_seen": 52342968, + "step": 90215 + }, + { + "epoch": 13.43759308906762, + "grad_norm": 3.139847831334919e-05, + "learning_rate": 1.4682546238914521e-05, + "loss": 0.1689, + "num_input_tokens_seen": 52345656, + "step": 90220 + }, + { + "epoch": 13.43833780160858, + "grad_norm": 0.0017595929093658924, + "learning_rate": 1.4679586532546025e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52348760, + "step": 90225 + }, + { + "epoch": 13.439082514149538, + "grad_norm": 4.048434257507324, + "learning_rate": 1.467662700053041e-05, + "loss": 0.027, + "num_input_tokens_seen": 52351544, + "step": 90230 + }, + { + "epoch": 13.439827226690497, + "grad_norm": 0.21965213119983673, + "learning_rate": 1.4673667642917705e-05, + "loss": 0.0399, + "num_input_tokens_seen": 52354424, + "step": 90235 + }, + { + "epoch": 13.440571939231457, + "grad_norm": 0.006441324483603239, + "learning_rate": 1.4670708459757885e-05, + "loss": 0.0, + "num_input_tokens_seen": 52357176, + "step": 90240 + }, + { + "epoch": 13.441316651772416, + "grad_norm": 0.00014098260726314038, + "learning_rate": 1.4667749451100943e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52359960, + "step": 90245 + }, + { + "epoch": 13.442061364313375, + "grad_norm": 0.00023058298393152654, + "learning_rate": 1.4664790616996881e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52363096, + "step": 90250 + }, + { + "epoch": 13.442806076854334, + "grad_norm": 0.008146905340254307, + "learning_rate": 1.4661831957495665e-05, + "loss": 0.0468, + "num_input_tokens_seen": 52365784, + "step": 90255 + }, + { + "epoch": 13.443550789395294, + "grad_norm": 0.0012224747333675623, + "learning_rate": 1.4658873472647299e-05, + "loss": 0.0, + "num_input_tokens_seen": 52368760, + "step": 90260 + }, + { + "epoch": 13.444295501936253, + "grad_norm": 18.694904327392578, + "learning_rate": 1.4655915162501754e-05, + "loss": 0.1257, + "num_input_tokens_seen": 52371672, + "step": 90265 + }, + { + "epoch": 13.445040214477212, + "grad_norm": 20.140277862548828, + "learning_rate": 1.4652957027109009e-05, + "loss": 0.0938, + "num_input_tokens_seen": 52374520, + "step": 90270 + }, + { + "epoch": 13.44578492701817, + "grad_norm": 0.03673020005226135, + "learning_rate": 1.4649999066519043e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52377528, + "step": 90275 + }, + { + "epoch": 13.446529639559131, + "grad_norm": 0.016460254788398743, + "learning_rate": 1.4647041280781821e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52380312, + "step": 90280 + }, + { + "epoch": 13.44727435210009, + "grad_norm": 0.004612195771187544, + "learning_rate": 1.4644083669947314e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52383352, + "step": 90285 + }, + { + "epoch": 13.448019064641048, + "grad_norm": 0.0013668602332472801, + "learning_rate": 1.464112623406548e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52386232, + "step": 90290 + }, + { + "epoch": 13.448763777182007, + "grad_norm": 0.00015069113578647375, + "learning_rate": 1.46381689731863e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52388984, + "step": 90295 + }, + { + "epoch": 13.449508489722968, + "grad_norm": 0.02727825567126274, + "learning_rate": 1.463521188735972e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52391640, + "step": 90300 + }, + { + "epoch": 13.450253202263927, + "grad_norm": 0.007990529760718346, + "learning_rate": 1.4632254976635706e-05, + "loss": 0.0737, + "num_input_tokens_seen": 52394392, + "step": 90305 + }, + { + "epoch": 13.450997914804885, + "grad_norm": 0.004238023888319731, + "learning_rate": 1.4629298241064196e-05, + "loss": 0.1129, + "num_input_tokens_seen": 52397080, + "step": 90310 + }, + { + "epoch": 13.451742627345844, + "grad_norm": 0.011924941092729568, + "learning_rate": 1.462634168069516e-05, + "loss": 0.1349, + "num_input_tokens_seen": 52400248, + "step": 90315 + }, + { + "epoch": 13.452487339886805, + "grad_norm": 0.0031466693617403507, + "learning_rate": 1.4623385295578542e-05, + "loss": 0.0033, + "num_input_tokens_seen": 52403128, + "step": 90320 + }, + { + "epoch": 13.453232052427763, + "grad_norm": 0.00038648772169835865, + "learning_rate": 1.462042908576427e-05, + "loss": 0.0, + "num_input_tokens_seen": 52405880, + "step": 90325 + }, + { + "epoch": 13.453976764968722, + "grad_norm": 0.30070286989212036, + "learning_rate": 1.461747305130231e-05, + "loss": 0.0009, + "num_input_tokens_seen": 52408600, + "step": 90330 + }, + { + "epoch": 13.45472147750968, + "grad_norm": 0.0023240866139531136, + "learning_rate": 1.4614517192242588e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52411320, + "step": 90335 + }, + { + "epoch": 13.45546619005064, + "grad_norm": 0.011999426409602165, + "learning_rate": 1.4611561508635047e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52413816, + "step": 90340 + }, + { + "epoch": 13.4562109025916, + "grad_norm": 0.014145677909255028, + "learning_rate": 1.4608606000529601e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52416824, + "step": 90345 + }, + { + "epoch": 13.456955615132559, + "grad_norm": 0.006210649851709604, + "learning_rate": 1.4605650667976211e-05, + "loss": 0.0033, + "num_input_tokens_seen": 52419992, + "step": 90350 + }, + { + "epoch": 13.457700327673518, + "grad_norm": 0.026004649698734283, + "learning_rate": 1.4602695511024785e-05, + "loss": 0.0892, + "num_input_tokens_seen": 52422776, + "step": 90355 + }, + { + "epoch": 13.458445040214476, + "grad_norm": 0.020893096923828125, + "learning_rate": 1.4599740529725242e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52426072, + "step": 90360 + }, + { + "epoch": 13.459189752755437, + "grad_norm": 0.003505058353766799, + "learning_rate": 1.4596785724127526e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52428632, + "step": 90365 + }, + { + "epoch": 13.459934465296396, + "grad_norm": 1.2603944540023804, + "learning_rate": 1.4593831094281529e-05, + "loss": 0.0006, + "num_input_tokens_seen": 52431480, + "step": 90370 + }, + { + "epoch": 13.460679177837354, + "grad_norm": 0.005025065969675779, + "learning_rate": 1.4590876640237189e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52434424, + "step": 90375 + }, + { + "epoch": 13.461423890378313, + "grad_norm": 35.20611572265625, + "learning_rate": 1.4587922362044399e-05, + "loss": 0.1108, + "num_input_tokens_seen": 52437304, + "step": 90380 + }, + { + "epoch": 13.462168602919274, + "grad_norm": 0.013310756534337997, + "learning_rate": 1.4584968259753088e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52440184, + "step": 90385 + }, + { + "epoch": 13.462913315460233, + "grad_norm": 0.00019135684124194086, + "learning_rate": 1.4582014333413153e-05, + "loss": 0.1532, + "num_input_tokens_seen": 52442968, + "step": 90390 + }, + { + "epoch": 13.463658028001191, + "grad_norm": 0.010220494121313095, + "learning_rate": 1.4579060583074497e-05, + "loss": 0.0108, + "num_input_tokens_seen": 52445784, + "step": 90395 + }, + { + "epoch": 13.46440274054215, + "grad_norm": 0.001658123335801065, + "learning_rate": 1.4576107008787022e-05, + "loss": 0.0, + "num_input_tokens_seen": 52448856, + "step": 90400 + }, + { + "epoch": 13.46514745308311, + "grad_norm": 0.004821713548153639, + "learning_rate": 1.4573153610600615e-05, + "loss": 0.0, + "num_input_tokens_seen": 52451864, + "step": 90405 + }, + { + "epoch": 13.46589216562407, + "grad_norm": 6.42706363578327e-05, + "learning_rate": 1.4570200388565189e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52454840, + "step": 90410 + }, + { + "epoch": 13.466636878165028, + "grad_norm": 0.017417924478650093, + "learning_rate": 1.4567247342730617e-05, + "loss": 0.0, + "num_input_tokens_seen": 52457848, + "step": 90415 + }, + { + "epoch": 13.467381590705987, + "grad_norm": 0.003911626059561968, + "learning_rate": 1.4564294473146808e-05, + "loss": 0.0008, + "num_input_tokens_seen": 52460440, + "step": 90420 + }, + { + "epoch": 13.468126303246947, + "grad_norm": 0.00023829216661397368, + "learning_rate": 1.456134177986363e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52463288, + "step": 90425 + }, + { + "epoch": 13.468871015787906, + "grad_norm": 0.00014635719708167017, + "learning_rate": 1.455838926293098e-05, + "loss": 0.0, + "num_input_tokens_seen": 52466104, + "step": 90430 + }, + { + "epoch": 13.469615728328865, + "grad_norm": 0.006645573303103447, + "learning_rate": 1.4555436922398732e-05, + "loss": 0.0, + "num_input_tokens_seen": 52469400, + "step": 90435 + }, + { + "epoch": 13.470360440869824, + "grad_norm": 0.0025548329576849937, + "learning_rate": 1.4552484758316754e-05, + "loss": 0.0, + "num_input_tokens_seen": 52472312, + "step": 90440 + }, + { + "epoch": 13.471105153410784, + "grad_norm": 4.31908956670668e-05, + "learning_rate": 1.4549532770734936e-05, + "loss": 0.0, + "num_input_tokens_seen": 52475064, + "step": 90445 + }, + { + "epoch": 13.471849865951743, + "grad_norm": 0.0020624729804694653, + "learning_rate": 1.4546580959703138e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52477880, + "step": 90450 + }, + { + "epoch": 13.472594578492702, + "grad_norm": 0.00014129024930298328, + "learning_rate": 1.4543629325271225e-05, + "loss": 0.2588, + "num_input_tokens_seen": 52480792, + "step": 90455 + }, + { + "epoch": 13.47333929103366, + "grad_norm": 0.8822560906410217, + "learning_rate": 1.4540677867489072e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52483704, + "step": 90460 + }, + { + "epoch": 13.474084003574621, + "grad_norm": 0.007692682556807995, + "learning_rate": 1.4537726586406538e-05, + "loss": 0.0, + "num_input_tokens_seen": 52486680, + "step": 90465 + }, + { + "epoch": 13.47482871611558, + "grad_norm": 0.00537778390571475, + "learning_rate": 1.453477548207347e-05, + "loss": 0.0217, + "num_input_tokens_seen": 52489848, + "step": 90470 + }, + { + "epoch": 13.475573428656539, + "grad_norm": 0.025228746235370636, + "learning_rate": 1.4531824554539747e-05, + "loss": 0.1172, + "num_input_tokens_seen": 52492600, + "step": 90475 + }, + { + "epoch": 13.476318141197497, + "grad_norm": 0.007671456318348646, + "learning_rate": 1.4528873803855206e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52495416, + "step": 90480 + }, + { + "epoch": 13.477062853738458, + "grad_norm": 0.0007537428173236549, + "learning_rate": 1.4525923230069689e-05, + "loss": 0.0, + "num_input_tokens_seen": 52498264, + "step": 90485 + }, + { + "epoch": 13.477807566279417, + "grad_norm": 0.0008681355393491685, + "learning_rate": 1.4522972833233068e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52501496, + "step": 90490 + }, + { + "epoch": 13.478552278820375, + "grad_norm": 0.0031556657049804926, + "learning_rate": 1.4520022613395157e-05, + "loss": 0.0, + "num_input_tokens_seen": 52504568, + "step": 90495 + }, + { + "epoch": 13.479296991361334, + "grad_norm": 0.007183549925684929, + "learning_rate": 1.4517072570605824e-05, + "loss": 0.0, + "num_input_tokens_seen": 52507608, + "step": 90500 + }, + { + "epoch": 13.480041703902295, + "grad_norm": 0.07333120703697205, + "learning_rate": 1.4514122704914887e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52510392, + "step": 90505 + }, + { + "epoch": 13.480786416443253, + "grad_norm": 0.02425302378833294, + "learning_rate": 1.4511173016372199e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52513464, + "step": 90510 + }, + { + "epoch": 13.481531128984212, + "grad_norm": 0.0015448638005182147, + "learning_rate": 1.4508223505027581e-05, + "loss": 0.0, + "num_input_tokens_seen": 52516536, + "step": 90515 + }, + { + "epoch": 13.482275841525171, + "grad_norm": 0.001371530583128333, + "learning_rate": 1.4505274170930866e-05, + "loss": 0.0, + "num_input_tokens_seen": 52519512, + "step": 90520 + }, + { + "epoch": 13.48302055406613, + "grad_norm": 0.03054928407073021, + "learning_rate": 1.450232501413188e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52522456, + "step": 90525 + }, + { + "epoch": 13.48376526660709, + "grad_norm": 0.005611999426037073, + "learning_rate": 1.4499376034680429e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52525656, + "step": 90530 + }, + { + "epoch": 13.484509979148049, + "grad_norm": 0.019743746146559715, + "learning_rate": 1.4496427232626358e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52528568, + "step": 90535 + }, + { + "epoch": 13.485254691689008, + "grad_norm": 0.002222024602815509, + "learning_rate": 1.4493478608019461e-05, + "loss": 0.0148, + "num_input_tokens_seen": 52531448, + "step": 90540 + }, + { + "epoch": 13.485999404229966, + "grad_norm": 0.00010943857341771945, + "learning_rate": 1.449053016090958e-05, + "loss": 0.0029, + "num_input_tokens_seen": 52534488, + "step": 90545 + }, + { + "epoch": 13.486744116770927, + "grad_norm": 0.0002923564752563834, + "learning_rate": 1.4487581891346497e-05, + "loss": 0.2563, + "num_input_tokens_seen": 52537304, + "step": 90550 + }, + { + "epoch": 13.487488829311886, + "grad_norm": 0.017648793756961823, + "learning_rate": 1.4484633799380046e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52539928, + "step": 90555 + }, + { + "epoch": 13.488233541852845, + "grad_norm": 0.0003586981038097292, + "learning_rate": 1.4481685885060017e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52542712, + "step": 90560 + }, + { + "epoch": 13.488978254393803, + "grad_norm": 0.009078536182641983, + "learning_rate": 1.4478738148436205e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52545688, + "step": 90565 + }, + { + "epoch": 13.489722966934764, + "grad_norm": 0.006587964948266745, + "learning_rate": 1.447579058955843e-05, + "loss": 0.0015, + "num_input_tokens_seen": 52548856, + "step": 90570 + }, + { + "epoch": 13.490467679475723, + "grad_norm": 0.0017626115586608648, + "learning_rate": 1.4472843208476466e-05, + "loss": 0.0, + "num_input_tokens_seen": 52552216, + "step": 90575 + }, + { + "epoch": 13.491212392016681, + "grad_norm": 0.018553799018263817, + "learning_rate": 1.446989600524013e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52555288, + "step": 90580 + }, + { + "epoch": 13.49195710455764, + "grad_norm": 0.003555783536285162, + "learning_rate": 1.4466948979899194e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52558264, + "step": 90585 + }, + { + "epoch": 13.4927018170986, + "grad_norm": 0.005760041996836662, + "learning_rate": 1.446400213250345e-05, + "loss": 0.0, + "num_input_tokens_seen": 52561208, + "step": 90590 + }, + { + "epoch": 13.49344652963956, + "grad_norm": 1.9814966435660608e-05, + "learning_rate": 1.4461055463102673e-05, + "loss": 0.0, + "num_input_tokens_seen": 52563896, + "step": 90595 + }, + { + "epoch": 13.494191242180518, + "grad_norm": 0.0007523995009250939, + "learning_rate": 1.4458108971746665e-05, + "loss": 0.0, + "num_input_tokens_seen": 52566808, + "step": 90600 + }, + { + "epoch": 13.494935954721477, + "grad_norm": 4.6214976464398205e-05, + "learning_rate": 1.4455162658485188e-05, + "loss": 0.0, + "num_input_tokens_seen": 52569624, + "step": 90605 + }, + { + "epoch": 13.495680667262437, + "grad_norm": 0.20379145443439484, + "learning_rate": 1.4452216523368011e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52573016, + "step": 90610 + }, + { + "epoch": 13.496425379803396, + "grad_norm": 0.0006086663925088942, + "learning_rate": 1.4449270566444929e-05, + "loss": 0.0, + "num_input_tokens_seen": 52575864, + "step": 90615 + }, + { + "epoch": 13.497170092344355, + "grad_norm": 0.0003152074059471488, + "learning_rate": 1.4446324787765686e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52578936, + "step": 90620 + }, + { + "epoch": 13.497914804885314, + "grad_norm": 0.000708229374140501, + "learning_rate": 1.4443379187380068e-05, + "loss": 0.0, + "num_input_tokens_seen": 52581944, + "step": 90625 + }, + { + "epoch": 13.498659517426274, + "grad_norm": 0.0023365975357592106, + "learning_rate": 1.4440433765337819e-05, + "loss": 0.0192, + "num_input_tokens_seen": 52584632, + "step": 90630 + }, + { + "epoch": 13.499404229967233, + "grad_norm": 0.009402208961546421, + "learning_rate": 1.443748852168872e-05, + "loss": 0.0178, + "num_input_tokens_seen": 52587576, + "step": 90635 + }, + { + "epoch": 13.500148942508192, + "grad_norm": 0.3144727945327759, + "learning_rate": 1.443454345648252e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52590648, + "step": 90640 + }, + { + "epoch": 13.50089365504915, + "grad_norm": 0.010639403015375137, + "learning_rate": 1.4431598569768955e-05, + "loss": 0.0, + "num_input_tokens_seen": 52593496, + "step": 90645 + }, + { + "epoch": 13.501638367590111, + "grad_norm": 0.0002209533122368157, + "learning_rate": 1.4428653861597802e-05, + "loss": 0.0, + "num_input_tokens_seen": 52596440, + "step": 90650 + }, + { + "epoch": 13.50238308013107, + "grad_norm": 0.0007091471925377846, + "learning_rate": 1.44257093320188e-05, + "loss": 0.0, + "num_input_tokens_seen": 52599448, + "step": 90655 + }, + { + "epoch": 13.503127792672029, + "grad_norm": 0.00177253398578614, + "learning_rate": 1.4422764981081691e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52602168, + "step": 90660 + }, + { + "epoch": 13.503872505212987, + "grad_norm": 0.00015975130372680724, + "learning_rate": 1.4419820808836207e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52605016, + "step": 90665 + }, + { + "epoch": 13.504617217753946, + "grad_norm": 0.005839542951434851, + "learning_rate": 1.441687681533211e-05, + "loss": 0.0069, + "num_input_tokens_seen": 52607960, + "step": 90670 + }, + { + "epoch": 13.505361930294907, + "grad_norm": 0.00033573602559044957, + "learning_rate": 1.4413933000619106e-05, + "loss": 0.0175, + "num_input_tokens_seen": 52610648, + "step": 90675 + }, + { + "epoch": 13.506106642835865, + "grad_norm": 0.0036710742861032486, + "learning_rate": 1.441098936474696e-05, + "loss": 0.0, + "num_input_tokens_seen": 52613560, + "step": 90680 + }, + { + "epoch": 13.506851355376824, + "grad_norm": 0.0009858862031251192, + "learning_rate": 1.4408045907765385e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52616312, + "step": 90685 + }, + { + "epoch": 13.507596067917785, + "grad_norm": 5.915142173762433e-05, + "learning_rate": 1.44051026297241e-05, + "loss": 0.0, + "num_input_tokens_seen": 52619000, + "step": 90690 + }, + { + "epoch": 13.508340780458743, + "grad_norm": 169.71023559570312, + "learning_rate": 1.4402159530672849e-05, + "loss": 0.1719, + "num_input_tokens_seen": 52621848, + "step": 90695 + }, + { + "epoch": 13.509085492999702, + "grad_norm": 0.008699222467839718, + "learning_rate": 1.4399216610661329e-05, + "loss": 0.0014, + "num_input_tokens_seen": 52624696, + "step": 90700 + }, + { + "epoch": 13.509830205540661, + "grad_norm": 0.0054837726056575775, + "learning_rate": 1.4396273869739279e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52627352, + "step": 90705 + }, + { + "epoch": 13.51057491808162, + "grad_norm": 0.008565113879740238, + "learning_rate": 1.4393331307956399e-05, + "loss": 0.0, + "num_input_tokens_seen": 52630360, + "step": 90710 + }, + { + "epoch": 13.51131963062258, + "grad_norm": 0.0014816162874922156, + "learning_rate": 1.4390388925362413e-05, + "loss": 0.0335, + "num_input_tokens_seen": 52633176, + "step": 90715 + }, + { + "epoch": 13.512064343163539, + "grad_norm": 0.00017403665697202086, + "learning_rate": 1.4387446722007025e-05, + "loss": 0.0, + "num_input_tokens_seen": 52636152, + "step": 90720 + }, + { + "epoch": 13.512809055704498, + "grad_norm": 0.0004479441267903894, + "learning_rate": 1.4384504697939938e-05, + "loss": 0.0, + "num_input_tokens_seen": 52639224, + "step": 90725 + }, + { + "epoch": 13.513553768245457, + "grad_norm": 0.12854720652103424, + "learning_rate": 1.4381562853210856e-05, + "loss": 0.0763, + "num_input_tokens_seen": 52642072, + "step": 90730 + }, + { + "epoch": 13.514298480786417, + "grad_norm": 0.002615525620058179, + "learning_rate": 1.4378621187869467e-05, + "loss": 0.2156, + "num_input_tokens_seen": 52645016, + "step": 90735 + }, + { + "epoch": 13.515043193327376, + "grad_norm": 0.00016096378385555, + "learning_rate": 1.4375679701965488e-05, + "loss": 0.0, + "num_input_tokens_seen": 52647640, + "step": 90740 + }, + { + "epoch": 13.515787905868335, + "grad_norm": 0.00038093538023531437, + "learning_rate": 1.437273839554859e-05, + "loss": 0.0, + "num_input_tokens_seen": 52650872, + "step": 90745 + }, + { + "epoch": 13.516532618409293, + "grad_norm": 0.0011991601204499602, + "learning_rate": 1.436979726866849e-05, + "loss": 0.0, + "num_input_tokens_seen": 52653752, + "step": 90750 + }, + { + "epoch": 13.517277330950254, + "grad_norm": 50.80692672729492, + "learning_rate": 1.436685632137485e-05, + "loss": 0.0046, + "num_input_tokens_seen": 52656600, + "step": 90755 + }, + { + "epoch": 13.518022043491213, + "grad_norm": 0.0010489156702533364, + "learning_rate": 1.4363915553717371e-05, + "loss": 0.0, + "num_input_tokens_seen": 52659320, + "step": 90760 + }, + { + "epoch": 13.518766756032171, + "grad_norm": 2.705612132558599e-05, + "learning_rate": 1.4360974965745732e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52662360, + "step": 90765 + }, + { + "epoch": 13.51951146857313, + "grad_norm": 0.00015733044710941613, + "learning_rate": 1.4358034557509598e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52665240, + "step": 90770 + }, + { + "epoch": 13.52025618111409, + "grad_norm": 0.0002996798139065504, + "learning_rate": 1.4355094329058666e-05, + "loss": 0.0, + "num_input_tokens_seen": 52668024, + "step": 90775 + }, + { + "epoch": 13.52100089365505, + "grad_norm": 0.004575766157358885, + "learning_rate": 1.4352154280442592e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52670776, + "step": 90780 + }, + { + "epoch": 13.521745606196008, + "grad_norm": 0.0012297171633690596, + "learning_rate": 1.4349214411711043e-05, + "loss": 0.0, + "num_input_tokens_seen": 52673688, + "step": 90785 + }, + { + "epoch": 13.522490318736967, + "grad_norm": 0.0032002627849578857, + "learning_rate": 1.43462747229137e-05, + "loss": 0.0, + "num_input_tokens_seen": 52676312, + "step": 90790 + }, + { + "epoch": 13.523235031277927, + "grad_norm": 0.0007849412504583597, + "learning_rate": 1.4343335214100218e-05, + "loss": 0.0, + "num_input_tokens_seen": 52679320, + "step": 90795 + }, + { + "epoch": 13.523979743818886, + "grad_norm": 239.3965301513672, + "learning_rate": 1.4340395885320257e-05, + "loss": 0.0762, + "num_input_tokens_seen": 52682232, + "step": 90800 + }, + { + "epoch": 13.524724456359845, + "grad_norm": 0.0018019721610471606, + "learning_rate": 1.4337456736623462e-05, + "loss": 0.0006, + "num_input_tokens_seen": 52685272, + "step": 90805 + }, + { + "epoch": 13.525469168900804, + "grad_norm": 0.0001432479766663164, + "learning_rate": 1.433451776805951e-05, + "loss": 0.0, + "num_input_tokens_seen": 52688440, + "step": 90810 + }, + { + "epoch": 13.526213881441764, + "grad_norm": 0.0002055466902675107, + "learning_rate": 1.4331578979678029e-05, + "loss": 0.0, + "num_input_tokens_seen": 52691192, + "step": 90815 + }, + { + "epoch": 13.526958593982723, + "grad_norm": 0.0005740756751038134, + "learning_rate": 1.432864037152869e-05, + "loss": 0.0, + "num_input_tokens_seen": 52694008, + "step": 90820 + }, + { + "epoch": 13.527703306523682, + "grad_norm": 0.00046687814756296575, + "learning_rate": 1.4325701943661116e-05, + "loss": 0.0047, + "num_input_tokens_seen": 52696664, + "step": 90825 + }, + { + "epoch": 13.52844801906464, + "grad_norm": 1.7898806618177332e-05, + "learning_rate": 1.432276369612497e-05, + "loss": 0.0, + "num_input_tokens_seen": 52699576, + "step": 90830 + }, + { + "epoch": 13.529192731605601, + "grad_norm": 0.0002530007332097739, + "learning_rate": 1.4319825628969863e-05, + "loss": 0.0, + "num_input_tokens_seen": 52702104, + "step": 90835 + }, + { + "epoch": 13.52993744414656, + "grad_norm": 0.0007002571364864707, + "learning_rate": 1.4316887742245464e-05, + "loss": 0.0, + "num_input_tokens_seen": 52704856, + "step": 90840 + }, + { + "epoch": 13.530682156687519, + "grad_norm": 0.00026272449758835137, + "learning_rate": 1.4313950036001384e-05, + "loss": 0.0, + "num_input_tokens_seen": 52708152, + "step": 90845 + }, + { + "epoch": 13.531426869228477, + "grad_norm": 5.5050939408829436e-05, + "learning_rate": 1.431101251028726e-05, + "loss": 0.0, + "num_input_tokens_seen": 52711128, + "step": 90850 + }, + { + "epoch": 13.532171581769436, + "grad_norm": 0.0002444588753860444, + "learning_rate": 1.4308075165152718e-05, + "loss": 0.0, + "num_input_tokens_seen": 52713720, + "step": 90855 + }, + { + "epoch": 13.532916294310397, + "grad_norm": 0.022992702201008797, + "learning_rate": 1.4305138000647367e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52716408, + "step": 90860 + }, + { + "epoch": 13.533661006851355, + "grad_norm": 0.0038223019801080227, + "learning_rate": 1.4302201016820849e-05, + "loss": 0.0, + "num_input_tokens_seen": 52719032, + "step": 90865 + }, + { + "epoch": 13.534405719392314, + "grad_norm": 0.00382016459479928, + "learning_rate": 1.4299264213722762e-05, + "loss": 0.1097, + "num_input_tokens_seen": 52722104, + "step": 90870 + }, + { + "epoch": 13.535150431933273, + "grad_norm": 2.7417130695539527e-05, + "learning_rate": 1.4296327591402742e-05, + "loss": 0.0, + "num_input_tokens_seen": 52724792, + "step": 90875 + }, + { + "epoch": 13.535895144474233, + "grad_norm": 0.025202861055731773, + "learning_rate": 1.4293391149910384e-05, + "loss": 0.3533, + "num_input_tokens_seen": 52727608, + "step": 90880 + }, + { + "epoch": 13.536639857015192, + "grad_norm": 0.0005959072150290012, + "learning_rate": 1.4290454889295296e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52730488, + "step": 90885 + }, + { + "epoch": 13.537384569556151, + "grad_norm": 0.00020273705013096333, + "learning_rate": 1.4287518809607097e-05, + "loss": 0.0, + "num_input_tokens_seen": 52733048, + "step": 90890 + }, + { + "epoch": 13.53812928209711, + "grad_norm": 0.0029600635170936584, + "learning_rate": 1.428458291089537e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52735928, + "step": 90895 + }, + { + "epoch": 13.53887399463807, + "grad_norm": 0.00015269375580828637, + "learning_rate": 1.4281647193209732e-05, + "loss": 0.0, + "num_input_tokens_seen": 52738552, + "step": 90900 + }, + { + "epoch": 13.539618707179029, + "grad_norm": 0.00046521355397999287, + "learning_rate": 1.4278711656599764e-05, + "loss": 0.0, + "num_input_tokens_seen": 52741528, + "step": 90905 + }, + { + "epoch": 13.540363419719988, + "grad_norm": 0.01849053055047989, + "learning_rate": 1.4275776301115074e-05, + "loss": 0.2263, + "num_input_tokens_seen": 52744280, + "step": 90910 + }, + { + "epoch": 13.541108132260947, + "grad_norm": 0.00014853356697130948, + "learning_rate": 1.4272841126805242e-05, + "loss": 0.0, + "num_input_tokens_seen": 52747160, + "step": 90915 + }, + { + "epoch": 13.541852844801907, + "grad_norm": 3.615639070631005e-05, + "learning_rate": 1.4269906133719863e-05, + "loss": 0.0, + "num_input_tokens_seen": 52749912, + "step": 90920 + }, + { + "epoch": 13.542597557342866, + "grad_norm": 0.08586856722831726, + "learning_rate": 1.4266971321908507e-05, + "loss": 0.1657, + "num_input_tokens_seen": 52753048, + "step": 90925 + }, + { + "epoch": 13.543342269883825, + "grad_norm": 55.53636169433594, + "learning_rate": 1.4264036691420756e-05, + "loss": 0.1472, + "num_input_tokens_seen": 52756056, + "step": 90930 + }, + { + "epoch": 13.544086982424783, + "grad_norm": 0.0009671246516518295, + "learning_rate": 1.42611022423062e-05, + "loss": 0.0, + "num_input_tokens_seen": 52759224, + "step": 90935 + }, + { + "epoch": 13.544831694965744, + "grad_norm": 0.0007376450230367482, + "learning_rate": 1.42581679746144e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52762040, + "step": 90940 + }, + { + "epoch": 13.545576407506703, + "grad_norm": 4.6853874664520845e-05, + "learning_rate": 1.4255233888394947e-05, + "loss": 0.0013, + "num_input_tokens_seen": 52764760, + "step": 90945 + }, + { + "epoch": 13.546321120047661, + "grad_norm": 0.0003653534222394228, + "learning_rate": 1.4252299983697381e-05, + "loss": 0.0813, + "num_input_tokens_seen": 52767576, + "step": 90950 + }, + { + "epoch": 13.54706583258862, + "grad_norm": 1.941025402629748e-05, + "learning_rate": 1.4249366260571299e-05, + "loss": 0.0, + "num_input_tokens_seen": 52770360, + "step": 90955 + }, + { + "epoch": 13.54781054512958, + "grad_norm": 0.00024343925178982317, + "learning_rate": 1.4246432719066244e-05, + "loss": 0.0, + "num_input_tokens_seen": 52773240, + "step": 90960 + }, + { + "epoch": 13.54855525767054, + "grad_norm": 0.00016267190221697092, + "learning_rate": 1.4243499359231771e-05, + "loss": 0.0, + "num_input_tokens_seen": 52776248, + "step": 90965 + }, + { + "epoch": 13.549299970211498, + "grad_norm": 0.0028255449142307043, + "learning_rate": 1.4240566181117451e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52779160, + "step": 90970 + }, + { + "epoch": 13.550044682752457, + "grad_norm": 0.0011853785254061222, + "learning_rate": 1.4237633184772822e-05, + "loss": 0.0, + "num_input_tokens_seen": 52781912, + "step": 90975 + }, + { + "epoch": 13.550789395293418, + "grad_norm": 0.11468016356229782, + "learning_rate": 1.423470037024745e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52784952, + "step": 90980 + }, + { + "epoch": 13.551534107834376, + "grad_norm": 0.00021933752577751875, + "learning_rate": 1.423176773759088e-05, + "loss": 0.0, + "num_input_tokens_seen": 52787832, + "step": 90985 + }, + { + "epoch": 13.552278820375335, + "grad_norm": 0.003455772064626217, + "learning_rate": 1.4228835286852643e-05, + "loss": 0.0, + "num_input_tokens_seen": 52791000, + "step": 90990 + }, + { + "epoch": 13.553023532916294, + "grad_norm": 0.0014355535386130214, + "learning_rate": 1.4225903018082278e-05, + "loss": 0.0, + "num_input_tokens_seen": 52794040, + "step": 90995 + }, + { + "epoch": 13.553768245457253, + "grad_norm": 2.227095365524292, + "learning_rate": 1.4222970931329343e-05, + "loss": 0.0007, + "num_input_tokens_seen": 52796952, + "step": 91000 + }, + { + "epoch": 13.554512957998213, + "grad_norm": 9.796835365705192e-05, + "learning_rate": 1.4220039026643361e-05, + "loss": 0.2281, + "num_input_tokens_seen": 52800024, + "step": 91005 + }, + { + "epoch": 13.555257670539172, + "grad_norm": 0.002169468207284808, + "learning_rate": 1.4217107304073851e-05, + "loss": 0.0, + "num_input_tokens_seen": 52803096, + "step": 91010 + }, + { + "epoch": 13.55600238308013, + "grad_norm": 0.0017043373081833124, + "learning_rate": 1.4214175763670365e-05, + "loss": 0.1258, + "num_input_tokens_seen": 52806040, + "step": 91015 + }, + { + "epoch": 13.556747095621091, + "grad_norm": 0.001091773621737957, + "learning_rate": 1.4211244405482408e-05, + "loss": 0.0, + "num_input_tokens_seen": 52808888, + "step": 91020 + }, + { + "epoch": 13.55749180816205, + "grad_norm": 1.0826724974322133e-05, + "learning_rate": 1.420831322955952e-05, + "loss": 0.0, + "num_input_tokens_seen": 52811640, + "step": 91025 + }, + { + "epoch": 13.558236520703009, + "grad_norm": 0.00045935469097457826, + "learning_rate": 1.4205382235951204e-05, + "loss": 0.1162, + "num_input_tokens_seen": 52814840, + "step": 91030 + }, + { + "epoch": 13.558981233243967, + "grad_norm": 0.0008270899415947497, + "learning_rate": 1.4202451424706991e-05, + "loss": 0.0, + "num_input_tokens_seen": 52817944, + "step": 91035 + }, + { + "epoch": 13.559725945784926, + "grad_norm": 0.0012576787266880274, + "learning_rate": 1.4199520795876387e-05, + "loss": 0.0013, + "num_input_tokens_seen": 52820696, + "step": 91040 + }, + { + "epoch": 13.560470658325887, + "grad_norm": 0.01577511429786682, + "learning_rate": 1.4196590349508896e-05, + "loss": 0.0, + "num_input_tokens_seen": 52823544, + "step": 91045 + }, + { + "epoch": 13.561215370866845, + "grad_norm": 0.00018959809676744044, + "learning_rate": 1.4193660085654037e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52826392, + "step": 91050 + }, + { + "epoch": 13.561960083407804, + "grad_norm": 63.981849670410156, + "learning_rate": 1.419073000436131e-05, + "loss": 0.0079, + "num_input_tokens_seen": 52829112, + "step": 91055 + }, + { + "epoch": 13.562704795948763, + "grad_norm": 3.5416273021837696e-05, + "learning_rate": 1.4187800105680213e-05, + "loss": 0.1439, + "num_input_tokens_seen": 52831928, + "step": 91060 + }, + { + "epoch": 13.563449508489724, + "grad_norm": 0.0011341121280565858, + "learning_rate": 1.4184870389660235e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52834680, + "step": 91065 + }, + { + "epoch": 13.564194221030682, + "grad_norm": 0.09803842008113861, + "learning_rate": 1.4181940856350889e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52837624, + "step": 91070 + }, + { + "epoch": 13.564938933571641, + "grad_norm": 0.0017149976920336485, + "learning_rate": 1.4179011505801648e-05, + "loss": 0.0, + "num_input_tokens_seen": 52840312, + "step": 91075 + }, + { + "epoch": 13.5656836461126, + "grad_norm": 0.017112061381340027, + "learning_rate": 1.4176082338062019e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52843128, + "step": 91080 + }, + { + "epoch": 13.56642835865356, + "grad_norm": 0.00491404440253973, + "learning_rate": 1.4173153353181477e-05, + "loss": 0.0002, + "num_input_tokens_seen": 52845592, + "step": 91085 + }, + { + "epoch": 13.567173071194519, + "grad_norm": 0.007027660962194204, + "learning_rate": 1.4170224551209493e-05, + "loss": 0.0, + "num_input_tokens_seen": 52848440, + "step": 91090 + }, + { + "epoch": 13.567917783735478, + "grad_norm": 0.02336389385163784, + "learning_rate": 1.4167295932195573e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52851448, + "step": 91095 + }, + { + "epoch": 13.568662496276437, + "grad_norm": 0.009122130461037159, + "learning_rate": 1.4164367496189169e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52854328, + "step": 91100 + }, + { + "epoch": 13.569407208817397, + "grad_norm": 0.14437828958034515, + "learning_rate": 1.4161439243239768e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52857240, + "step": 91105 + }, + { + "epoch": 13.570151921358356, + "grad_norm": 0.023835081607103348, + "learning_rate": 1.4158511173396838e-05, + "loss": 0.0005, + "num_input_tokens_seen": 52860120, + "step": 91110 + }, + { + "epoch": 13.570896633899315, + "grad_norm": 0.0008377747726626694, + "learning_rate": 1.4155583286709833e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52863192, + "step": 91115 + }, + { + "epoch": 13.571641346440273, + "grad_norm": 0.00554572744295001, + "learning_rate": 1.4152655583228235e-05, + "loss": 0.0017, + "num_input_tokens_seen": 52866296, + "step": 91120 + }, + { + "epoch": 13.572386058981234, + "grad_norm": 0.08747879415750504, + "learning_rate": 1.41497280630015e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52869144, + "step": 91125 + }, + { + "epoch": 13.573130771522193, + "grad_norm": 0.0003619859926402569, + "learning_rate": 1.414680072607908e-05, + "loss": 0.0, + "num_input_tokens_seen": 52872216, + "step": 91130 + }, + { + "epoch": 13.573875484063151, + "grad_norm": 0.00013492388825397938, + "learning_rate": 1.414387357251042e-05, + "loss": 0.0, + "num_input_tokens_seen": 52874872, + "step": 91135 + }, + { + "epoch": 13.57462019660411, + "grad_norm": 0.004613145720213652, + "learning_rate": 1.4140946602344993e-05, + "loss": 0.0, + "num_input_tokens_seen": 52877976, + "step": 91140 + }, + { + "epoch": 13.57536490914507, + "grad_norm": 0.0025837826542556286, + "learning_rate": 1.4138019815632226e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52880632, + "step": 91145 + }, + { + "epoch": 13.57610962168603, + "grad_norm": 0.0005754658486694098, + "learning_rate": 1.4135093212421584e-05, + "loss": 0.0, + "num_input_tokens_seen": 52883608, + "step": 91150 + }, + { + "epoch": 13.576854334226988, + "grad_norm": 0.00014490146713797003, + "learning_rate": 1.4132166792762491e-05, + "loss": 0.0, + "num_input_tokens_seen": 52886616, + "step": 91155 + }, + { + "epoch": 13.577599046767947, + "grad_norm": 0.005145468283444643, + "learning_rate": 1.4129240556704403e-05, + "loss": 0.0, + "num_input_tokens_seen": 52889592, + "step": 91160 + }, + { + "epoch": 13.578343759308908, + "grad_norm": 0.00462760217487812, + "learning_rate": 1.4126314504296751e-05, + "loss": 0.0, + "num_input_tokens_seen": 52892536, + "step": 91165 + }, + { + "epoch": 13.579088471849866, + "grad_norm": 0.00010618253872962669, + "learning_rate": 1.412338863558895e-05, + "loss": 0.1035, + "num_input_tokens_seen": 52895256, + "step": 91170 + }, + { + "epoch": 13.579833184390825, + "grad_norm": 0.0065814899280667305, + "learning_rate": 1.4120462950630453e-05, + "loss": 0.0, + "num_input_tokens_seen": 52898136, + "step": 91175 + }, + { + "epoch": 13.580577896931784, + "grad_norm": 0.0010137248318642378, + "learning_rate": 1.411753744947068e-05, + "loss": 0.0, + "num_input_tokens_seen": 52901080, + "step": 91180 + }, + { + "epoch": 13.581322609472743, + "grad_norm": 193.33193969726562, + "learning_rate": 1.4114612132159049e-05, + "loss": 0.2076, + "num_input_tokens_seen": 52903928, + "step": 91185 + }, + { + "epoch": 13.582067322013703, + "grad_norm": 0.0006863207672722638, + "learning_rate": 1.4111686998744975e-05, + "loss": 0.1073, + "num_input_tokens_seen": 52906712, + "step": 91190 + }, + { + "epoch": 13.582812034554662, + "grad_norm": 0.0010578364599496126, + "learning_rate": 1.410876204927789e-05, + "loss": 0.0, + "num_input_tokens_seen": 52909496, + "step": 91195 + }, + { + "epoch": 13.58355674709562, + "grad_norm": 0.0007587157306261361, + "learning_rate": 1.4105837283807194e-05, + "loss": 0.0276, + "num_input_tokens_seen": 52912568, + "step": 91200 + }, + { + "epoch": 13.584301459636581, + "grad_norm": 0.001587660051882267, + "learning_rate": 1.410291270238231e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52915192, + "step": 91205 + }, + { + "epoch": 13.58504617217754, + "grad_norm": 0.0023420914076268673, + "learning_rate": 1.4099988305052644e-05, + "loss": 0.0471, + "num_input_tokens_seen": 52917976, + "step": 91210 + }, + { + "epoch": 13.585790884718499, + "grad_norm": 0.0019584984984248877, + "learning_rate": 1.4097064091867587e-05, + "loss": 0.0018, + "num_input_tokens_seen": 52921016, + "step": 91215 + }, + { + "epoch": 13.586535597259457, + "grad_norm": 0.0023033474572002888, + "learning_rate": 1.4094140062876559e-05, + "loss": 0.0, + "num_input_tokens_seen": 52923992, + "step": 91220 + }, + { + "epoch": 13.587280309800416, + "grad_norm": 0.0010139766382053494, + "learning_rate": 1.4091216218128945e-05, + "loss": 0.0, + "num_input_tokens_seen": 52926488, + "step": 91225 + }, + { + "epoch": 13.588025022341377, + "grad_norm": 0.00026572938077151775, + "learning_rate": 1.4088292557674155e-05, + "loss": 0.0004, + "num_input_tokens_seen": 52929176, + "step": 91230 + }, + { + "epoch": 13.588769734882336, + "grad_norm": 0.00010863041097763926, + "learning_rate": 1.4085369081561556e-05, + "loss": 0.0, + "num_input_tokens_seen": 52932152, + "step": 91235 + }, + { + "epoch": 13.589514447423294, + "grad_norm": 8.388939022552222e-05, + "learning_rate": 1.408244578984057e-05, + "loss": 0.0, + "num_input_tokens_seen": 52935096, + "step": 91240 + }, + { + "epoch": 13.590259159964253, + "grad_norm": 6.076314926147461, + "learning_rate": 1.4079522682560563e-05, + "loss": 0.0031, + "num_input_tokens_seen": 52937752, + "step": 91245 + }, + { + "epoch": 13.591003872505214, + "grad_norm": 0.0001262215810129419, + "learning_rate": 1.4076599759770919e-05, + "loss": 0.0648, + "num_input_tokens_seen": 52940440, + "step": 91250 + }, + { + "epoch": 13.591748585046172, + "grad_norm": 0.040867678821086884, + "learning_rate": 1.4073677021521026e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52943480, + "step": 91255 + }, + { + "epoch": 13.592493297587131, + "grad_norm": 3.5849559935741127e-05, + "learning_rate": 1.407075446786024e-05, + "loss": 0.0, + "num_input_tokens_seen": 52946424, + "step": 91260 + }, + { + "epoch": 13.59323801012809, + "grad_norm": 7.731417281320319e-05, + "learning_rate": 1.406783209883796e-05, + "loss": 0.1187, + "num_input_tokens_seen": 52949656, + "step": 91265 + }, + { + "epoch": 13.59398272266905, + "grad_norm": 23.930679321289062, + "learning_rate": 1.4064909914503537e-05, + "loss": 0.0124, + "num_input_tokens_seen": 52952600, + "step": 91270 + }, + { + "epoch": 13.59472743521001, + "grad_norm": 7.758835272397846e-05, + "learning_rate": 1.4061987914906354e-05, + "loss": 0.0, + "num_input_tokens_seen": 52955480, + "step": 91275 + }, + { + "epoch": 13.595472147750968, + "grad_norm": 9.597384632797912e-05, + "learning_rate": 1.4059066100095763e-05, + "loss": 0.0, + "num_input_tokens_seen": 52958136, + "step": 91280 + }, + { + "epoch": 13.596216860291927, + "grad_norm": 0.018769467249512672, + "learning_rate": 1.4056144470121137e-05, + "loss": 0.0, + "num_input_tokens_seen": 52960952, + "step": 91285 + }, + { + "epoch": 13.596961572832887, + "grad_norm": 6.121584738139063e-05, + "learning_rate": 1.405322302503183e-05, + "loss": 0.0, + "num_input_tokens_seen": 52964088, + "step": 91290 + }, + { + "epoch": 13.597706285373846, + "grad_norm": 0.006461825221776962, + "learning_rate": 1.4050301764877183e-05, + "loss": 0.0, + "num_input_tokens_seen": 52967096, + "step": 91295 + }, + { + "epoch": 13.598450997914805, + "grad_norm": 0.024998050183057785, + "learning_rate": 1.4047380689706568e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52970104, + "step": 91300 + }, + { + "epoch": 13.599195710455763, + "grad_norm": 0.000877613085322082, + "learning_rate": 1.4044459799569316e-05, + "loss": 0.0, + "num_input_tokens_seen": 52972824, + "step": 91305 + }, + { + "epoch": 13.599940422996724, + "grad_norm": 28.597848892211914, + "learning_rate": 1.4041539094514788e-05, + "loss": 0.1222, + "num_input_tokens_seen": 52975704, + "step": 91310 + }, + { + "epoch": 13.600685135537683, + "grad_norm": 9.90445987554267e-05, + "learning_rate": 1.4038618574592322e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52978424, + "step": 91315 + }, + { + "epoch": 13.601429848078642, + "grad_norm": 8.122484723571688e-05, + "learning_rate": 1.4035698239851253e-05, + "loss": 0.0, + "num_input_tokens_seen": 52981208, + "step": 91320 + }, + { + "epoch": 13.6021745606196, + "grad_norm": 0.15183253586292267, + "learning_rate": 1.403277809034092e-05, + "loss": 0.0003, + "num_input_tokens_seen": 52984248, + "step": 91325 + }, + { + "epoch": 13.60291927316056, + "grad_norm": 0.00018488039495423436, + "learning_rate": 1.4029858126110645e-05, + "loss": 0.0762, + "num_input_tokens_seen": 52986808, + "step": 91330 + }, + { + "epoch": 13.60366398570152, + "grad_norm": 0.0005302113713696599, + "learning_rate": 1.4026938347209778e-05, + "loss": 0.0, + "num_input_tokens_seen": 52990040, + "step": 91335 + }, + { + "epoch": 13.604408698242478, + "grad_norm": 0.0006571800331585109, + "learning_rate": 1.4024018753687624e-05, + "loss": 0.0001, + "num_input_tokens_seen": 52992664, + "step": 91340 + }, + { + "epoch": 13.605153410783437, + "grad_norm": 0.0036583172623068094, + "learning_rate": 1.4021099345593524e-05, + "loss": 0.0007, + "num_input_tokens_seen": 52996536, + "step": 91345 + }, + { + "epoch": 13.605898123324398, + "grad_norm": 0.0005485125002451241, + "learning_rate": 1.4018180122976788e-05, + "loss": 0.0, + "num_input_tokens_seen": 52999320, + "step": 91350 + }, + { + "epoch": 13.606642835865356, + "grad_norm": 1.8439279301674105e-05, + "learning_rate": 1.4015261085886743e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53002200, + "step": 91355 + }, + { + "epoch": 13.607387548406315, + "grad_norm": 0.017105894163250923, + "learning_rate": 1.4012342234372688e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53005240, + "step": 91360 + }, + { + "epoch": 13.608132260947274, + "grad_norm": 0.0006202246295288205, + "learning_rate": 1.4009423568483957e-05, + "loss": 0.0, + "num_input_tokens_seen": 53008056, + "step": 91365 + }, + { + "epoch": 13.608876973488233, + "grad_norm": 0.008501970209181309, + "learning_rate": 1.4006505088269841e-05, + "loss": 0.0436, + "num_input_tokens_seen": 53011064, + "step": 91370 + }, + { + "epoch": 13.609621686029193, + "grad_norm": 0.009096605703234673, + "learning_rate": 1.4003586793779641e-05, + "loss": 0.0376, + "num_input_tokens_seen": 53013848, + "step": 91375 + }, + { + "epoch": 13.610366398570152, + "grad_norm": 0.004781048279255629, + "learning_rate": 1.4000668685062674e-05, + "loss": 0.033, + "num_input_tokens_seen": 53016888, + "step": 91380 + }, + { + "epoch": 13.61111111111111, + "grad_norm": 0.0005208950024098158, + "learning_rate": 1.3997750762168232e-05, + "loss": 0.1159, + "num_input_tokens_seen": 53019928, + "step": 91385 + }, + { + "epoch": 13.61185582365207, + "grad_norm": 0.0032259065192192793, + "learning_rate": 1.3994833025145607e-05, + "loss": 0.0, + "num_input_tokens_seen": 53023000, + "step": 91390 + }, + { + "epoch": 13.61260053619303, + "grad_norm": 0.0004600923857651651, + "learning_rate": 1.3991915474044081e-05, + "loss": 0.2531, + "num_input_tokens_seen": 53025848, + "step": 91395 + }, + { + "epoch": 13.613345248733989, + "grad_norm": 3.0065173632465303e-05, + "learning_rate": 1.398899810891297e-05, + "loss": 0.0, + "num_input_tokens_seen": 53028504, + "step": 91400 + }, + { + "epoch": 13.614089961274948, + "grad_norm": 0.00014509014727082103, + "learning_rate": 1.3986080929801543e-05, + "loss": 0.0, + "num_input_tokens_seen": 53031192, + "step": 91405 + }, + { + "epoch": 13.614834673815906, + "grad_norm": 5.199239603825845e-05, + "learning_rate": 1.3983163936759072e-05, + "loss": 0.0012, + "num_input_tokens_seen": 53034008, + "step": 91410 + }, + { + "epoch": 13.615579386356867, + "grad_norm": 0.00032816475140862167, + "learning_rate": 1.398024712983486e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53036984, + "step": 91415 + }, + { + "epoch": 13.616324098897826, + "grad_norm": 0.0006955708377063274, + "learning_rate": 1.3977330509078165e-05, + "loss": 0.0, + "num_input_tokens_seen": 53039800, + "step": 91420 + }, + { + "epoch": 13.617068811438784, + "grad_norm": 0.0016700542764738202, + "learning_rate": 1.3974414074538277e-05, + "loss": 0.0, + "num_input_tokens_seen": 53042488, + "step": 91425 + }, + { + "epoch": 13.617813523979743, + "grad_norm": 0.0009598437463864684, + "learning_rate": 1.3971497826264448e-05, + "loss": 0.0, + "num_input_tokens_seen": 53045144, + "step": 91430 + }, + { + "epoch": 13.618558236520704, + "grad_norm": 0.0019623725675046444, + "learning_rate": 1.3968581764305965e-05, + "loss": 0.1509, + "num_input_tokens_seen": 53048088, + "step": 91435 + }, + { + "epoch": 13.619302949061662, + "grad_norm": 0.0008824865217320621, + "learning_rate": 1.396566588871208e-05, + "loss": 0.0, + "num_input_tokens_seen": 53050936, + "step": 91440 + }, + { + "epoch": 13.620047661602621, + "grad_norm": 9.318449883721769e-05, + "learning_rate": 1.3962750199532042e-05, + "loss": 0.0, + "num_input_tokens_seen": 53053688, + "step": 91445 + }, + { + "epoch": 13.62079237414358, + "grad_norm": 3.404245217097923e-05, + "learning_rate": 1.3959834696815138e-05, + "loss": 0.0, + "num_input_tokens_seen": 53056632, + "step": 91450 + }, + { + "epoch": 13.62153708668454, + "grad_norm": 0.0004834931460209191, + "learning_rate": 1.39569193806106e-05, + "loss": 0.0, + "num_input_tokens_seen": 53059416, + "step": 91455 + }, + { + "epoch": 13.6222817992255, + "grad_norm": 0.007035439368337393, + "learning_rate": 1.395400425096769e-05, + "loss": 0.0452, + "num_input_tokens_seen": 53062200, + "step": 91460 + }, + { + "epoch": 13.623026511766458, + "grad_norm": 0.00015979656018316746, + "learning_rate": 1.3951089307935639e-05, + "loss": 0.1378, + "num_input_tokens_seen": 53065048, + "step": 91465 + }, + { + "epoch": 13.623771224307417, + "grad_norm": 0.01894228719174862, + "learning_rate": 1.3948174551563713e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53067896, + "step": 91470 + }, + { + "epoch": 13.624515936848377, + "grad_norm": 0.003211847972124815, + "learning_rate": 1.3945259981901138e-05, + "loss": 0.0, + "num_input_tokens_seen": 53070776, + "step": 91475 + }, + { + "epoch": 13.625260649389336, + "grad_norm": 0.0015365902800112963, + "learning_rate": 1.394234559899717e-05, + "loss": 0.0542, + "num_input_tokens_seen": 53073592, + "step": 91480 + }, + { + "epoch": 13.626005361930295, + "grad_norm": 0.0012158629251644015, + "learning_rate": 1.3939431402901034e-05, + "loss": 0.0682, + "num_input_tokens_seen": 53076856, + "step": 91485 + }, + { + "epoch": 13.626750074471254, + "grad_norm": 5.418814659118652, + "learning_rate": 1.3936517393661955e-05, + "loss": 0.0568, + "num_input_tokens_seen": 53080184, + "step": 91490 + }, + { + "epoch": 13.627494787012214, + "grad_norm": 0.006441562436521053, + "learning_rate": 1.393360357132918e-05, + "loss": 0.0, + "num_input_tokens_seen": 53082872, + "step": 91495 + }, + { + "epoch": 13.628239499553173, + "grad_norm": 0.00010271860082866624, + "learning_rate": 1.3930689935951913e-05, + "loss": 0.0, + "num_input_tokens_seen": 53085688, + "step": 91500 + }, + { + "epoch": 13.628984212094132, + "grad_norm": 0.0007236768724396825, + "learning_rate": 1.3927776487579397e-05, + "loss": 0.0, + "num_input_tokens_seen": 53088600, + "step": 91505 + }, + { + "epoch": 13.62972892463509, + "grad_norm": 0.00017060834215953946, + "learning_rate": 1.3924863226260849e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53091384, + "step": 91510 + }, + { + "epoch": 13.63047363717605, + "grad_norm": 0.0009611294372007251, + "learning_rate": 1.3921950152045477e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53094360, + "step": 91515 + }, + { + "epoch": 13.63121834971701, + "grad_norm": 2.7338874133420177e-05, + "learning_rate": 1.391903726498249e-05, + "loss": 0.0, + "num_input_tokens_seen": 53097208, + "step": 91520 + }, + { + "epoch": 13.631963062257968, + "grad_norm": 0.0008671397226862609, + "learning_rate": 1.3916124565121114e-05, + "loss": 0.0, + "num_input_tokens_seen": 53100280, + "step": 91525 + }, + { + "epoch": 13.632707774798927, + "grad_norm": 0.001232488895766437, + "learning_rate": 1.391321205251055e-05, + "loss": 0.0191, + "num_input_tokens_seen": 53103224, + "step": 91530 + }, + { + "epoch": 13.633452487339888, + "grad_norm": 0.001142822322435677, + "learning_rate": 1.3910299727199991e-05, + "loss": 0.0013, + "num_input_tokens_seen": 53105944, + "step": 91535 + }, + { + "epoch": 13.634197199880846, + "grad_norm": 0.00145053886808455, + "learning_rate": 1.3907387589238657e-05, + "loss": 0.0, + "num_input_tokens_seen": 53108920, + "step": 91540 + }, + { + "epoch": 13.634941912421805, + "grad_norm": 0.010527299717068672, + "learning_rate": 1.3904475638675724e-05, + "loss": 0.0, + "num_input_tokens_seen": 53111704, + "step": 91545 + }, + { + "epoch": 13.635686624962764, + "grad_norm": 0.00018611017731018364, + "learning_rate": 1.3901563875560408e-05, + "loss": 0.0, + "num_input_tokens_seen": 53114552, + "step": 91550 + }, + { + "epoch": 13.636431337503723, + "grad_norm": 0.015494193881750107, + "learning_rate": 1.3898652299941883e-05, + "loss": 0.1501, + "num_input_tokens_seen": 53117496, + "step": 91555 + }, + { + "epoch": 13.637176050044683, + "grad_norm": 0.00017990701599046588, + "learning_rate": 1.3895740911869351e-05, + "loss": 0.0, + "num_input_tokens_seen": 53120344, + "step": 91560 + }, + { + "epoch": 13.637920762585642, + "grad_norm": 0.002553667174652219, + "learning_rate": 1.389282971139199e-05, + "loss": 0.0, + "num_input_tokens_seen": 53123096, + "step": 91565 + }, + { + "epoch": 13.6386654751266, + "grad_norm": 1.324015738646267e-05, + "learning_rate": 1.3889918698558976e-05, + "loss": 0.0, + "num_input_tokens_seen": 53126040, + "step": 91570 + }, + { + "epoch": 13.63941018766756, + "grad_norm": 0.0003004714089911431, + "learning_rate": 1.3887007873419503e-05, + "loss": 0.1036, + "num_input_tokens_seen": 53128920, + "step": 91575 + }, + { + "epoch": 13.64015490020852, + "grad_norm": 8.888162847142667e-05, + "learning_rate": 1.3884097236022736e-05, + "loss": 0.0228, + "num_input_tokens_seen": 53131736, + "step": 91580 + }, + { + "epoch": 13.640899612749479, + "grad_norm": 8.911194890970364e-05, + "learning_rate": 1.3881186786417848e-05, + "loss": 0.0423, + "num_input_tokens_seen": 53134552, + "step": 91585 + }, + { + "epoch": 13.641644325290438, + "grad_norm": 14.252079010009766, + "learning_rate": 1.3878276524654e-05, + "loss": 0.0508, + "num_input_tokens_seen": 53137592, + "step": 91590 + }, + { + "epoch": 13.642389037831396, + "grad_norm": 6.218580529093742e-05, + "learning_rate": 1.3875366450780375e-05, + "loss": 0.0, + "num_input_tokens_seen": 53140408, + "step": 91595 + }, + { + "epoch": 13.643133750372357, + "grad_norm": 0.0001243875885847956, + "learning_rate": 1.387245656484612e-05, + "loss": 0.0, + "num_input_tokens_seen": 53143288, + "step": 91600 + }, + { + "epoch": 13.643878462913316, + "grad_norm": 0.00022931673447601497, + "learning_rate": 1.3869546866900409e-05, + "loss": 0.0, + "num_input_tokens_seen": 53146296, + "step": 91605 + }, + { + "epoch": 13.644623175454274, + "grad_norm": 3.26181689160876e-05, + "learning_rate": 1.3866637356992393e-05, + "loss": 0.0, + "num_input_tokens_seen": 53149208, + "step": 91610 + }, + { + "epoch": 13.645367887995233, + "grad_norm": 0.0019740420393645763, + "learning_rate": 1.3863728035171214e-05, + "loss": 0.0, + "num_input_tokens_seen": 53152024, + "step": 91615 + }, + { + "epoch": 13.646112600536194, + "grad_norm": 0.002423797966912389, + "learning_rate": 1.386081890148604e-05, + "loss": 0.0, + "num_input_tokens_seen": 53154712, + "step": 91620 + }, + { + "epoch": 13.646857313077152, + "grad_norm": 5.7971792557509616e-05, + "learning_rate": 1.3857909955985999e-05, + "loss": 0.0, + "num_input_tokens_seen": 53157688, + "step": 91625 + }, + { + "epoch": 13.647602025618111, + "grad_norm": 90.88182067871094, + "learning_rate": 1.3855001198720255e-05, + "loss": 0.2313, + "num_input_tokens_seen": 53160472, + "step": 91630 + }, + { + "epoch": 13.64834673815907, + "grad_norm": 0.005852966569364071, + "learning_rate": 1.3852092629737928e-05, + "loss": 0.0537, + "num_input_tokens_seen": 53162968, + "step": 91635 + }, + { + "epoch": 13.64909145070003, + "grad_norm": 0.0002989376662299037, + "learning_rate": 1.3849184249088176e-05, + "loss": 0.0, + "num_input_tokens_seen": 53165656, + "step": 91640 + }, + { + "epoch": 13.64983616324099, + "grad_norm": 0.001568279112689197, + "learning_rate": 1.3846276056820123e-05, + "loss": 0.0, + "num_input_tokens_seen": 53168344, + "step": 91645 + }, + { + "epoch": 13.650580875781948, + "grad_norm": 0.00022049825929570943, + "learning_rate": 1.3843368052982903e-05, + "loss": 0.0, + "num_input_tokens_seen": 53171320, + "step": 91650 + }, + { + "epoch": 13.651325588322907, + "grad_norm": 0.001526777632534504, + "learning_rate": 1.3840460237625635e-05, + "loss": 0.0, + "num_input_tokens_seen": 53174392, + "step": 91655 + }, + { + "epoch": 13.652070300863867, + "grad_norm": 15.105551719665527, + "learning_rate": 1.3837552610797444e-05, + "loss": 0.1719, + "num_input_tokens_seen": 53177336, + "step": 91660 + }, + { + "epoch": 13.652815013404826, + "grad_norm": 0.00010662407294148579, + "learning_rate": 1.3834645172547467e-05, + "loss": 0.0244, + "num_input_tokens_seen": 53180152, + "step": 91665 + }, + { + "epoch": 13.653559725945785, + "grad_norm": 0.02453426830470562, + "learning_rate": 1.3831737922924798e-05, + "loss": 0.0047, + "num_input_tokens_seen": 53183480, + "step": 91670 + }, + { + "epoch": 13.654304438486744, + "grad_norm": 0.0014757837634533644, + "learning_rate": 1.3828830861978579e-05, + "loss": 0.1626, + "num_input_tokens_seen": 53186296, + "step": 91675 + }, + { + "epoch": 13.655049151027704, + "grad_norm": 0.0007263869629241526, + "learning_rate": 1.3825923989757896e-05, + "loss": 0.0, + "num_input_tokens_seen": 53189080, + "step": 91680 + }, + { + "epoch": 13.655793863568663, + "grad_norm": 0.0020028012804687023, + "learning_rate": 1.382301730631188e-05, + "loss": 0.0, + "num_input_tokens_seen": 53192312, + "step": 91685 + }, + { + "epoch": 13.656538576109622, + "grad_norm": 0.0009128133533522487, + "learning_rate": 1.382011081168963e-05, + "loss": 0.0079, + "num_input_tokens_seen": 53195384, + "step": 91690 + }, + { + "epoch": 13.65728328865058, + "grad_norm": 0.006349336821585894, + "learning_rate": 1.3817204505940235e-05, + "loss": 0.0829, + "num_input_tokens_seen": 53198552, + "step": 91695 + }, + { + "epoch": 13.65802800119154, + "grad_norm": 0.0002501068520359695, + "learning_rate": 1.3814298389112811e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53201304, + "step": 91700 + }, + { + "epoch": 13.6587727137325, + "grad_norm": 1.4425787412619684e-05, + "learning_rate": 1.381139246125644e-05, + "loss": 0.0282, + "num_input_tokens_seen": 53204184, + "step": 91705 + }, + { + "epoch": 13.659517426273458, + "grad_norm": 0.0014422289095818996, + "learning_rate": 1.3808486722420233e-05, + "loss": 0.0, + "num_input_tokens_seen": 53207064, + "step": 91710 + }, + { + "epoch": 13.660262138814417, + "grad_norm": 168.91358947753906, + "learning_rate": 1.3805581172653265e-05, + "loss": 0.2126, + "num_input_tokens_seen": 53210136, + "step": 91715 + }, + { + "epoch": 13.661006851355378, + "grad_norm": 0.019949063658714294, + "learning_rate": 1.3802675812004626e-05, + "loss": 0.0, + "num_input_tokens_seen": 53213144, + "step": 91720 + }, + { + "epoch": 13.661751563896336, + "grad_norm": 1.909971069835592e-05, + "learning_rate": 1.3799770640523398e-05, + "loss": 0.0, + "num_input_tokens_seen": 53216024, + "step": 91725 + }, + { + "epoch": 13.662496276437295, + "grad_norm": 0.000568079762160778, + "learning_rate": 1.3796865658258654e-05, + "loss": 0.0, + "num_input_tokens_seen": 53219096, + "step": 91730 + }, + { + "epoch": 13.663240988978254, + "grad_norm": 0.0034149731509387493, + "learning_rate": 1.3793960865259486e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53221976, + "step": 91735 + }, + { + "epoch": 13.663985701519213, + "grad_norm": 0.0009598804754205048, + "learning_rate": 1.3791056261574952e-05, + "loss": 0.0174, + "num_input_tokens_seen": 53224760, + "step": 91740 + }, + { + "epoch": 13.664730414060173, + "grad_norm": 0.00028842760366387665, + "learning_rate": 1.3788151847254139e-05, + "loss": 0.2256, + "num_input_tokens_seen": 53227704, + "step": 91745 + }, + { + "epoch": 13.665475126601132, + "grad_norm": 0.002830137964338064, + "learning_rate": 1.3785247622346098e-05, + "loss": 0.2001, + "num_input_tokens_seen": 53230520, + "step": 91750 + }, + { + "epoch": 13.66621983914209, + "grad_norm": 0.17748421430587769, + "learning_rate": 1.3782343586899906e-05, + "loss": 0.0003, + "num_input_tokens_seen": 53233688, + "step": 91755 + }, + { + "epoch": 13.66696455168305, + "grad_norm": 0.0013111504958942533, + "learning_rate": 1.377943974096461e-05, + "loss": 0.0, + "num_input_tokens_seen": 53236440, + "step": 91760 + }, + { + "epoch": 13.66770926422401, + "grad_norm": 0.006597112398594618, + "learning_rate": 1.3776536084589287e-05, + "loss": 0.0, + "num_input_tokens_seen": 53239352, + "step": 91765 + }, + { + "epoch": 13.668453976764969, + "grad_norm": 0.02123972401022911, + "learning_rate": 1.377363261782298e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53242200, + "step": 91770 + }, + { + "epoch": 13.669198689305928, + "grad_norm": 4.268431439413689e-05, + "learning_rate": 1.3770729340714728e-05, + "loss": 0.0, + "num_input_tokens_seen": 53245080, + "step": 91775 + }, + { + "epoch": 13.669943401846886, + "grad_norm": 0.00037806446198374033, + "learning_rate": 1.3767826253313599e-05, + "loss": 0.0, + "num_input_tokens_seen": 53248248, + "step": 91780 + }, + { + "epoch": 13.670688114387847, + "grad_norm": 0.32261258363723755, + "learning_rate": 1.376492335566863e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53251256, + "step": 91785 + }, + { + "epoch": 13.671432826928806, + "grad_norm": 0.02751258760690689, + "learning_rate": 1.3762020647828866e-05, + "loss": 0.002, + "num_input_tokens_seen": 53254136, + "step": 91790 + }, + { + "epoch": 13.672177539469764, + "grad_norm": 0.00022704785806126893, + "learning_rate": 1.375911812984333e-05, + "loss": 0.0, + "num_input_tokens_seen": 53256888, + "step": 91795 + }, + { + "epoch": 13.672922252010723, + "grad_norm": 0.002835338469594717, + "learning_rate": 1.3756215801761074e-05, + "loss": 0.0, + "num_input_tokens_seen": 53259928, + "step": 91800 + }, + { + "epoch": 13.673666964551684, + "grad_norm": 0.0018225116655230522, + "learning_rate": 1.3753313663631119e-05, + "loss": 0.0, + "num_input_tokens_seen": 53262648, + "step": 91805 + }, + { + "epoch": 13.674411677092642, + "grad_norm": 0.0040304334834218025, + "learning_rate": 1.375041171550251e-05, + "loss": 0.0, + "num_input_tokens_seen": 53265496, + "step": 91810 + }, + { + "epoch": 13.675156389633601, + "grad_norm": 0.0015420292038470507, + "learning_rate": 1.3747509957424259e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53268408, + "step": 91815 + }, + { + "epoch": 13.67590110217456, + "grad_norm": 0.006008269265294075, + "learning_rate": 1.3744608389445379e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53271224, + "step": 91820 + }, + { + "epoch": 13.67664581471552, + "grad_norm": 0.0004649545007850975, + "learning_rate": 1.3741707011614912e-05, + "loss": 0.0, + "num_input_tokens_seen": 53273944, + "step": 91825 + }, + { + "epoch": 13.67739052725648, + "grad_norm": 0.1174183264374733, + "learning_rate": 1.3738805823981857e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53276920, + "step": 91830 + }, + { + "epoch": 13.678135239797438, + "grad_norm": 1.5151927073020488e-05, + "learning_rate": 1.3735904826595236e-05, + "loss": 0.0, + "num_input_tokens_seen": 53279736, + "step": 91835 + }, + { + "epoch": 13.678879952338397, + "grad_norm": 15.34936809539795, + "learning_rate": 1.3733004019504058e-05, + "loss": 0.0192, + "num_input_tokens_seen": 53282744, + "step": 91840 + }, + { + "epoch": 13.679624664879357, + "grad_norm": 0.004475702997297049, + "learning_rate": 1.3730103402757327e-05, + "loss": 0.1882, + "num_input_tokens_seen": 53285560, + "step": 91845 + }, + { + "epoch": 13.680369377420316, + "grad_norm": 0.0006594198639504611, + "learning_rate": 1.3727202976404033e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53288312, + "step": 91850 + }, + { + "epoch": 13.681114089961275, + "grad_norm": 0.003793732263147831, + "learning_rate": 1.3724302740493198e-05, + "loss": 0.0, + "num_input_tokens_seen": 53291288, + "step": 91855 + }, + { + "epoch": 13.681858802502234, + "grad_norm": 0.00025816785637289286, + "learning_rate": 1.372140269507381e-05, + "loss": 0.0, + "num_input_tokens_seen": 53294200, + "step": 91860 + }, + { + "epoch": 13.682603515043194, + "grad_norm": 0.023192601278424263, + "learning_rate": 1.3718502840194847e-05, + "loss": 0.0, + "num_input_tokens_seen": 53296888, + "step": 91865 + }, + { + "epoch": 13.683348227584153, + "grad_norm": 0.01915374957025051, + "learning_rate": 1.3715603175905322e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53299768, + "step": 91870 + }, + { + "epoch": 13.684092940125112, + "grad_norm": 5.544930536416359e-05, + "learning_rate": 1.3712703702254203e-05, + "loss": 0.0, + "num_input_tokens_seen": 53302424, + "step": 91875 + }, + { + "epoch": 13.68483765266607, + "grad_norm": 0.0001260907738469541, + "learning_rate": 1.3709804419290496e-05, + "loss": 0.2022, + "num_input_tokens_seen": 53305336, + "step": 91880 + }, + { + "epoch": 13.68558236520703, + "grad_norm": 0.0021568567026406527, + "learning_rate": 1.3706905327063158e-05, + "loss": 0.0, + "num_input_tokens_seen": 53308152, + "step": 91885 + }, + { + "epoch": 13.68632707774799, + "grad_norm": 0.0010860370239242911, + "learning_rate": 1.3704006425621185e-05, + "loss": 0.0426, + "num_input_tokens_seen": 53310968, + "step": 91890 + }, + { + "epoch": 13.687071790288948, + "grad_norm": 0.13845930993556976, + "learning_rate": 1.3701107715013542e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53314008, + "step": 91895 + }, + { + "epoch": 13.687816502829907, + "grad_norm": 0.00016905709344428033, + "learning_rate": 1.369820919528919e-05, + "loss": 0.0913, + "num_input_tokens_seen": 53317016, + "step": 91900 + }, + { + "epoch": 13.688561215370868, + "grad_norm": 0.00032360205659642816, + "learning_rate": 1.3695310866497119e-05, + "loss": 0.0, + "num_input_tokens_seen": 53320344, + "step": 91905 + }, + { + "epoch": 13.689305927911827, + "grad_norm": 0.0008767535327933729, + "learning_rate": 1.3692412728686282e-05, + "loss": 0.0, + "num_input_tokens_seen": 53323160, + "step": 91910 + }, + { + "epoch": 13.690050640452785, + "grad_norm": 0.01808851584792137, + "learning_rate": 1.3689514781905638e-05, + "loss": 0.0, + "num_input_tokens_seen": 53325688, + "step": 91915 + }, + { + "epoch": 13.690795352993744, + "grad_norm": 0.005317701026797295, + "learning_rate": 1.3686617026204138e-05, + "loss": 0.0, + "num_input_tokens_seen": 53328152, + "step": 91920 + }, + { + "epoch": 13.691540065534703, + "grad_norm": 0.0020851222798228264, + "learning_rate": 1.368371946163075e-05, + "loss": 0.0, + "num_input_tokens_seen": 53331160, + "step": 91925 + }, + { + "epoch": 13.692284778075663, + "grad_norm": 0.005704440176486969, + "learning_rate": 1.3680822088234427e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53334200, + "step": 91930 + }, + { + "epoch": 13.693029490616622, + "grad_norm": 0.04264254868030548, + "learning_rate": 1.3677924906064097e-05, + "loss": 0.0047, + "num_input_tokens_seen": 53337080, + "step": 91935 + }, + { + "epoch": 13.69377420315758, + "grad_norm": 0.0007007938693277538, + "learning_rate": 1.3675027915168729e-05, + "loss": 0.0, + "num_input_tokens_seen": 53339736, + "step": 91940 + }, + { + "epoch": 13.69451891569854, + "grad_norm": 5.916356894886121e-05, + "learning_rate": 1.3672131115597241e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53342424, + "step": 91945 + }, + { + "epoch": 13.6952636282395, + "grad_norm": 0.003956676460802555, + "learning_rate": 1.3669234507398601e-05, + "loss": 0.0913, + "num_input_tokens_seen": 53345368, + "step": 91950 + }, + { + "epoch": 13.696008340780459, + "grad_norm": 0.008022344671189785, + "learning_rate": 1.3666338090621716e-05, + "loss": 0.0, + "num_input_tokens_seen": 53348344, + "step": 91955 + }, + { + "epoch": 13.696753053321418, + "grad_norm": 0.05547786504030228, + "learning_rate": 1.3663441865315538e-05, + "loss": 0.0009, + "num_input_tokens_seen": 53351256, + "step": 91960 + }, + { + "epoch": 13.697497765862376, + "grad_norm": 1.7943206330528483e-05, + "learning_rate": 1.3660545831528975e-05, + "loss": 0.0244, + "num_input_tokens_seen": 53354360, + "step": 91965 + }, + { + "epoch": 13.698242478403337, + "grad_norm": 0.0017130112973973155, + "learning_rate": 1.365764998931098e-05, + "loss": 0.0, + "num_input_tokens_seen": 53357656, + "step": 91970 + }, + { + "epoch": 13.698987190944296, + "grad_norm": 0.00015204146620817482, + "learning_rate": 1.365475433871046e-05, + "loss": 0.1176, + "num_input_tokens_seen": 53360536, + "step": 91975 + }, + { + "epoch": 13.699731903485254, + "grad_norm": 0.009627578780055046, + "learning_rate": 1.3651858879776336e-05, + "loss": 0.0, + "num_input_tokens_seen": 53363256, + "step": 91980 + }, + { + "epoch": 13.700476616026213, + "grad_norm": 0.0062835002318024635, + "learning_rate": 1.3648963612557519e-05, + "loss": 0.0, + "num_input_tokens_seen": 53366264, + "step": 91985 + }, + { + "epoch": 13.701221328567174, + "grad_norm": 0.0022764832247048616, + "learning_rate": 1.3646068537102916e-05, + "loss": 0.0, + "num_input_tokens_seen": 53369048, + "step": 91990 + }, + { + "epoch": 13.701966041108133, + "grad_norm": 0.008189392276108265, + "learning_rate": 1.3643173653461454e-05, + "loss": 0.0, + "num_input_tokens_seen": 53371960, + "step": 91995 + }, + { + "epoch": 13.702710753649091, + "grad_norm": 0.008514860644936562, + "learning_rate": 1.3640278961682023e-05, + "loss": 0.0, + "num_input_tokens_seen": 53374776, + "step": 92000 + }, + { + "epoch": 13.70345546619005, + "grad_norm": 0.1336369514465332, + "learning_rate": 1.3637384461813546e-05, + "loss": 0.0006, + "num_input_tokens_seen": 53377688, + "step": 92005 + }, + { + "epoch": 13.70420017873101, + "grad_norm": 1.8627671003341675, + "learning_rate": 1.3634490153904905e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53380504, + "step": 92010 + }, + { + "epoch": 13.70494489127197, + "grad_norm": 0.005048996303230524, + "learning_rate": 1.3631596038004994e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53383544, + "step": 92015 + }, + { + "epoch": 13.705689603812928, + "grad_norm": 0.008784358389675617, + "learning_rate": 1.3628702114162722e-05, + "loss": 0.0, + "num_input_tokens_seen": 53386136, + "step": 92020 + }, + { + "epoch": 13.706434316353887, + "grad_norm": 0.0020983319263905287, + "learning_rate": 1.3625808382426964e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53389144, + "step": 92025 + }, + { + "epoch": 13.707179028894847, + "grad_norm": 9.871752738952637, + "learning_rate": 1.3622914842846619e-05, + "loss": 0.0541, + "num_input_tokens_seen": 53391960, + "step": 92030 + }, + { + "epoch": 13.707923741435806, + "grad_norm": 0.0040075029246509075, + "learning_rate": 1.3620021495470556e-05, + "loss": 0.0003, + "num_input_tokens_seen": 53394584, + "step": 92035 + }, + { + "epoch": 13.708668453976765, + "grad_norm": 8.048282325034961e-05, + "learning_rate": 1.3617128340347673e-05, + "loss": 0.2036, + "num_input_tokens_seen": 53397848, + "step": 92040 + }, + { + "epoch": 13.709413166517724, + "grad_norm": 0.006359498482197523, + "learning_rate": 1.361423537752684e-05, + "loss": 0.0, + "num_input_tokens_seen": 53400888, + "step": 92045 + }, + { + "epoch": 13.710157879058684, + "grad_norm": 131.9449462890625, + "learning_rate": 1.3611342607056925e-05, + "loss": 0.0173, + "num_input_tokens_seen": 53403736, + "step": 92050 + }, + { + "epoch": 13.710902591599643, + "grad_norm": 0.09415074437856674, + "learning_rate": 1.3608450028986804e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53406712, + "step": 92055 + }, + { + "epoch": 13.711647304140602, + "grad_norm": 0.0009114404092542827, + "learning_rate": 1.3605557643365333e-05, + "loss": 0.0, + "num_input_tokens_seen": 53409464, + "step": 92060 + }, + { + "epoch": 13.71239201668156, + "grad_norm": 0.3735698461532593, + "learning_rate": 1.3602665450241392e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53412312, + "step": 92065 + }, + { + "epoch": 13.71313672922252, + "grad_norm": 0.0003940227907150984, + "learning_rate": 1.3599773449663828e-05, + "loss": 0.0056, + "num_input_tokens_seen": 53415128, + "step": 92070 + }, + { + "epoch": 13.71388144176348, + "grad_norm": 0.00032029792782850564, + "learning_rate": 1.3596881641681513e-05, + "loss": 0.0, + "num_input_tokens_seen": 53417976, + "step": 92075 + }, + { + "epoch": 13.714626154304439, + "grad_norm": 0.05442894250154495, + "learning_rate": 1.3593990026343284e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53421016, + "step": 92080 + }, + { + "epoch": 13.715370866845397, + "grad_norm": 0.0005618627183139324, + "learning_rate": 1.3591098603698007e-05, + "loss": 0.0013, + "num_input_tokens_seen": 53423992, + "step": 92085 + }, + { + "epoch": 13.716115579386356, + "grad_norm": 0.0015968071529641747, + "learning_rate": 1.3588207373794526e-05, + "loss": 0.2156, + "num_input_tokens_seen": 53427352, + "step": 92090 + }, + { + "epoch": 13.716860291927317, + "grad_norm": 0.0004376147117000073, + "learning_rate": 1.3585316336681675e-05, + "loss": 0.0, + "num_input_tokens_seen": 53430008, + "step": 92095 + }, + { + "epoch": 13.717605004468275, + "grad_norm": 4.578520020004362e-05, + "learning_rate": 1.3582425492408313e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53432888, + "step": 92100 + }, + { + "epoch": 13.718349717009234, + "grad_norm": 0.0028097201138734818, + "learning_rate": 1.3579534841023256e-05, + "loss": 0.0624, + "num_input_tokens_seen": 53435864, + "step": 92105 + }, + { + "epoch": 13.719094429550193, + "grad_norm": 0.0004424025537446141, + "learning_rate": 1.357664438257536e-05, + "loss": 0.0, + "num_input_tokens_seen": 53438808, + "step": 92110 + }, + { + "epoch": 13.719839142091153, + "grad_norm": 4.606444781529717e-05, + "learning_rate": 1.3573754117113446e-05, + "loss": 0.0, + "num_input_tokens_seen": 53441784, + "step": 92115 + }, + { + "epoch": 13.720583854632112, + "grad_norm": 0.0035675461404025555, + "learning_rate": 1.3570864044686349e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53444600, + "step": 92120 + }, + { + "epoch": 13.721328567173071, + "grad_norm": 0.0035138230305165052, + "learning_rate": 1.3567974165342873e-05, + "loss": 0.0, + "num_input_tokens_seen": 53447608, + "step": 92125 + }, + { + "epoch": 13.72207327971403, + "grad_norm": 0.005371475592255592, + "learning_rate": 1.3565084479131865e-05, + "loss": 0.2219, + "num_input_tokens_seen": 53450360, + "step": 92130 + }, + { + "epoch": 13.72281799225499, + "grad_norm": 1.0032713362306822e-05, + "learning_rate": 1.3562194986102134e-05, + "loss": 0.0, + "num_input_tokens_seen": 53453656, + "step": 92135 + }, + { + "epoch": 13.723562704795949, + "grad_norm": 0.0009471806697547436, + "learning_rate": 1.3559305686302482e-05, + "loss": 0.0, + "num_input_tokens_seen": 53456280, + "step": 92140 + }, + { + "epoch": 13.724307417336908, + "grad_norm": 0.000126900224131532, + "learning_rate": 1.3556416579781745e-05, + "loss": 0.0, + "num_input_tokens_seen": 53459064, + "step": 92145 + }, + { + "epoch": 13.725052129877866, + "grad_norm": 0.003195627825334668, + "learning_rate": 1.355352766658871e-05, + "loss": 0.0, + "num_input_tokens_seen": 53461880, + "step": 92150 + }, + { + "epoch": 13.725796842418827, + "grad_norm": 160.68576049804688, + "learning_rate": 1.3550638946772198e-05, + "loss": 0.25, + "num_input_tokens_seen": 53464632, + "step": 92155 + }, + { + "epoch": 13.726541554959786, + "grad_norm": 0.001510453992523253, + "learning_rate": 1.3547750420380994e-05, + "loss": 0.0, + "num_input_tokens_seen": 53467544, + "step": 92160 + }, + { + "epoch": 13.727286267500745, + "grad_norm": 0.0017067169537767768, + "learning_rate": 1.3544862087463922e-05, + "loss": 0.0734, + "num_input_tokens_seen": 53470360, + "step": 92165 + }, + { + "epoch": 13.728030980041703, + "grad_norm": 0.08491846919059753, + "learning_rate": 1.3541973948069757e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53473144, + "step": 92170 + }, + { + "epoch": 13.728775692582664, + "grad_norm": 24.500381469726562, + "learning_rate": 1.3539086002247301e-05, + "loss": 0.16, + "num_input_tokens_seen": 53475928, + "step": 92175 + }, + { + "epoch": 13.729520405123623, + "grad_norm": 4.324200563132763e-05, + "learning_rate": 1.3536198250045326e-05, + "loss": 0.0, + "num_input_tokens_seen": 53478584, + "step": 92180 + }, + { + "epoch": 13.730265117664581, + "grad_norm": 77.04821014404297, + "learning_rate": 1.353331069151264e-05, + "loss": 0.0057, + "num_input_tokens_seen": 53481624, + "step": 92185 + }, + { + "epoch": 13.73100983020554, + "grad_norm": 0.0003875505644828081, + "learning_rate": 1.3530423326698015e-05, + "loss": 0.0059, + "num_input_tokens_seen": 53484568, + "step": 92190 + }, + { + "epoch": 13.7317545427465, + "grad_norm": 297.50347900390625, + "learning_rate": 1.3527536155650224e-05, + "loss": 0.04, + "num_input_tokens_seen": 53487608, + "step": 92195 + }, + { + "epoch": 13.73249925528746, + "grad_norm": 0.0001344569172943011, + "learning_rate": 1.3524649178418058e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53490744, + "step": 92200 + }, + { + "epoch": 13.733243967828418, + "grad_norm": 0.005940190050750971, + "learning_rate": 1.3521762395050272e-05, + "loss": 0.011, + "num_input_tokens_seen": 53493624, + "step": 92205 + }, + { + "epoch": 13.733988680369377, + "grad_norm": 0.00685374578461051, + "learning_rate": 1.3518875805595654e-05, + "loss": 0.0, + "num_input_tokens_seen": 53496312, + "step": 92210 + }, + { + "epoch": 13.734733392910336, + "grad_norm": 0.00087642518337816, + "learning_rate": 1.3515989410102959e-05, + "loss": 0.0, + "num_input_tokens_seen": 53499256, + "step": 92215 + }, + { + "epoch": 13.735478105451296, + "grad_norm": 0.0007588310400024056, + "learning_rate": 1.3513103208620941e-05, + "loss": 0.0, + "num_input_tokens_seen": 53502456, + "step": 92220 + }, + { + "epoch": 13.736222817992255, + "grad_norm": 0.005464463960379362, + "learning_rate": 1.3510217201198383e-05, + "loss": 0.0, + "num_input_tokens_seen": 53505624, + "step": 92225 + }, + { + "epoch": 13.736967530533214, + "grad_norm": 9.954362030839548e-05, + "learning_rate": 1.3507331387884015e-05, + "loss": 0.0, + "num_input_tokens_seen": 53508600, + "step": 92230 + }, + { + "epoch": 13.737712243074174, + "grad_norm": 0.023334801197052002, + "learning_rate": 1.3504445768726612e-05, + "loss": 0.0, + "num_input_tokens_seen": 53511448, + "step": 92235 + }, + { + "epoch": 13.738456955615133, + "grad_norm": 0.00035012964508496225, + "learning_rate": 1.3501560343774917e-05, + "loss": 0.0073, + "num_input_tokens_seen": 53514168, + "step": 92240 + }, + { + "epoch": 13.739201668156092, + "grad_norm": 0.016934381797909737, + "learning_rate": 1.3498675113077669e-05, + "loss": 0.1724, + "num_input_tokens_seen": 53517368, + "step": 92245 + }, + { + "epoch": 13.73994638069705, + "grad_norm": 0.0029092738404870033, + "learning_rate": 1.3495790076683617e-05, + "loss": 0.0, + "num_input_tokens_seen": 53519992, + "step": 92250 + }, + { + "epoch": 13.74069109323801, + "grad_norm": 0.00187965901568532, + "learning_rate": 1.3492905234641492e-05, + "loss": 0.0, + "num_input_tokens_seen": 53522808, + "step": 92255 + }, + { + "epoch": 13.74143580577897, + "grad_norm": 0.0024160973262041807, + "learning_rate": 1.3490020587000046e-05, + "loss": 0.0, + "num_input_tokens_seen": 53525560, + "step": 92260 + }, + { + "epoch": 13.742180518319929, + "grad_norm": 23.661149978637695, + "learning_rate": 1.3487136133807992e-05, + "loss": 0.0052, + "num_input_tokens_seen": 53528600, + "step": 92265 + }, + { + "epoch": 13.742925230860887, + "grad_norm": 0.0024544273037463427, + "learning_rate": 1.3484251875114085e-05, + "loss": 0.0, + "num_input_tokens_seen": 53531608, + "step": 92270 + }, + { + "epoch": 13.743669943401846, + "grad_norm": 0.0012928424403071404, + "learning_rate": 1.3481367810967027e-05, + "loss": 0.0, + "num_input_tokens_seen": 53534552, + "step": 92275 + }, + { + "epoch": 13.744414655942807, + "grad_norm": 108.93408966064453, + "learning_rate": 1.3478483941415565e-05, + "loss": 0.0194, + "num_input_tokens_seen": 53537144, + "step": 92280 + }, + { + "epoch": 13.745159368483765, + "grad_norm": 0.0006766415317542851, + "learning_rate": 1.3475600266508395e-05, + "loss": 0.2031, + "num_input_tokens_seen": 53539896, + "step": 92285 + }, + { + "epoch": 13.745904081024724, + "grad_norm": 0.00033362070098519325, + "learning_rate": 1.3472716786294254e-05, + "loss": 0.0298, + "num_input_tokens_seen": 53542616, + "step": 92290 + }, + { + "epoch": 13.746648793565683, + "grad_norm": 0.002517597982659936, + "learning_rate": 1.3469833500821848e-05, + "loss": 0.0, + "num_input_tokens_seen": 53545784, + "step": 92295 + }, + { + "epoch": 13.747393506106643, + "grad_norm": 0.0034474674612283707, + "learning_rate": 1.3466950410139878e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53548824, + "step": 92300 + }, + { + "epoch": 13.748138218647602, + "grad_norm": 0.00026026341947726905, + "learning_rate": 1.3464067514297069e-05, + "loss": 0.0, + "num_input_tokens_seen": 53552120, + "step": 92305 + }, + { + "epoch": 13.748882931188561, + "grad_norm": 0.00024239622871391475, + "learning_rate": 1.3461184813342116e-05, + "loss": 0.0007, + "num_input_tokens_seen": 53555544, + "step": 92310 + }, + { + "epoch": 13.74962764372952, + "grad_norm": 20.719907760620117, + "learning_rate": 1.3458302307323714e-05, + "loss": 0.3797, + "num_input_tokens_seen": 53558520, + "step": 92315 + }, + { + "epoch": 13.75037235627048, + "grad_norm": 0.10924883931875229, + "learning_rate": 1.3455419996290558e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53561432, + "step": 92320 + }, + { + "epoch": 13.751117068811439, + "grad_norm": 0.0005548466579057276, + "learning_rate": 1.3452537880291355e-05, + "loss": 0.0, + "num_input_tokens_seen": 53564312, + "step": 92325 + }, + { + "epoch": 13.751861781352398, + "grad_norm": 0.0006608981639146805, + "learning_rate": 1.3449655959374791e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53567288, + "step": 92330 + }, + { + "epoch": 13.752606493893357, + "grad_norm": 0.00039713658043183386, + "learning_rate": 1.3446774233589537e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53570072, + "step": 92335 + }, + { + "epoch": 13.753351206434317, + "grad_norm": 0.0008575966348871589, + "learning_rate": 1.3443892702984302e-05, + "loss": 0.0, + "num_input_tokens_seen": 53572856, + "step": 92340 + }, + { + "epoch": 13.754095918975276, + "grad_norm": 0.004231983330100775, + "learning_rate": 1.3441011367607743e-05, + "loss": 0.0, + "num_input_tokens_seen": 53575864, + "step": 92345 + }, + { + "epoch": 13.754840631516235, + "grad_norm": 25.598379135131836, + "learning_rate": 1.343813022750856e-05, + "loss": 0.3695, + "num_input_tokens_seen": 53578936, + "step": 92350 + }, + { + "epoch": 13.755585344057193, + "grad_norm": 0.0015534480335190892, + "learning_rate": 1.3435249282735407e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53581912, + "step": 92355 + }, + { + "epoch": 13.756330056598154, + "grad_norm": 13.341434478759766, + "learning_rate": 1.343236853333697e-05, + "loss": 0.0273, + "num_input_tokens_seen": 53584632, + "step": 92360 + }, + { + "epoch": 13.757074769139113, + "grad_norm": 0.0008210930973291397, + "learning_rate": 1.3429487979361905e-05, + "loss": 0.0, + "num_input_tokens_seen": 53587672, + "step": 92365 + }, + { + "epoch": 13.757819481680071, + "grad_norm": 145.92843627929688, + "learning_rate": 1.342660762085889e-05, + "loss": 0.0561, + "num_input_tokens_seen": 53590712, + "step": 92370 + }, + { + "epoch": 13.75856419422103, + "grad_norm": 0.0007547989371232688, + "learning_rate": 1.3423727457876572e-05, + "loss": 0.0, + "num_input_tokens_seen": 53593784, + "step": 92375 + }, + { + "epoch": 13.75930890676199, + "grad_norm": 0.05523452162742615, + "learning_rate": 1.3420847490463614e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53596600, + "step": 92380 + }, + { + "epoch": 13.76005361930295, + "grad_norm": 9.501466751098633, + "learning_rate": 1.3417967718668672e-05, + "loss": 0.0497, + "num_input_tokens_seen": 53599352, + "step": 92385 + }, + { + "epoch": 13.760798331843908, + "grad_norm": 0.0023271662648767233, + "learning_rate": 1.3415088142540383e-05, + "loss": 0.0038, + "num_input_tokens_seen": 53601848, + "step": 92390 + }, + { + "epoch": 13.761543044384867, + "grad_norm": 0.0028456749860197306, + "learning_rate": 1.3412208762127415e-05, + "loss": 0.0, + "num_input_tokens_seen": 53604760, + "step": 92395 + }, + { + "epoch": 13.762287756925826, + "grad_norm": 0.00023741259064991027, + "learning_rate": 1.3409329577478391e-05, + "loss": 0.0, + "num_input_tokens_seen": 53607608, + "step": 92400 + }, + { + "epoch": 13.763032469466786, + "grad_norm": 4.079761028289795, + "learning_rate": 1.3406450588641978e-05, + "loss": 0.0075, + "num_input_tokens_seen": 53610520, + "step": 92405 + }, + { + "epoch": 13.763777182007745, + "grad_norm": 0.0009275089832954109, + "learning_rate": 1.3403571795666786e-05, + "loss": 0.007, + "num_input_tokens_seen": 53613432, + "step": 92410 + }, + { + "epoch": 13.764521894548704, + "grad_norm": 11.360148429870605, + "learning_rate": 1.3400693198601472e-05, + "loss": 0.1914, + "num_input_tokens_seen": 53616248, + "step": 92415 + }, + { + "epoch": 13.765266607089664, + "grad_norm": 0.0008201213204301894, + "learning_rate": 1.339781479749466e-05, + "loss": 0.0081, + "num_input_tokens_seen": 53619064, + "step": 92420 + }, + { + "epoch": 13.766011319630623, + "grad_norm": 0.008631307631731033, + "learning_rate": 1.3394936592394963e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53622136, + "step": 92425 + }, + { + "epoch": 13.766756032171582, + "grad_norm": 34.66910934448242, + "learning_rate": 1.3392058583351027e-05, + "loss": 0.0314, + "num_input_tokens_seen": 53624984, + "step": 92430 + }, + { + "epoch": 13.76750074471254, + "grad_norm": 0.0016104151727631688, + "learning_rate": 1.3389180770411456e-05, + "loss": 0.0003, + "num_input_tokens_seen": 53627832, + "step": 92435 + }, + { + "epoch": 13.7682454572535, + "grad_norm": 0.03211840242147446, + "learning_rate": 1.3386303153624882e-05, + "loss": 0.0, + "num_input_tokens_seen": 53630680, + "step": 92440 + }, + { + "epoch": 13.76899016979446, + "grad_norm": 0.0004050621937494725, + "learning_rate": 1.3383425733039914e-05, + "loss": 0.0, + "num_input_tokens_seen": 53633336, + "step": 92445 + }, + { + "epoch": 13.769734882335419, + "grad_norm": 0.0037765027955174446, + "learning_rate": 1.3380548508705162e-05, + "loss": 0.013, + "num_input_tokens_seen": 53636248, + "step": 92450 + }, + { + "epoch": 13.770479594876377, + "grad_norm": 0.0012591982958838344, + "learning_rate": 1.3377671480669235e-05, + "loss": 0.0, + "num_input_tokens_seen": 53639064, + "step": 92455 + }, + { + "epoch": 13.771224307417336, + "grad_norm": 0.00022400097805075347, + "learning_rate": 1.3374794648980721e-05, + "loss": 0.0285, + "num_input_tokens_seen": 53642136, + "step": 92460 + }, + { + "epoch": 13.771969019958297, + "grad_norm": 0.0003876692207995802, + "learning_rate": 1.337191801368825e-05, + "loss": 0.0763, + "num_input_tokens_seen": 53645208, + "step": 92465 + }, + { + "epoch": 13.772713732499255, + "grad_norm": 0.048633597791194916, + "learning_rate": 1.3369041574840396e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53648280, + "step": 92470 + }, + { + "epoch": 13.773458445040214, + "grad_norm": 0.0037404263857752085, + "learning_rate": 1.3366165332485772e-05, + "loss": 0.0007, + "num_input_tokens_seen": 53651352, + "step": 92475 + }, + { + "epoch": 13.774203157581173, + "grad_norm": 0.002103834878653288, + "learning_rate": 1.3363289286672952e-05, + "loss": 0.0074, + "num_input_tokens_seen": 53654136, + "step": 92480 + }, + { + "epoch": 13.774947870122134, + "grad_norm": 0.003372622886672616, + "learning_rate": 1.3360413437450542e-05, + "loss": 0.0511, + "num_input_tokens_seen": 53656824, + "step": 92485 + }, + { + "epoch": 13.775692582663092, + "grad_norm": 0.008376527577638626, + "learning_rate": 1.3357537784867105e-05, + "loss": 0.3128, + "num_input_tokens_seen": 53659800, + "step": 92490 + }, + { + "epoch": 13.776437295204051, + "grad_norm": 0.005844379775226116, + "learning_rate": 1.3354662328971246e-05, + "loss": 0.0, + "num_input_tokens_seen": 53662616, + "step": 92495 + }, + { + "epoch": 13.77718200774501, + "grad_norm": 4.7007571993162856e-05, + "learning_rate": 1.3351787069811533e-05, + "loss": 0.0012, + "num_input_tokens_seen": 53665688, + "step": 92500 + }, + { + "epoch": 13.77792672028597, + "grad_norm": 0.0007271672948263586, + "learning_rate": 1.3348912007436537e-05, + "loss": 0.0, + "num_input_tokens_seen": 53668696, + "step": 92505 + }, + { + "epoch": 13.778671432826929, + "grad_norm": 20.57430648803711, + "learning_rate": 1.3346037141894829e-05, + "loss": 0.2626, + "num_input_tokens_seen": 53671768, + "step": 92510 + }, + { + "epoch": 13.779416145367888, + "grad_norm": 0.01486595906317234, + "learning_rate": 1.3343162473234972e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53674616, + "step": 92515 + }, + { + "epoch": 13.780160857908847, + "grad_norm": 0.0005226851790212095, + "learning_rate": 1.3340288001505546e-05, + "loss": 0.0063, + "num_input_tokens_seen": 53677368, + "step": 92520 + }, + { + "epoch": 13.780905570449807, + "grad_norm": 9.163719177246094, + "learning_rate": 1.3337413726755093e-05, + "loss": 0.1288, + "num_input_tokens_seen": 53680376, + "step": 92525 + }, + { + "epoch": 13.781650282990766, + "grad_norm": 34.30894088745117, + "learning_rate": 1.3334539649032193e-05, + "loss": 0.0592, + "num_input_tokens_seen": 53683448, + "step": 92530 + }, + { + "epoch": 13.782394995531725, + "grad_norm": 0.01728241890668869, + "learning_rate": 1.3331665768385387e-05, + "loss": 0.004, + "num_input_tokens_seen": 53686104, + "step": 92535 + }, + { + "epoch": 13.783139708072683, + "grad_norm": 0.00459726108238101, + "learning_rate": 1.3328792084863223e-05, + "loss": 0.0, + "num_input_tokens_seen": 53689048, + "step": 92540 + }, + { + "epoch": 13.783884420613644, + "grad_norm": 0.26587674021720886, + "learning_rate": 1.3325918598514265e-05, + "loss": 0.0563, + "num_input_tokens_seen": 53691800, + "step": 92545 + }, + { + "epoch": 13.784629133154603, + "grad_norm": 0.0006750444881618023, + "learning_rate": 1.3323045309387033e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53694776, + "step": 92550 + }, + { + "epoch": 13.785373845695561, + "grad_norm": 0.002459669718518853, + "learning_rate": 1.3320172217530094e-05, + "loss": 0.0, + "num_input_tokens_seen": 53697592, + "step": 92555 + }, + { + "epoch": 13.78611855823652, + "grad_norm": 0.0012462746817618608, + "learning_rate": 1.3317299322991966e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53700632, + "step": 92560 + }, + { + "epoch": 13.78686327077748, + "grad_norm": 0.08485615998506546, + "learning_rate": 1.33144266258212e-05, + "loss": 0.0003, + "num_input_tokens_seen": 53703704, + "step": 92565 + }, + { + "epoch": 13.78760798331844, + "grad_norm": 0.9453611373901367, + "learning_rate": 1.3311554126066323e-05, + "loss": 0.0006, + "num_input_tokens_seen": 53706488, + "step": 92570 + }, + { + "epoch": 13.788352695859398, + "grad_norm": 0.0002946767199318856, + "learning_rate": 1.3308681823775853e-05, + "loss": 0.0, + "num_input_tokens_seen": 53709464, + "step": 92575 + }, + { + "epoch": 13.789097408400357, + "grad_norm": 0.0022116254549473524, + "learning_rate": 1.3305809718998324e-05, + "loss": 0.0091, + "num_input_tokens_seen": 53712376, + "step": 92580 + }, + { + "epoch": 13.789842120941316, + "grad_norm": 0.0026587327010929585, + "learning_rate": 1.3302937811782249e-05, + "loss": 0.0, + "num_input_tokens_seen": 53715256, + "step": 92585 + }, + { + "epoch": 13.790586833482276, + "grad_norm": 0.02825954183936119, + "learning_rate": 1.3300066102176157e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53718136, + "step": 92590 + }, + { + "epoch": 13.791331546023235, + "grad_norm": 0.0049362159334123135, + "learning_rate": 1.3297194590228545e-05, + "loss": 0.0, + "num_input_tokens_seen": 53720984, + "step": 92595 + }, + { + "epoch": 13.792076258564194, + "grad_norm": 0.006742444355040789, + "learning_rate": 1.3294323275987953e-05, + "loss": 0.0004, + "num_input_tokens_seen": 53724152, + "step": 92600 + }, + { + "epoch": 13.792820971105153, + "grad_norm": 0.012755978852510452, + "learning_rate": 1.3291452159502853e-05, + "loss": 0.0, + "num_input_tokens_seen": 53726904, + "step": 92605 + }, + { + "epoch": 13.793565683646113, + "grad_norm": 0.03853200748562813, + "learning_rate": 1.3288581240821785e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53729944, + "step": 92610 + }, + { + "epoch": 13.794310396187072, + "grad_norm": 0.004537619184702635, + "learning_rate": 1.3285710519993233e-05, + "loss": 0.0291, + "num_input_tokens_seen": 53732696, + "step": 92615 + }, + { + "epoch": 13.79505510872803, + "grad_norm": 0.0062688919715583324, + "learning_rate": 1.3282839997065689e-05, + "loss": 0.0004, + "num_input_tokens_seen": 53735448, + "step": 92620 + }, + { + "epoch": 13.79579982126899, + "grad_norm": 0.002680728677660227, + "learning_rate": 1.327996967208766e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53738168, + "step": 92625 + }, + { + "epoch": 13.79654453380995, + "grad_norm": 0.0028742512222379446, + "learning_rate": 1.3277099545107622e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53740984, + "step": 92630 + }, + { + "epoch": 13.797289246350909, + "grad_norm": 0.0006382535793818533, + "learning_rate": 1.3274229616174084e-05, + "loss": 0.0068, + "num_input_tokens_seen": 53744216, + "step": 92635 + }, + { + "epoch": 13.798033958891867, + "grad_norm": 0.0015657177427783608, + "learning_rate": 1.3271359885335515e-05, + "loss": 0.0, + "num_input_tokens_seen": 53747032, + "step": 92640 + }, + { + "epoch": 13.798778671432826, + "grad_norm": 0.010112601332366467, + "learning_rate": 1.3268490352640405e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53749848, + "step": 92645 + }, + { + "epoch": 13.799523383973787, + "grad_norm": 0.0005102517898194492, + "learning_rate": 1.3265621018137216e-05, + "loss": 0.0, + "num_input_tokens_seen": 53752952, + "step": 92650 + }, + { + "epoch": 13.800268096514746, + "grad_norm": 0.001566148130223155, + "learning_rate": 1.3262751881874443e-05, + "loss": 0.0104, + "num_input_tokens_seen": 53756344, + "step": 92655 + }, + { + "epoch": 13.801012809055704, + "grad_norm": 0.0060460977256298065, + "learning_rate": 1.3259882943900547e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53759064, + "step": 92660 + }, + { + "epoch": 13.801757521596663, + "grad_norm": 0.00028150095022283494, + "learning_rate": 1.325701420426399e-05, + "loss": 0.0, + "num_input_tokens_seen": 53761816, + "step": 92665 + }, + { + "epoch": 13.802502234137624, + "grad_norm": 6.952571857254952e-05, + "learning_rate": 1.3254145663013251e-05, + "loss": 0.1814, + "num_input_tokens_seen": 53764824, + "step": 92670 + }, + { + "epoch": 13.803246946678582, + "grad_norm": 0.0034000715240836143, + "learning_rate": 1.3251277320196772e-05, + "loss": 0.0, + "num_input_tokens_seen": 53768024, + "step": 92675 + }, + { + "epoch": 13.803991659219541, + "grad_norm": 0.00045899528777226806, + "learning_rate": 1.3248409175863033e-05, + "loss": 0.0, + "num_input_tokens_seen": 53770808, + "step": 92680 + }, + { + "epoch": 13.8047363717605, + "grad_norm": 0.3984106779098511, + "learning_rate": 1.3245541230060465e-05, + "loss": 0.0103, + "num_input_tokens_seen": 53773624, + "step": 92685 + }, + { + "epoch": 13.80548108430146, + "grad_norm": 0.0009452840895392001, + "learning_rate": 1.3242673482837544e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53776856, + "step": 92690 + }, + { + "epoch": 13.80622579684242, + "grad_norm": 0.004822365008294582, + "learning_rate": 1.3239805934242704e-05, + "loss": 0.0003, + "num_input_tokens_seen": 53779640, + "step": 92695 + }, + { + "epoch": 13.806970509383378, + "grad_norm": 0.0006052397657185793, + "learning_rate": 1.3236938584324382e-05, + "loss": 0.0009, + "num_input_tokens_seen": 53782520, + "step": 92700 + }, + { + "epoch": 13.807715221924337, + "grad_norm": 0.00011741592606995255, + "learning_rate": 1.3234071433131034e-05, + "loss": 0.0, + "num_input_tokens_seen": 53785528, + "step": 92705 + }, + { + "epoch": 13.808459934465297, + "grad_norm": 0.0002871238102670759, + "learning_rate": 1.323120448071109e-05, + "loss": 0.0, + "num_input_tokens_seen": 53788120, + "step": 92710 + }, + { + "epoch": 13.809204647006256, + "grad_norm": 0.004221697803586721, + "learning_rate": 1.3228337727112988e-05, + "loss": 0.0, + "num_input_tokens_seen": 53791064, + "step": 92715 + }, + { + "epoch": 13.809949359547215, + "grad_norm": 0.0022199053782969713, + "learning_rate": 1.3225471172385145e-05, + "loss": 0.0, + "num_input_tokens_seen": 53794136, + "step": 92720 + }, + { + "epoch": 13.810694072088173, + "grad_norm": 0.00022928143152967095, + "learning_rate": 1.3222604816576011e-05, + "loss": 0.0153, + "num_input_tokens_seen": 53797144, + "step": 92725 + }, + { + "epoch": 13.811438784629132, + "grad_norm": 7.84181829658337e-05, + "learning_rate": 1.3219738659733988e-05, + "loss": 0.0004, + "num_input_tokens_seen": 53800120, + "step": 92730 + }, + { + "epoch": 13.812183497170093, + "grad_norm": 0.0006451298831962049, + "learning_rate": 1.3216872701907515e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53803320, + "step": 92735 + }, + { + "epoch": 13.812928209711052, + "grad_norm": 0.00170116254594177, + "learning_rate": 1.3214006943145002e-05, + "loss": 0.0, + "num_input_tokens_seen": 53806168, + "step": 92740 + }, + { + "epoch": 13.81367292225201, + "grad_norm": 0.011902187019586563, + "learning_rate": 1.3211141383494856e-05, + "loss": 0.0051, + "num_input_tokens_seen": 53808824, + "step": 92745 + }, + { + "epoch": 13.81441763479297, + "grad_norm": 0.0027529948856681585, + "learning_rate": 1.32082760230055e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53811608, + "step": 92750 + }, + { + "epoch": 13.81516234733393, + "grad_norm": 0.0003628893755376339, + "learning_rate": 1.3205410861725331e-05, + "loss": 0.0, + "num_input_tokens_seen": 53814552, + "step": 92755 + }, + { + "epoch": 13.815907059874888, + "grad_norm": 13.569428443908691, + "learning_rate": 1.3202545899702768e-05, + "loss": 0.0798, + "num_input_tokens_seen": 53817752, + "step": 92760 + }, + { + "epoch": 13.816651772415847, + "grad_norm": 0.1034923568367958, + "learning_rate": 1.3199681136986186e-05, + "loss": 0.0015, + "num_input_tokens_seen": 53820408, + "step": 92765 + }, + { + "epoch": 13.817396484956806, + "grad_norm": 0.001168523682281375, + "learning_rate": 1.3196816573624013e-05, + "loss": 0.0159, + "num_input_tokens_seen": 53823416, + "step": 92770 + }, + { + "epoch": 13.818141197497766, + "grad_norm": 0.017474880442023277, + "learning_rate": 1.3193952209664625e-05, + "loss": 0.0, + "num_input_tokens_seen": 53826680, + "step": 92775 + }, + { + "epoch": 13.818885910038725, + "grad_norm": 0.002397777047008276, + "learning_rate": 1.319108804515642e-05, + "loss": 0.015, + "num_input_tokens_seen": 53829496, + "step": 92780 + }, + { + "epoch": 13.819630622579684, + "grad_norm": 0.003616592613980174, + "learning_rate": 1.3188224080147776e-05, + "loss": 0.0, + "num_input_tokens_seen": 53832696, + "step": 92785 + }, + { + "epoch": 13.820375335120643, + "grad_norm": 0.0025981897488236427, + "learning_rate": 1.318536031468707e-05, + "loss": 0.0, + "num_input_tokens_seen": 53835512, + "step": 92790 + }, + { + "epoch": 13.821120047661603, + "grad_norm": 0.004189474042505026, + "learning_rate": 1.3182496748822706e-05, + "loss": 0.0, + "num_input_tokens_seen": 53838328, + "step": 92795 + }, + { + "epoch": 13.821864760202562, + "grad_norm": 0.003876415314152837, + "learning_rate": 1.3179633382603041e-05, + "loss": 0.0, + "num_input_tokens_seen": 53840792, + "step": 92800 + }, + { + "epoch": 13.82260947274352, + "grad_norm": 0.00040909129893407226, + "learning_rate": 1.3176770216076462e-05, + "loss": 0.0, + "num_input_tokens_seen": 53844600, + "step": 92805 + }, + { + "epoch": 13.82335418528448, + "grad_norm": 52.44028854370117, + "learning_rate": 1.3173907249291326e-05, + "loss": 0.1981, + "num_input_tokens_seen": 53847480, + "step": 92810 + }, + { + "epoch": 13.82409889782544, + "grad_norm": 0.003672498045489192, + "learning_rate": 1.3171044482296017e-05, + "loss": 0.0, + "num_input_tokens_seen": 53850328, + "step": 92815 + }, + { + "epoch": 13.824843610366399, + "grad_norm": 30.899093627929688, + "learning_rate": 1.3168181915138889e-05, + "loss": 0.0111, + "num_input_tokens_seen": 53853144, + "step": 92820 + }, + { + "epoch": 13.825588322907358, + "grad_norm": 0.00286046857945621, + "learning_rate": 1.316531954786829e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53856376, + "step": 92825 + }, + { + "epoch": 13.826333035448316, + "grad_norm": 0.8968275785446167, + "learning_rate": 1.31624573805326e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53859288, + "step": 92830 + }, + { + "epoch": 13.827077747989277, + "grad_norm": 0.0037762767169624567, + "learning_rate": 1.3159595413180164e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53862104, + "step": 92835 + }, + { + "epoch": 13.827822460530236, + "grad_norm": 0.0003855317481793463, + "learning_rate": 1.3156733645859328e-05, + "loss": 0.1949, + "num_input_tokens_seen": 53864728, + "step": 92840 + }, + { + "epoch": 13.828567173071194, + "grad_norm": 0.00043651595478877425, + "learning_rate": 1.3153872078618428e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53867736, + "step": 92845 + }, + { + "epoch": 13.829311885612153, + "grad_norm": 0.0012904790928587317, + "learning_rate": 1.3151010711505835e-05, + "loss": 0.0, + "num_input_tokens_seen": 53870680, + "step": 92850 + }, + { + "epoch": 13.830056598153114, + "grad_norm": 0.6931465864181519, + "learning_rate": 1.3148149544569868e-05, + "loss": 0.0011, + "num_input_tokens_seen": 53873816, + "step": 92855 + }, + { + "epoch": 13.830801310694072, + "grad_norm": 0.0003981387126259506, + "learning_rate": 1.3145288577858861e-05, + "loss": 0.0, + "num_input_tokens_seen": 53876536, + "step": 92860 + }, + { + "epoch": 13.831546023235031, + "grad_norm": 8.652827818877995e-05, + "learning_rate": 1.3142427811421165e-05, + "loss": 0.1719, + "num_input_tokens_seen": 53879352, + "step": 92865 + }, + { + "epoch": 13.83229073577599, + "grad_norm": 0.0032843726221472025, + "learning_rate": 1.313956724530509e-05, + "loss": 0.0, + "num_input_tokens_seen": 53882360, + "step": 92870 + }, + { + "epoch": 13.83303544831695, + "grad_norm": 0.0002781472576316446, + "learning_rate": 1.3136706879558979e-05, + "loss": 0.1253, + "num_input_tokens_seen": 53885336, + "step": 92875 + }, + { + "epoch": 13.83378016085791, + "grad_norm": 1.2466150522232056, + "learning_rate": 1.3133846714231141e-05, + "loss": 0.0006, + "num_input_tokens_seen": 53888024, + "step": 92880 + }, + { + "epoch": 13.834524873398868, + "grad_norm": 39.38169860839844, + "learning_rate": 1.3130986749369911e-05, + "loss": 0.1907, + "num_input_tokens_seen": 53891160, + "step": 92885 + }, + { + "epoch": 13.835269585939827, + "grad_norm": 0.0008410380687564611, + "learning_rate": 1.3128126985023586e-05, + "loss": 0.0, + "num_input_tokens_seen": 53893944, + "step": 92890 + }, + { + "epoch": 13.836014298480787, + "grad_norm": 0.003218522295355797, + "learning_rate": 1.3125267421240504e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53896728, + "step": 92895 + }, + { + "epoch": 13.836759011021746, + "grad_norm": 0.00015806083683855832, + "learning_rate": 1.3122408058068955e-05, + "loss": 0.0, + "num_input_tokens_seen": 53899416, + "step": 92900 + }, + { + "epoch": 13.837503723562705, + "grad_norm": 0.002597338519990444, + "learning_rate": 1.3119548895557252e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53902424, + "step": 92905 + }, + { + "epoch": 13.838248436103664, + "grad_norm": 0.0001334721891907975, + "learning_rate": 1.3116689933753696e-05, + "loss": 0.1017, + "num_input_tokens_seen": 53905432, + "step": 92910 + }, + { + "epoch": 13.838993148644622, + "grad_norm": 0.007949508726596832, + "learning_rate": 1.3113831172706575e-05, + "loss": 0.0, + "num_input_tokens_seen": 53908120, + "step": 92915 + }, + { + "epoch": 13.839737861185583, + "grad_norm": 0.001575880916789174, + "learning_rate": 1.3110972612464207e-05, + "loss": 0.0, + "num_input_tokens_seen": 53910840, + "step": 92920 + }, + { + "epoch": 13.840482573726542, + "grad_norm": 0.012539470568299294, + "learning_rate": 1.310811425307486e-05, + "loss": 0.0, + "num_input_tokens_seen": 53913336, + "step": 92925 + }, + { + "epoch": 13.8412272862675, + "grad_norm": 0.00809707771986723, + "learning_rate": 1.3105256094586849e-05, + "loss": 0.0, + "num_input_tokens_seen": 53915992, + "step": 92930 + }, + { + "epoch": 13.84197199880846, + "grad_norm": 0.006423078011721373, + "learning_rate": 1.310239813704845e-05, + "loss": 0.0, + "num_input_tokens_seen": 53918776, + "step": 92935 + }, + { + "epoch": 13.84271671134942, + "grad_norm": 0.0010646542068570852, + "learning_rate": 1.3099540380507927e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53921560, + "step": 92940 + }, + { + "epoch": 13.843461423890378, + "grad_norm": 0.005227593705058098, + "learning_rate": 1.3096682825013584e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53924312, + "step": 92945 + }, + { + "epoch": 13.844206136431337, + "grad_norm": 0.017768774181604385, + "learning_rate": 1.309382547061368e-05, + "loss": 0.0763, + "num_input_tokens_seen": 53927064, + "step": 92950 + }, + { + "epoch": 13.844950848972296, + "grad_norm": 0.0027687125839293003, + "learning_rate": 1.3090968317356502e-05, + "loss": 0.0, + "num_input_tokens_seen": 53930168, + "step": 92955 + }, + { + "epoch": 13.845695561513256, + "grad_norm": 0.0018043183954432607, + "learning_rate": 1.3088111365290302e-05, + "loss": 0.0, + "num_input_tokens_seen": 53933048, + "step": 92960 + }, + { + "epoch": 13.846440274054215, + "grad_norm": 0.008689498528838158, + "learning_rate": 1.3085254614463362e-05, + "loss": 0.0035, + "num_input_tokens_seen": 53936120, + "step": 92965 + }, + { + "epoch": 13.847184986595174, + "grad_norm": 5.997172832489014, + "learning_rate": 1.308239806492394e-05, + "loss": 0.0044, + "num_input_tokens_seen": 53938712, + "step": 92970 + }, + { + "epoch": 13.847929699136133, + "grad_norm": 0.001381704700179398, + "learning_rate": 1.3079541716720284e-05, + "loss": 0.0, + "num_input_tokens_seen": 53941304, + "step": 92975 + }, + { + "epoch": 13.848674411677093, + "grad_norm": 0.030311284586787224, + "learning_rate": 1.307668556990066e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53944472, + "step": 92980 + }, + { + "epoch": 13.849419124218052, + "grad_norm": 0.03658384457230568, + "learning_rate": 1.30738296245133e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53947544, + "step": 92985 + }, + { + "epoch": 13.85016383675901, + "grad_norm": 0.04669932648539543, + "learning_rate": 1.3070973880606482e-05, + "loss": 0.3251, + "num_input_tokens_seen": 53950840, + "step": 92990 + }, + { + "epoch": 13.85090854929997, + "grad_norm": 2.9803486540913582e-05, + "learning_rate": 1.3068118338228425e-05, + "loss": 0.0, + "num_input_tokens_seen": 53953912, + "step": 92995 + }, + { + "epoch": 13.85165326184093, + "grad_norm": 0.043957848101854324, + "learning_rate": 1.306526299742739e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53956856, + "step": 93000 + }, + { + "epoch": 13.852397974381889, + "grad_norm": 0.001113787409849465, + "learning_rate": 1.3062407858251598e-05, + "loss": 0.0, + "num_input_tokens_seen": 53959416, + "step": 93005 + }, + { + "epoch": 13.853142686922848, + "grad_norm": 0.008142250590026379, + "learning_rate": 1.3059552920749301e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53962744, + "step": 93010 + }, + { + "epoch": 13.853887399463806, + "grad_norm": 0.0023231476079672575, + "learning_rate": 1.3056698184968714e-05, + "loss": 0.1441, + "num_input_tokens_seen": 53965784, + "step": 93015 + }, + { + "epoch": 13.854632112004767, + "grad_norm": 0.04028192162513733, + "learning_rate": 1.305384365095808e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53968600, + "step": 93020 + }, + { + "epoch": 13.855376824545726, + "grad_norm": 0.12245608121156693, + "learning_rate": 1.305098931876562e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53971192, + "step": 93025 + }, + { + "epoch": 13.856121537086684, + "grad_norm": 7.703884330112487e-05, + "learning_rate": 1.3048135188439537e-05, + "loss": 0.0207, + "num_input_tokens_seen": 53974040, + "step": 93030 + }, + { + "epoch": 13.856866249627643, + "grad_norm": 6.532515048980713, + "learning_rate": 1.3045281260028075e-05, + "loss": 0.062, + "num_input_tokens_seen": 53977080, + "step": 93035 + }, + { + "epoch": 13.857610962168604, + "grad_norm": 0.038031209260225296, + "learning_rate": 1.3042427533579435e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53979992, + "step": 93040 + }, + { + "epoch": 13.858355674709562, + "grad_norm": 0.452561616897583, + "learning_rate": 1.303957400914183e-05, + "loss": 0.0735, + "num_input_tokens_seen": 53983000, + "step": 93045 + }, + { + "epoch": 13.859100387250521, + "grad_norm": 0.0026462541427463293, + "learning_rate": 1.3036720686763454e-05, + "loss": 0.0002, + "num_input_tokens_seen": 53985944, + "step": 93050 + }, + { + "epoch": 13.85984509979148, + "grad_norm": 89.29425811767578, + "learning_rate": 1.3033867566492534e-05, + "loss": 0.6034, + "num_input_tokens_seen": 53989048, + "step": 93055 + }, + { + "epoch": 13.86058981233244, + "grad_norm": 12.217236518859863, + "learning_rate": 1.303101464837726e-05, + "loss": 0.3642, + "num_input_tokens_seen": 53991832, + "step": 93060 + }, + { + "epoch": 13.8613345248734, + "grad_norm": 0.016072001308202744, + "learning_rate": 1.3028161932465815e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53994584, + "step": 93065 + }, + { + "epoch": 13.862079237414358, + "grad_norm": 0.06820740550756454, + "learning_rate": 1.3025309418806422e-05, + "loss": 0.0001, + "num_input_tokens_seen": 53997848, + "step": 93070 + }, + { + "epoch": 13.862823949955317, + "grad_norm": 0.0005291040288284421, + "learning_rate": 1.3022457107447244e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54000856, + "step": 93075 + }, + { + "epoch": 13.863568662496277, + "grad_norm": 0.0015257659833878279, + "learning_rate": 1.3019604998436491e-05, + "loss": 0.0003, + "num_input_tokens_seen": 54004024, + "step": 93080 + }, + { + "epoch": 13.864313375037236, + "grad_norm": 0.01802494004368782, + "learning_rate": 1.301675309182232e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54007064, + "step": 93085 + }, + { + "epoch": 13.865058087578195, + "grad_norm": 2.551158905029297, + "learning_rate": 1.3013901387652941e-05, + "loss": 0.1367, + "num_input_tokens_seen": 54010328, + "step": 93090 + }, + { + "epoch": 13.865802800119154, + "grad_norm": 0.008177515119314194, + "learning_rate": 1.3011049885976505e-05, + "loss": 0.2787, + "num_input_tokens_seen": 54013272, + "step": 93095 + }, + { + "epoch": 13.866547512660112, + "grad_norm": 0.0005674843559972942, + "learning_rate": 1.3008198586841209e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54016088, + "step": 93100 + }, + { + "epoch": 13.867292225201073, + "grad_norm": 0.00318600214086473, + "learning_rate": 1.3005347490295205e-05, + "loss": 0.0003, + "num_input_tokens_seen": 54019032, + "step": 93105 + }, + { + "epoch": 13.868036937742032, + "grad_norm": 0.18073327839374542, + "learning_rate": 1.3002496596386666e-05, + "loss": 0.0064, + "num_input_tokens_seen": 54022072, + "step": 93110 + }, + { + "epoch": 13.86878165028299, + "grad_norm": 0.015653852373361588, + "learning_rate": 1.2999645905163754e-05, + "loss": 0.0353, + "num_input_tokens_seen": 54025144, + "step": 93115 + }, + { + "epoch": 13.86952636282395, + "grad_norm": 0.04012877121567726, + "learning_rate": 1.2996795416674618e-05, + "loss": 0.0909, + "num_input_tokens_seen": 54028056, + "step": 93120 + }, + { + "epoch": 13.87027107536491, + "grad_norm": 0.0501285195350647, + "learning_rate": 1.2993945130967434e-05, + "loss": 0.1131, + "num_input_tokens_seen": 54031000, + "step": 93125 + }, + { + "epoch": 13.871015787905868, + "grad_norm": 0.001999753527343273, + "learning_rate": 1.2991095048090333e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54033784, + "step": 93130 + }, + { + "epoch": 13.871760500446827, + "grad_norm": 0.08205258846282959, + "learning_rate": 1.2988245168091485e-05, + "loss": 0.0085, + "num_input_tokens_seen": 54036664, + "step": 93135 + }, + { + "epoch": 13.872505212987786, + "grad_norm": 0.008011850528419018, + "learning_rate": 1.2985395491019029e-05, + "loss": 0.3189, + "num_input_tokens_seen": 54039512, + "step": 93140 + }, + { + "epoch": 13.873249925528746, + "grad_norm": 0.00962061807513237, + "learning_rate": 1.2982546016921093e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54042584, + "step": 93145 + }, + { + "epoch": 13.873994638069705, + "grad_norm": 0.0075771138072013855, + "learning_rate": 1.297969674584584e-05, + "loss": 0.0269, + "num_input_tokens_seen": 54045240, + "step": 93150 + }, + { + "epoch": 13.874739350610664, + "grad_norm": 0.02489996887743473, + "learning_rate": 1.2976847677841383e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54048344, + "step": 93155 + }, + { + "epoch": 13.875484063151623, + "grad_norm": 0.03766042739152908, + "learning_rate": 1.2973998812955876e-05, + "loss": 0.0007, + "num_input_tokens_seen": 54051128, + "step": 93160 + }, + { + "epoch": 13.876228775692583, + "grad_norm": 0.0021701185032725334, + "learning_rate": 1.2971150151237435e-05, + "loss": 0.0431, + "num_input_tokens_seen": 54053848, + "step": 93165 + }, + { + "epoch": 13.876973488233542, + "grad_norm": 0.12498590350151062, + "learning_rate": 1.2968301692734187e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54056856, + "step": 93170 + }, + { + "epoch": 13.8777182007745, + "grad_norm": 0.0024878927506506443, + "learning_rate": 1.2965453437494243e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54059832, + "step": 93175 + }, + { + "epoch": 13.87846291331546, + "grad_norm": 0.0007919391500763595, + "learning_rate": 1.296260538556574e-05, + "loss": 0.0034, + "num_input_tokens_seen": 54062872, + "step": 93180 + }, + { + "epoch": 13.87920762585642, + "grad_norm": 8.071474075317383, + "learning_rate": 1.295975753699679e-05, + "loss": 0.0705, + "num_input_tokens_seen": 54065784, + "step": 93185 + }, + { + "epoch": 13.879952338397379, + "grad_norm": 0.019427064806222916, + "learning_rate": 1.2956909891835484e-05, + "loss": 0.0018, + "num_input_tokens_seen": 54068568, + "step": 93190 + }, + { + "epoch": 13.880697050938338, + "grad_norm": 0.0016530804568901658, + "learning_rate": 1.2954062450129959e-05, + "loss": 0.0303, + "num_input_tokens_seen": 54071544, + "step": 93195 + }, + { + "epoch": 13.881441763479296, + "grad_norm": 0.011835066601634026, + "learning_rate": 1.2951215211928292e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54074584, + "step": 93200 + }, + { + "epoch": 13.882186476020257, + "grad_norm": 0.0026255217380821705, + "learning_rate": 1.2948368177278614e-05, + "loss": 0.0005, + "num_input_tokens_seen": 54077336, + "step": 93205 + }, + { + "epoch": 13.882931188561216, + "grad_norm": 0.0008540676790289581, + "learning_rate": 1.2945521346228989e-05, + "loss": 0.026, + "num_input_tokens_seen": 54080088, + "step": 93210 + }, + { + "epoch": 13.883675901102174, + "grad_norm": 0.0013041128404438496, + "learning_rate": 1.2942674718827546e-05, + "loss": 0.026, + "num_input_tokens_seen": 54083224, + "step": 93215 + }, + { + "epoch": 13.884420613643133, + "grad_norm": 10.067543983459473, + "learning_rate": 1.2939828295122358e-05, + "loss": 0.0332, + "num_input_tokens_seen": 54086360, + "step": 93220 + }, + { + "epoch": 13.885165326184094, + "grad_norm": 0.02584981359541416, + "learning_rate": 1.2936982075161502e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54089400, + "step": 93225 + }, + { + "epoch": 13.885910038725052, + "grad_norm": 0.008184853941202164, + "learning_rate": 1.2934136058993082e-05, + "loss": 0.1233, + "num_input_tokens_seen": 54092024, + "step": 93230 + }, + { + "epoch": 13.886654751266011, + "grad_norm": 0.0008206461789086461, + "learning_rate": 1.2931290246665173e-05, + "loss": 0.0, + "num_input_tokens_seen": 54094840, + "step": 93235 + }, + { + "epoch": 13.88739946380697, + "grad_norm": 0.004177530761808157, + "learning_rate": 1.2928444638225848e-05, + "loss": 0.0007, + "num_input_tokens_seen": 54097592, + "step": 93240 + }, + { + "epoch": 13.88814417634793, + "grad_norm": 0.007200491614639759, + "learning_rate": 1.2925599233723174e-05, + "loss": 0.0, + "num_input_tokens_seen": 54100312, + "step": 93245 + }, + { + "epoch": 13.88888888888889, + "grad_norm": 0.001473042881116271, + "learning_rate": 1.2922754033205237e-05, + "loss": 0.0004, + "num_input_tokens_seen": 54103160, + "step": 93250 + }, + { + "epoch": 13.889633601429848, + "grad_norm": 0.00828537531197071, + "learning_rate": 1.2919909036720085e-05, + "loss": 0.0, + "num_input_tokens_seen": 54105976, + "step": 93255 + }, + { + "epoch": 13.890378313970807, + "grad_norm": 0.0022769682109355927, + "learning_rate": 1.2917064244315802e-05, + "loss": 0.0003, + "num_input_tokens_seen": 54108664, + "step": 93260 + }, + { + "epoch": 13.891123026511767, + "grad_norm": 0.4279696047306061, + "learning_rate": 1.2914219656040437e-05, + "loss": 0.0003, + "num_input_tokens_seen": 54111448, + "step": 93265 + }, + { + "epoch": 13.891867739052726, + "grad_norm": 0.01176377385854721, + "learning_rate": 1.2911375271942042e-05, + "loss": 0.0, + "num_input_tokens_seen": 54114392, + "step": 93270 + }, + { + "epoch": 13.892612451593685, + "grad_norm": 0.11549631506204605, + "learning_rate": 1.2908531092068682e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54117432, + "step": 93275 + }, + { + "epoch": 13.893357164134644, + "grad_norm": 0.005301472265273333, + "learning_rate": 1.290568711646839e-05, + "loss": 0.0, + "num_input_tokens_seen": 54120568, + "step": 93280 + }, + { + "epoch": 13.894101876675602, + "grad_norm": 0.010547076351940632, + "learning_rate": 1.2902843345189237e-05, + "loss": 0.0011, + "num_input_tokens_seen": 54123288, + "step": 93285 + }, + { + "epoch": 13.894846589216563, + "grad_norm": 0.009016407653689384, + "learning_rate": 1.2899999778279235e-05, + "loss": 0.0664, + "num_input_tokens_seen": 54126616, + "step": 93290 + }, + { + "epoch": 13.895591301757522, + "grad_norm": 0.026977676898241043, + "learning_rate": 1.289715641578645e-05, + "loss": 0.0, + "num_input_tokens_seen": 54129592, + "step": 93295 + }, + { + "epoch": 13.89633601429848, + "grad_norm": 0.005103779956698418, + "learning_rate": 1.2894313257758906e-05, + "loss": 0.2688, + "num_input_tokens_seen": 54132280, + "step": 93300 + }, + { + "epoch": 13.89708072683944, + "grad_norm": 262.8910827636719, + "learning_rate": 1.2891470304244638e-05, + "loss": 0.0227, + "num_input_tokens_seen": 54135192, + "step": 93305 + }, + { + "epoch": 13.8978254393804, + "grad_norm": 6.891616067150608e-05, + "learning_rate": 1.288862755529167e-05, + "loss": 0.0004, + "num_input_tokens_seen": 54138040, + "step": 93310 + }, + { + "epoch": 13.898570151921358, + "grad_norm": 0.004871691111475229, + "learning_rate": 1.2885785010948023e-05, + "loss": 0.0, + "num_input_tokens_seen": 54140984, + "step": 93315 + }, + { + "epoch": 13.899314864462317, + "grad_norm": 0.4193144738674164, + "learning_rate": 1.2882942671261733e-05, + "loss": 0.0026, + "num_input_tokens_seen": 54143768, + "step": 93320 + }, + { + "epoch": 13.900059577003276, + "grad_norm": 0.019322004169225693, + "learning_rate": 1.2880100536280803e-05, + "loss": 0.1413, + "num_input_tokens_seen": 54147032, + "step": 93325 + }, + { + "epoch": 13.900804289544237, + "grad_norm": 0.21673542261123657, + "learning_rate": 1.2877258606053266e-05, + "loss": 0.2677, + "num_input_tokens_seen": 54150264, + "step": 93330 + }, + { + "epoch": 13.901549002085195, + "grad_norm": 0.00025335297686979175, + "learning_rate": 1.2874416880627116e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54153208, + "step": 93335 + }, + { + "epoch": 13.902293714626154, + "grad_norm": 0.0014236117713153362, + "learning_rate": 1.2871575360050376e-05, + "loss": 0.0, + "num_input_tokens_seen": 54156280, + "step": 93340 + }, + { + "epoch": 13.903038427167113, + "grad_norm": 51.488525390625, + "learning_rate": 1.2868734044371044e-05, + "loss": 0.044, + "num_input_tokens_seen": 54159224, + "step": 93345 + }, + { + "epoch": 13.903783139708073, + "grad_norm": 0.003765861736610532, + "learning_rate": 1.2865892933637114e-05, + "loss": 0.0, + "num_input_tokens_seen": 54162552, + "step": 93350 + }, + { + "epoch": 13.904527852249032, + "grad_norm": 0.0001881623174995184, + "learning_rate": 1.2863052027896597e-05, + "loss": 0.0, + "num_input_tokens_seen": 54165336, + "step": 93355 + }, + { + "epoch": 13.90527256478999, + "grad_norm": 1.2888829708099365, + "learning_rate": 1.2860211327197468e-05, + "loss": 0.0007, + "num_input_tokens_seen": 54168344, + "step": 93360 + }, + { + "epoch": 13.90601727733095, + "grad_norm": 0.0005306693492457271, + "learning_rate": 1.2857370831587745e-05, + "loss": 0.0128, + "num_input_tokens_seen": 54171224, + "step": 93365 + }, + { + "epoch": 13.90676198987191, + "grad_norm": 0.0029478268697857857, + "learning_rate": 1.28545305411154e-05, + "loss": 0.0207, + "num_input_tokens_seen": 54174072, + "step": 93370 + }, + { + "epoch": 13.907506702412869, + "grad_norm": 0.00624303100630641, + "learning_rate": 1.2851690455828414e-05, + "loss": 0.0133, + "num_input_tokens_seen": 54177144, + "step": 93375 + }, + { + "epoch": 13.908251414953828, + "grad_norm": 0.0900021567940712, + "learning_rate": 1.2848850575774774e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54179960, + "step": 93380 + }, + { + "epoch": 13.908996127494786, + "grad_norm": 0.0011825738474726677, + "learning_rate": 1.2846010901002442e-05, + "loss": 0.0, + "num_input_tokens_seen": 54182872, + "step": 93385 + }, + { + "epoch": 13.909740840035747, + "grad_norm": 0.0037990855053067207, + "learning_rate": 1.2843171431559414e-05, + "loss": 0.0, + "num_input_tokens_seen": 54185624, + "step": 93390 + }, + { + "epoch": 13.910485552576706, + "grad_norm": 0.004113388247787952, + "learning_rate": 1.284033216749364e-05, + "loss": 0.1884, + "num_input_tokens_seen": 54188344, + "step": 93395 + }, + { + "epoch": 13.911230265117664, + "grad_norm": 0.04519777372479439, + "learning_rate": 1.2837493108853105e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54191256, + "step": 93400 + }, + { + "epoch": 13.911974977658623, + "grad_norm": 0.02682509645819664, + "learning_rate": 1.2834654255685752e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54194552, + "step": 93405 + }, + { + "epoch": 13.912719690199584, + "grad_norm": 0.01298537477850914, + "learning_rate": 1.283181560803956e-05, + "loss": 0.1744, + "num_input_tokens_seen": 54197144, + "step": 93410 + }, + { + "epoch": 13.913464402740543, + "grad_norm": 0.00023937362129800022, + "learning_rate": 1.282897716596247e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54199928, + "step": 93415 + }, + { + "epoch": 13.914209115281501, + "grad_norm": 0.01119101233780384, + "learning_rate": 1.2826138929502446e-05, + "loss": 0.0, + "num_input_tokens_seen": 54202872, + "step": 93420 + }, + { + "epoch": 13.91495382782246, + "grad_norm": 0.0001507193228462711, + "learning_rate": 1.2823300898707432e-05, + "loss": 0.1221, + "num_input_tokens_seen": 54205464, + "step": 93425 + }, + { + "epoch": 13.915698540363419, + "grad_norm": 0.0032355228904634714, + "learning_rate": 1.2820463073625367e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54208152, + "step": 93430 + }, + { + "epoch": 13.91644325290438, + "grad_norm": 0.01548322569578886, + "learning_rate": 1.2817625454304204e-05, + "loss": 0.0, + "num_input_tokens_seen": 54211000, + "step": 93435 + }, + { + "epoch": 13.917187965445338, + "grad_norm": 0.0008794030291028321, + "learning_rate": 1.281478804079188e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54213528, + "step": 93440 + }, + { + "epoch": 13.917932677986297, + "grad_norm": 384.10107421875, + "learning_rate": 1.2811950833136332e-05, + "loss": 0.0946, + "num_input_tokens_seen": 54216120, + "step": 93445 + }, + { + "epoch": 13.918677390527257, + "grad_norm": 0.034145671874284744, + "learning_rate": 1.2809113831385472e-05, + "loss": 0.1279, + "num_input_tokens_seen": 54218968, + "step": 93450 + }, + { + "epoch": 13.919422103068216, + "grad_norm": 0.006225362420082092, + "learning_rate": 1.2806277035587256e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54221720, + "step": 93455 + }, + { + "epoch": 13.920166815609175, + "grad_norm": 0.0051516881212592125, + "learning_rate": 1.2803440445789594e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54224248, + "step": 93460 + }, + { + "epoch": 13.920911528150134, + "grad_norm": 0.005753797013312578, + "learning_rate": 1.2800604062040403e-05, + "loss": 0.0, + "num_input_tokens_seen": 54227000, + "step": 93465 + }, + { + "epoch": 13.921656240691092, + "grad_norm": 0.01104716770350933, + "learning_rate": 1.2797767884387615e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54229816, + "step": 93470 + }, + { + "epoch": 13.922400953232053, + "grad_norm": 0.00413795281201601, + "learning_rate": 1.2794931912879127e-05, + "loss": 0.0, + "num_input_tokens_seen": 54232824, + "step": 93475 + }, + { + "epoch": 13.923145665773012, + "grad_norm": 0.0016348108183592558, + "learning_rate": 1.2792096147562872e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54235640, + "step": 93480 + }, + { + "epoch": 13.92389037831397, + "grad_norm": 0.0002740419877227396, + "learning_rate": 1.2789260588486735e-05, + "loss": 0.0545, + "num_input_tokens_seen": 54238392, + "step": 93485 + }, + { + "epoch": 13.92463509085493, + "grad_norm": 0.005882977042347193, + "learning_rate": 1.2786425235698634e-05, + "loss": 0.1128, + "num_input_tokens_seen": 54241336, + "step": 93490 + }, + { + "epoch": 13.92537980339589, + "grad_norm": 0.00859018787741661, + "learning_rate": 1.2783590089246473e-05, + "loss": 0.0, + "num_input_tokens_seen": 54244504, + "step": 93495 + }, + { + "epoch": 13.926124515936849, + "grad_norm": 0.0019172565080225468, + "learning_rate": 1.2780755149178136e-05, + "loss": 0.1626, + "num_input_tokens_seen": 54247320, + "step": 93500 + }, + { + "epoch": 13.926869228477807, + "grad_norm": 0.01686951518058777, + "learning_rate": 1.2777920415541514e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54250456, + "step": 93505 + }, + { + "epoch": 13.927613941018766, + "grad_norm": 0.0017130918568000197, + "learning_rate": 1.2775085888384514e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54253432, + "step": 93510 + }, + { + "epoch": 13.928358653559727, + "grad_norm": 0.0057576545514166355, + "learning_rate": 1.2772251567755011e-05, + "loss": 0.0009, + "num_input_tokens_seen": 54256152, + "step": 93515 + }, + { + "epoch": 13.929103366100685, + "grad_norm": 7.60077428817749, + "learning_rate": 1.2769417453700882e-05, + "loss": 0.0552, + "num_input_tokens_seen": 54259096, + "step": 93520 + }, + { + "epoch": 13.929848078641644, + "grad_norm": 0.01331732701510191, + "learning_rate": 1.2766583546270027e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54262232, + "step": 93525 + }, + { + "epoch": 13.930592791182603, + "grad_norm": 0.08798135071992874, + "learning_rate": 1.2763749845510297e-05, + "loss": 0.0287, + "num_input_tokens_seen": 54265368, + "step": 93530 + }, + { + "epoch": 13.931337503723563, + "grad_norm": 0.00428807083517313, + "learning_rate": 1.2760916351469588e-05, + "loss": 0.0009, + "num_input_tokens_seen": 54268120, + "step": 93535 + }, + { + "epoch": 13.932082216264522, + "grad_norm": 0.002985662315040827, + "learning_rate": 1.2758083064195756e-05, + "loss": 0.0329, + "num_input_tokens_seen": 54270776, + "step": 93540 + }, + { + "epoch": 13.932826928805481, + "grad_norm": 0.010511672124266624, + "learning_rate": 1.2755249983736662e-05, + "loss": 0.0, + "num_input_tokens_seen": 54273752, + "step": 93545 + }, + { + "epoch": 13.93357164134644, + "grad_norm": 0.013411762192845345, + "learning_rate": 1.275241711014018e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54276632, + "step": 93550 + }, + { + "epoch": 13.9343163538874, + "grad_norm": 0.010106267407536507, + "learning_rate": 1.2749584443454154e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54279640, + "step": 93555 + }, + { + "epoch": 13.935061066428359, + "grad_norm": 1.8596484661102295, + "learning_rate": 1.2746751983726459e-05, + "loss": 0.066, + "num_input_tokens_seen": 54282488, + "step": 93560 + }, + { + "epoch": 13.935805778969318, + "grad_norm": 0.011919393204152584, + "learning_rate": 1.2743919731004938e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54285272, + "step": 93565 + }, + { + "epoch": 13.936550491510276, + "grad_norm": 0.022325722500681877, + "learning_rate": 1.2741087685337432e-05, + "loss": 0.0005, + "num_input_tokens_seen": 54288600, + "step": 93570 + }, + { + "epoch": 13.937295204051237, + "grad_norm": 0.010731109417974949, + "learning_rate": 1.2738255846771785e-05, + "loss": 0.0814, + "num_input_tokens_seen": 54291576, + "step": 93575 + }, + { + "epoch": 13.938039916592196, + "grad_norm": 0.0008942650747485459, + "learning_rate": 1.273542421535585e-05, + "loss": 0.0, + "num_input_tokens_seen": 54294488, + "step": 93580 + }, + { + "epoch": 13.938784629133155, + "grad_norm": 0.014684023335576057, + "learning_rate": 1.273259279113746e-05, + "loss": 0.0004, + "num_input_tokens_seen": 54297208, + "step": 93585 + }, + { + "epoch": 13.939529341674113, + "grad_norm": 0.004696881864219904, + "learning_rate": 1.2729761574164434e-05, + "loss": 0.0224, + "num_input_tokens_seen": 54300184, + "step": 93590 + }, + { + "epoch": 13.940274054215074, + "grad_norm": 0.0027468346524983644, + "learning_rate": 1.2726930564484627e-05, + "loss": 0.0, + "num_input_tokens_seen": 54303096, + "step": 93595 + }, + { + "epoch": 13.941018766756033, + "grad_norm": 0.037576060742139816, + "learning_rate": 1.2724099762145841e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54305848, + "step": 93600 + }, + { + "epoch": 13.941763479296991, + "grad_norm": 0.0047868164256215096, + "learning_rate": 1.2721269167195926e-05, + "loss": 0.0174, + "num_input_tokens_seen": 54308760, + "step": 93605 + }, + { + "epoch": 13.94250819183795, + "grad_norm": 0.0019991241861134768, + "learning_rate": 1.2718438779682678e-05, + "loss": 0.0022, + "num_input_tokens_seen": 54311928, + "step": 93610 + }, + { + "epoch": 13.943252904378909, + "grad_norm": 0.021520474925637245, + "learning_rate": 1.2715608599653938e-05, + "loss": 0.1284, + "num_input_tokens_seen": 54315000, + "step": 93615 + }, + { + "epoch": 13.94399761691987, + "grad_norm": 0.0016137489583343267, + "learning_rate": 1.271277862715749e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54317688, + "step": 93620 + }, + { + "epoch": 13.944742329460828, + "grad_norm": 0.00028913136338815093, + "learning_rate": 1.2709948862241173e-05, + "loss": 0.0, + "num_input_tokens_seen": 54320568, + "step": 93625 + }, + { + "epoch": 13.945487042001787, + "grad_norm": 0.0012976506259292364, + "learning_rate": 1.2707119304952777e-05, + "loss": 0.0006, + "num_input_tokens_seen": 54323512, + "step": 93630 + }, + { + "epoch": 13.946231754542747, + "grad_norm": 0.007378123700618744, + "learning_rate": 1.2704289955340107e-05, + "loss": 0.0, + "num_input_tokens_seen": 54326296, + "step": 93635 + }, + { + "epoch": 13.946976467083706, + "grad_norm": 0.053363848477602005, + "learning_rate": 1.270146081345096e-05, + "loss": 0.0, + "num_input_tokens_seen": 54328952, + "step": 93640 + }, + { + "epoch": 13.947721179624665, + "grad_norm": 0.0012725221458822489, + "learning_rate": 1.2698631879333126e-05, + "loss": 0.0003, + "num_input_tokens_seen": 54331736, + "step": 93645 + }, + { + "epoch": 13.948465892165624, + "grad_norm": 0.0004156163486186415, + "learning_rate": 1.2695803153034411e-05, + "loss": 0.0, + "num_input_tokens_seen": 54334712, + "step": 93650 + }, + { + "epoch": 13.949210604706582, + "grad_norm": 0.1708039492368698, + "learning_rate": 1.2692974634602586e-05, + "loss": 0.0008, + "num_input_tokens_seen": 54337560, + "step": 93655 + }, + { + "epoch": 13.949955317247543, + "grad_norm": 0.0008746019448153675, + "learning_rate": 1.2690146324085458e-05, + "loss": 0.0, + "num_input_tokens_seen": 54340344, + "step": 93660 + }, + { + "epoch": 13.950700029788502, + "grad_norm": 0.041711676865816116, + "learning_rate": 1.2687318221530797e-05, + "loss": 0.0008, + "num_input_tokens_seen": 54343224, + "step": 93665 + }, + { + "epoch": 13.95144474232946, + "grad_norm": 0.00010784734331537038, + "learning_rate": 1.268449032698637e-05, + "loss": 0.0, + "num_input_tokens_seen": 54346296, + "step": 93670 + }, + { + "epoch": 13.95218945487042, + "grad_norm": 0.0011284821666777134, + "learning_rate": 1.2681662640499969e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54349304, + "step": 93675 + }, + { + "epoch": 13.95293416741138, + "grad_norm": 0.00031807945924811065, + "learning_rate": 1.2678835162119352e-05, + "loss": 0.0, + "num_input_tokens_seen": 54352472, + "step": 93680 + }, + { + "epoch": 13.953678879952339, + "grad_norm": 112.40003204345703, + "learning_rate": 1.26760078918923e-05, + "loss": 0.1339, + "num_input_tokens_seen": 54355352, + "step": 93685 + }, + { + "epoch": 13.954423592493297, + "grad_norm": 0.00617244653403759, + "learning_rate": 1.267318082986656e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54358232, + "step": 93690 + }, + { + "epoch": 13.955168305034256, + "grad_norm": 144.52557373046875, + "learning_rate": 1.267035397608991e-05, + "loss": 0.279, + "num_input_tokens_seen": 54361176, + "step": 93695 + }, + { + "epoch": 13.955913017575217, + "grad_norm": 0.00461725564673543, + "learning_rate": 1.2667527330610101e-05, + "loss": 0.1876, + "num_input_tokens_seen": 54364184, + "step": 93700 + }, + { + "epoch": 13.956657730116175, + "grad_norm": 0.000220479050767608, + "learning_rate": 1.2664700893474884e-05, + "loss": 0.0005, + "num_input_tokens_seen": 54366904, + "step": 93705 + }, + { + "epoch": 13.957402442657134, + "grad_norm": 0.005064599681645632, + "learning_rate": 1.2661874664732004e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54369848, + "step": 93710 + }, + { + "epoch": 13.958147155198093, + "grad_norm": 9.8729887008667, + "learning_rate": 1.2659048644429205e-05, + "loss": 0.0026, + "num_input_tokens_seen": 54372760, + "step": 93715 + }, + { + "epoch": 13.958891867739053, + "grad_norm": 0.014087161980569363, + "learning_rate": 1.2656222832614245e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54375544, + "step": 93720 + }, + { + "epoch": 13.959636580280012, + "grad_norm": 0.010768129490315914, + "learning_rate": 1.2653397229334846e-05, + "loss": 0.0, + "num_input_tokens_seen": 54378680, + "step": 93725 + }, + { + "epoch": 13.960381292820971, + "grad_norm": 0.0061697340570390224, + "learning_rate": 1.2650571834638764e-05, + "loss": 0.0, + "num_input_tokens_seen": 54381592, + "step": 93730 + }, + { + "epoch": 13.96112600536193, + "grad_norm": 0.0021228264085948467, + "learning_rate": 1.2647746648573705e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54384664, + "step": 93735 + }, + { + "epoch": 13.96187071790289, + "grad_norm": 0.0009940832387655973, + "learning_rate": 1.2644921671187424e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54387480, + "step": 93740 + }, + { + "epoch": 13.962615430443849, + "grad_norm": 0.0004707763437181711, + "learning_rate": 1.2642096902527633e-05, + "loss": 0.0, + "num_input_tokens_seen": 54390328, + "step": 93745 + }, + { + "epoch": 13.963360142984808, + "grad_norm": 0.000490996113512665, + "learning_rate": 1.2639272342642047e-05, + "loss": 0.0, + "num_input_tokens_seen": 54393304, + "step": 93750 + }, + { + "epoch": 13.964104855525767, + "grad_norm": 0.012903368100523949, + "learning_rate": 1.2636447991578401e-05, + "loss": 0.0755, + "num_input_tokens_seen": 54396312, + "step": 93755 + }, + { + "epoch": 13.964849568066727, + "grad_norm": 0.0011486014118418097, + "learning_rate": 1.263362384938439e-05, + "loss": 0.1533, + "num_input_tokens_seen": 54399288, + "step": 93760 + }, + { + "epoch": 13.965594280607686, + "grad_norm": 0.07541963458061218, + "learning_rate": 1.2630799916107747e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54402296, + "step": 93765 + }, + { + "epoch": 13.966338993148645, + "grad_norm": 0.003670821199193597, + "learning_rate": 1.2627976191796165e-05, + "loss": 0.0, + "num_input_tokens_seen": 54405016, + "step": 93770 + }, + { + "epoch": 13.967083705689603, + "grad_norm": 0.01172658335417509, + "learning_rate": 1.2625152676497354e-05, + "loss": 0.1099, + "num_input_tokens_seen": 54407640, + "step": 93775 + }, + { + "epoch": 13.967828418230564, + "grad_norm": 0.0045897094532847404, + "learning_rate": 1.2622329370259001e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54410456, + "step": 93780 + }, + { + "epoch": 13.968573130771523, + "grad_norm": 0.010757770389318466, + "learning_rate": 1.261950627312882e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54413240, + "step": 93785 + }, + { + "epoch": 13.969317843312481, + "grad_norm": 5.715088627766818e-05, + "learning_rate": 1.2616683385154498e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54415992, + "step": 93790 + }, + { + "epoch": 13.97006255585344, + "grad_norm": 0.01000403892248869, + "learning_rate": 1.2613860706383718e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54418744, + "step": 93795 + }, + { + "epoch": 13.970807268394399, + "grad_norm": 0.06914032995700836, + "learning_rate": 1.261103823686418e-05, + "loss": 0.0, + "num_input_tokens_seen": 54421496, + "step": 93800 + }, + { + "epoch": 13.97155198093536, + "grad_norm": 0.009051114320755005, + "learning_rate": 1.260821597664355e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54424376, + "step": 93805 + }, + { + "epoch": 13.972296693476318, + "grad_norm": 0.01093698013573885, + "learning_rate": 1.2605393925769526e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54427320, + "step": 93810 + }, + { + "epoch": 13.973041406017277, + "grad_norm": 0.004608784802258015, + "learning_rate": 1.2602572084289765e-05, + "loss": 0.0, + "num_input_tokens_seen": 54429880, + "step": 93815 + }, + { + "epoch": 13.973786118558236, + "grad_norm": 0.0003783847496379167, + "learning_rate": 1.259975045225196e-05, + "loss": 0.1315, + "num_input_tokens_seen": 54432472, + "step": 93820 + }, + { + "epoch": 13.974530831099196, + "grad_norm": 0.0003102326299995184, + "learning_rate": 1.2596929029703766e-05, + "loss": 0.0, + "num_input_tokens_seen": 54435128, + "step": 93825 + }, + { + "epoch": 13.975275543640155, + "grad_norm": 0.0009399197297170758, + "learning_rate": 1.2594107816692852e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54438104, + "step": 93830 + }, + { + "epoch": 13.976020256181114, + "grad_norm": 0.009009872563183308, + "learning_rate": 1.2591286813266867e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54440984, + "step": 93835 + }, + { + "epoch": 13.976764968722073, + "grad_norm": 0.0023501806426793337, + "learning_rate": 1.2588466019473488e-05, + "loss": 0.0, + "num_input_tokens_seen": 54443864, + "step": 93840 + }, + { + "epoch": 13.977509681263033, + "grad_norm": 0.00457766093313694, + "learning_rate": 1.2585645435360361e-05, + "loss": 0.0679, + "num_input_tokens_seen": 54446680, + "step": 93845 + }, + { + "epoch": 13.978254393803992, + "grad_norm": 0.0015015508979558945, + "learning_rate": 1.2582825060975128e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54449592, + "step": 93850 + }, + { + "epoch": 13.97899910634495, + "grad_norm": 0.0008052302291616797, + "learning_rate": 1.2580004896365455e-05, + "loss": 0.0004, + "num_input_tokens_seen": 54452600, + "step": 93855 + }, + { + "epoch": 13.97974381888591, + "grad_norm": 0.0032960069365799427, + "learning_rate": 1.2577184941578968e-05, + "loss": 0.0, + "num_input_tokens_seen": 54455384, + "step": 93860 + }, + { + "epoch": 13.98048853142687, + "grad_norm": 0.00047069700667634606, + "learning_rate": 1.2574365196663324e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54458712, + "step": 93865 + }, + { + "epoch": 13.981233243967829, + "grad_norm": 0.24771477282047272, + "learning_rate": 1.2571545661666151e-05, + "loss": 0.0004, + "num_input_tokens_seen": 54461624, + "step": 93870 + }, + { + "epoch": 13.981977956508787, + "grad_norm": 0.0010601583635434508, + "learning_rate": 1.2568726336635073e-05, + "loss": 0.0856, + "num_input_tokens_seen": 54464792, + "step": 93875 + }, + { + "epoch": 13.982722669049746, + "grad_norm": 13.776100158691406, + "learning_rate": 1.2565907221617738e-05, + "loss": 0.1049, + "num_input_tokens_seen": 54467960, + "step": 93880 + }, + { + "epoch": 13.983467381590707, + "grad_norm": 0.002354710828512907, + "learning_rate": 1.2563088316661753e-05, + "loss": 0.2505, + "num_input_tokens_seen": 54470680, + "step": 93885 + }, + { + "epoch": 13.984212094131665, + "grad_norm": 0.0009128056699410081, + "learning_rate": 1.256026962181476e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54473528, + "step": 93890 + }, + { + "epoch": 13.984956806672624, + "grad_norm": 0.0022909091785550117, + "learning_rate": 1.255745113712437e-05, + "loss": 0.0, + "num_input_tokens_seen": 54476184, + "step": 93895 + }, + { + "epoch": 13.985701519213583, + "grad_norm": 0.011610288172960281, + "learning_rate": 1.2554632862638197e-05, + "loss": 0.0, + "num_input_tokens_seen": 54479128, + "step": 93900 + }, + { + "epoch": 13.986446231754543, + "grad_norm": 0.004219639115035534, + "learning_rate": 1.2551814798403851e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54481816, + "step": 93905 + }, + { + "epoch": 13.987190944295502, + "grad_norm": 0.010697160847485065, + "learning_rate": 1.2548996944468935e-05, + "loss": 0.0225, + "num_input_tokens_seen": 54484920, + "step": 93910 + }, + { + "epoch": 13.987935656836461, + "grad_norm": 0.024776272475719452, + "learning_rate": 1.254617930088107e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54487512, + "step": 93915 + }, + { + "epoch": 13.98868036937742, + "grad_norm": 0.0003249350702390075, + "learning_rate": 1.2543361867687836e-05, + "loss": 0.0012, + "num_input_tokens_seen": 54489976, + "step": 93920 + }, + { + "epoch": 13.98942508191838, + "grad_norm": 0.0008329250849783421, + "learning_rate": 1.2540544644936858e-05, + "loss": 0.0, + "num_input_tokens_seen": 54492632, + "step": 93925 + }, + { + "epoch": 13.990169794459339, + "grad_norm": 0.005017279181629419, + "learning_rate": 1.2537727632675699e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54495416, + "step": 93930 + }, + { + "epoch": 13.990914507000298, + "grad_norm": 0.013921703211963177, + "learning_rate": 1.253491083095198e-05, + "loss": 0.0016, + "num_input_tokens_seen": 54498136, + "step": 93935 + }, + { + "epoch": 13.991659219541257, + "grad_norm": 0.029385462403297424, + "learning_rate": 1.253209423981326e-05, + "loss": 0.0802, + "num_input_tokens_seen": 54501208, + "step": 93940 + }, + { + "epoch": 13.992403932082215, + "grad_norm": 0.0003681684611365199, + "learning_rate": 1.2529277859307148e-05, + "loss": 0.0, + "num_input_tokens_seen": 54504120, + "step": 93945 + }, + { + "epoch": 13.993148644623176, + "grad_norm": 0.004993804730474949, + "learning_rate": 1.2526461689481212e-05, + "loss": 0.0007, + "num_input_tokens_seen": 54507128, + "step": 93950 + }, + { + "epoch": 13.993893357164135, + "grad_norm": 0.03268881142139435, + "learning_rate": 1.2523645730383018e-05, + "loss": 0.0015, + "num_input_tokens_seen": 54510008, + "step": 93955 + }, + { + "epoch": 13.994638069705093, + "grad_norm": 0.0035284929908812046, + "learning_rate": 1.2520829982060162e-05, + "loss": 0.0, + "num_input_tokens_seen": 54513080, + "step": 93960 + }, + { + "epoch": 13.995382782246054, + "grad_norm": 29.725133895874023, + "learning_rate": 1.2518014444560195e-05, + "loss": 0.038, + "num_input_tokens_seen": 54515768, + "step": 93965 + }, + { + "epoch": 13.996127494787013, + "grad_norm": 0.005223363637924194, + "learning_rate": 1.251519911793069e-05, + "loss": 0.0023, + "num_input_tokens_seen": 54518552, + "step": 93970 + }, + { + "epoch": 13.996872207327971, + "grad_norm": 0.23441199958324432, + "learning_rate": 1.2512384002219196e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54521368, + "step": 93975 + }, + { + "epoch": 13.99761691986893, + "grad_norm": 0.00219594850204885, + "learning_rate": 1.2509569097473295e-05, + "loss": 0.0041, + "num_input_tokens_seen": 54524472, + "step": 93980 + }, + { + "epoch": 13.998361632409889, + "grad_norm": 0.02368941530585289, + "learning_rate": 1.2506754403740529e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54527512, + "step": 93985 + }, + { + "epoch": 13.99910634495085, + "grad_norm": 0.0002254378778161481, + "learning_rate": 1.2503939921068435e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54530296, + "step": 93990 + }, + { + "epoch": 13.999851057491808, + "grad_norm": 18.1660099029541, + "learning_rate": 1.2501125649504591e-05, + "loss": 0.1223, + "num_input_tokens_seen": 54533016, + "step": 93995 + }, + { + "epoch": 14.0, + "eval_loss": 2.2301673889160156, + "eval_runtime": 51.3351, + "eval_samples_per_second": 58.128, + "eval_steps_per_second": 14.532, + "num_input_tokens_seen": 54533112, + "step": 93996 + }, + { + "epoch": 14.000595770032767, + "grad_norm": 0.007977981120347977, + "learning_rate": 1.2498311589096514e-05, + "loss": 0.0005, + "num_input_tokens_seen": 54535352, + "step": 94000 + }, + { + "epoch": 14.001340482573726, + "grad_norm": 0.0002214400446973741, + "learning_rate": 1.2495497739891764e-05, + "loss": 0.0589, + "num_input_tokens_seen": 54538328, + "step": 94005 + }, + { + "epoch": 14.002085195114686, + "grad_norm": 0.006509752478450537, + "learning_rate": 1.2492684101937865e-05, + "loss": 0.273, + "num_input_tokens_seen": 54541272, + "step": 94010 + }, + { + "epoch": 14.002829907655645, + "grad_norm": 0.00013495265739038587, + "learning_rate": 1.2489870675282364e-05, + "loss": 0.0259, + "num_input_tokens_seen": 54543960, + "step": 94015 + }, + { + "epoch": 14.003574620196604, + "grad_norm": 0.02371416613459587, + "learning_rate": 1.2487057459972775e-05, + "loss": 0.0, + "num_input_tokens_seen": 54546840, + "step": 94020 + }, + { + "epoch": 14.004319332737563, + "grad_norm": 0.007573126349598169, + "learning_rate": 1.248424445605664e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54549592, + "step": 94025 + }, + { + "epoch": 14.005064045278523, + "grad_norm": 0.0010191339533776045, + "learning_rate": 1.2481431663581474e-05, + "loss": 0.0, + "num_input_tokens_seen": 54552568, + "step": 94030 + }, + { + "epoch": 14.005808757819482, + "grad_norm": 0.018793795257806778, + "learning_rate": 1.24786190825948e-05, + "loss": 0.0003, + "num_input_tokens_seen": 54555608, + "step": 94035 + }, + { + "epoch": 14.00655347036044, + "grad_norm": 0.0012186899548396468, + "learning_rate": 1.2475806713144128e-05, + "loss": 0.0, + "num_input_tokens_seen": 54558904, + "step": 94040 + }, + { + "epoch": 14.0072981829014, + "grad_norm": 0.0014909004094079137, + "learning_rate": 1.2472994555276964e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54561912, + "step": 94045 + }, + { + "epoch": 14.00804289544236, + "grad_norm": 5.29918797838036e-05, + "learning_rate": 1.2470182609040833e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54564920, + "step": 94050 + }, + { + "epoch": 14.008787607983319, + "grad_norm": 0.00013040995690971613, + "learning_rate": 1.2467370874483225e-05, + "loss": 0.0, + "num_input_tokens_seen": 54567896, + "step": 94055 + }, + { + "epoch": 14.009532320524277, + "grad_norm": 0.0005491552292369306, + "learning_rate": 1.2464559351651658e-05, + "loss": 0.0, + "num_input_tokens_seen": 54570360, + "step": 94060 + }, + { + "epoch": 14.010277033065236, + "grad_norm": 0.027547260746359825, + "learning_rate": 1.2461748040593621e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54573144, + "step": 94065 + }, + { + "epoch": 14.011021745606197, + "grad_norm": 0.00036661839112639427, + "learning_rate": 1.2458936941356594e-05, + "loss": 0.0, + "num_input_tokens_seen": 54576024, + "step": 94070 + }, + { + "epoch": 14.011766458147155, + "grad_norm": 0.0003391127975191921, + "learning_rate": 1.2456126053988093e-05, + "loss": 0.023, + "num_input_tokens_seen": 54578712, + "step": 94075 + }, + { + "epoch": 14.012511170688114, + "grad_norm": 0.008099477738142014, + "learning_rate": 1.2453315378535584e-05, + "loss": 0.0635, + "num_input_tokens_seen": 54581784, + "step": 94080 + }, + { + "epoch": 14.013255883229073, + "grad_norm": 0.002482001204043627, + "learning_rate": 1.245050491504657e-05, + "loss": 0.0, + "num_input_tokens_seen": 54584760, + "step": 94085 + }, + { + "epoch": 14.014000595770034, + "grad_norm": 0.03646041080355644, + "learning_rate": 1.2447694663568509e-05, + "loss": 0.0153, + "num_input_tokens_seen": 54587832, + "step": 94090 + }, + { + "epoch": 14.014745308310992, + "grad_norm": 0.02474779635667801, + "learning_rate": 1.24448846241489e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54590520, + "step": 94095 + }, + { + "epoch": 14.015490020851951, + "grad_norm": 0.0014871250605210662, + "learning_rate": 1.2442074796835206e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54593304, + "step": 94100 + }, + { + "epoch": 14.01623473339291, + "grad_norm": 0.00605337368324399, + "learning_rate": 1.2439265181674895e-05, + "loss": 0.0, + "num_input_tokens_seen": 54595960, + "step": 94105 + }, + { + "epoch": 14.01697944593387, + "grad_norm": 2.3111324310302734, + "learning_rate": 1.2436455778715431e-05, + "loss": 0.0029, + "num_input_tokens_seen": 54599064, + "step": 94110 + }, + { + "epoch": 14.017724158474829, + "grad_norm": 0.14216206967830658, + "learning_rate": 1.2433646588004266e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54601912, + "step": 94115 + }, + { + "epoch": 14.018468871015788, + "grad_norm": 0.0024783373810350895, + "learning_rate": 1.2430837609588883e-05, + "loss": 0.0008, + "num_input_tokens_seen": 54604856, + "step": 94120 + }, + { + "epoch": 14.019213583556747, + "grad_norm": 0.008287911303341389, + "learning_rate": 1.2428028843516715e-05, + "loss": 0.1657, + "num_input_tokens_seen": 54607832, + "step": 94125 + }, + { + "epoch": 14.019958296097707, + "grad_norm": 0.01657557301223278, + "learning_rate": 1.2425220289835229e-05, + "loss": 0.0, + "num_input_tokens_seen": 54610584, + "step": 94130 + }, + { + "epoch": 14.020703008638666, + "grad_norm": 0.0011994007509201765, + "learning_rate": 1.2422411948591855e-05, + "loss": 0.0, + "num_input_tokens_seen": 54613560, + "step": 94135 + }, + { + "epoch": 14.021447721179625, + "grad_norm": 0.0015289254952222109, + "learning_rate": 1.241960381983406e-05, + "loss": 0.0041, + "num_input_tokens_seen": 54616952, + "step": 94140 + }, + { + "epoch": 14.022192433720583, + "grad_norm": 0.0021764012053608894, + "learning_rate": 1.2416795903609274e-05, + "loss": 0.0, + "num_input_tokens_seen": 54620024, + "step": 94145 + }, + { + "epoch": 14.022937146261542, + "grad_norm": 0.0006101694889366627, + "learning_rate": 1.2413988199964918e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54622936, + "step": 94150 + }, + { + "epoch": 14.023681858802503, + "grad_norm": 0.0007549130241386592, + "learning_rate": 1.2411180708948453e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54626040, + "step": 94155 + }, + { + "epoch": 14.024426571343461, + "grad_norm": 0.3620013892650604, + "learning_rate": 1.2408373430607296e-05, + "loss": 0.0004, + "num_input_tokens_seen": 54629016, + "step": 94160 + }, + { + "epoch": 14.02517128388442, + "grad_norm": 0.002568033756688237, + "learning_rate": 1.2405566364988857e-05, + "loss": 0.0455, + "num_input_tokens_seen": 54631800, + "step": 94165 + }, + { + "epoch": 14.025915996425379, + "grad_norm": 0.0005742140347138047, + "learning_rate": 1.2402759512140588e-05, + "loss": 0.0, + "num_input_tokens_seen": 54634520, + "step": 94170 + }, + { + "epoch": 14.02666070896634, + "grad_norm": 0.0010963742388412356, + "learning_rate": 1.2399952872109893e-05, + "loss": 0.0, + "num_input_tokens_seen": 54637400, + "step": 94175 + }, + { + "epoch": 14.027405421507298, + "grad_norm": 0.001890445128083229, + "learning_rate": 1.239714644494418e-05, + "loss": 0.0, + "num_input_tokens_seen": 54640184, + "step": 94180 + }, + { + "epoch": 14.028150134048257, + "grad_norm": 0.00010711339564295486, + "learning_rate": 1.2394340230690877e-05, + "loss": 0.0, + "num_input_tokens_seen": 54643032, + "step": 94185 + }, + { + "epoch": 14.028894846589216, + "grad_norm": 0.0022651248145848513, + "learning_rate": 1.2391534229397384e-05, + "loss": 0.0, + "num_input_tokens_seen": 54645848, + "step": 94190 + }, + { + "epoch": 14.029639559130176, + "grad_norm": 8.49030876159668, + "learning_rate": 1.2388728441111095e-05, + "loss": 0.107, + "num_input_tokens_seen": 54648920, + "step": 94195 + }, + { + "epoch": 14.030384271671135, + "grad_norm": 0.016398968175053596, + "learning_rate": 1.2385922865879432e-05, + "loss": 0.0, + "num_input_tokens_seen": 54651768, + "step": 94200 + }, + { + "epoch": 14.031128984212094, + "grad_norm": 0.0001487189729232341, + "learning_rate": 1.2383117503749769e-05, + "loss": 0.0, + "num_input_tokens_seen": 54654808, + "step": 94205 + }, + { + "epoch": 14.031873696753053, + "grad_norm": 0.015545069240033627, + "learning_rate": 1.2380312354769526e-05, + "loss": 0.0884, + "num_input_tokens_seen": 54657848, + "step": 94210 + }, + { + "epoch": 14.032618409294013, + "grad_norm": 0.002091950736939907, + "learning_rate": 1.2377507418986071e-05, + "loss": 0.0, + "num_input_tokens_seen": 54660760, + "step": 94215 + }, + { + "epoch": 14.033363121834972, + "grad_norm": 0.002631931100040674, + "learning_rate": 1.2374702696446806e-05, + "loss": 0.0, + "num_input_tokens_seen": 54663544, + "step": 94220 + }, + { + "epoch": 14.03410783437593, + "grad_norm": 0.00021652602299582213, + "learning_rate": 1.2371898187199108e-05, + "loss": 0.0, + "num_input_tokens_seen": 54666296, + "step": 94225 + }, + { + "epoch": 14.03485254691689, + "grad_norm": 0.006894403602927923, + "learning_rate": 1.2369093891290357e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54669240, + "step": 94230 + }, + { + "epoch": 14.03559725945785, + "grad_norm": 0.002005362417548895, + "learning_rate": 1.2366289808767926e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54672088, + "step": 94235 + }, + { + "epoch": 14.036341971998809, + "grad_norm": 0.0012652885634452105, + "learning_rate": 1.2363485939679175e-05, + "loss": 0.1314, + "num_input_tokens_seen": 54674872, + "step": 94240 + }, + { + "epoch": 14.037086684539767, + "grad_norm": 0.0002825958945322782, + "learning_rate": 1.23606822840715e-05, + "loss": 0.0, + "num_input_tokens_seen": 54677624, + "step": 94245 + }, + { + "epoch": 14.037831397080726, + "grad_norm": 0.025041896849870682, + "learning_rate": 1.2357878841992243e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54680504, + "step": 94250 + }, + { + "epoch": 14.038576109621687, + "grad_norm": 0.001666609663516283, + "learning_rate": 1.2355075613488782e-05, + "loss": 0.0, + "num_input_tokens_seen": 54683896, + "step": 94255 + }, + { + "epoch": 14.039320822162646, + "grad_norm": 0.00026833690935745835, + "learning_rate": 1.2352272598608455e-05, + "loss": 0.0311, + "num_input_tokens_seen": 54687064, + "step": 94260 + }, + { + "epoch": 14.040065534703604, + "grad_norm": 0.13055825233459473, + "learning_rate": 1.234946979739864e-05, + "loss": 0.0, + "num_input_tokens_seen": 54690072, + "step": 94265 + }, + { + "epoch": 14.040810247244563, + "grad_norm": 0.12585504353046417, + "learning_rate": 1.2346667209906677e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54693016, + "step": 94270 + }, + { + "epoch": 14.041554959785524, + "grad_norm": 0.0001277788687730208, + "learning_rate": 1.23438648361799e-05, + "loss": 0.0, + "num_input_tokens_seen": 54695960, + "step": 94275 + }, + { + "epoch": 14.042299672326482, + "grad_norm": 0.0063047390431165695, + "learning_rate": 1.2341062676265671e-05, + "loss": 0.0008, + "num_input_tokens_seen": 54698904, + "step": 94280 + }, + { + "epoch": 14.043044384867441, + "grad_norm": 0.00010951502918032929, + "learning_rate": 1.2338260730211316e-05, + "loss": 0.0, + "num_input_tokens_seen": 54701944, + "step": 94285 + }, + { + "epoch": 14.0437890974084, + "grad_norm": 0.0014253199333325028, + "learning_rate": 1.2335458998064184e-05, + "loss": 0.1626, + "num_input_tokens_seen": 54704760, + "step": 94290 + }, + { + "epoch": 14.04453380994936, + "grad_norm": 227.80885314941406, + "learning_rate": 1.23326574798716e-05, + "loss": 0.019, + "num_input_tokens_seen": 54707576, + "step": 94295 + }, + { + "epoch": 14.04527852249032, + "grad_norm": 0.019682690501213074, + "learning_rate": 1.2329856175680896e-05, + "loss": 0.0331, + "num_input_tokens_seen": 54710584, + "step": 94300 + }, + { + "epoch": 14.046023235031278, + "grad_norm": 0.0010727489134296775, + "learning_rate": 1.2327055085539382e-05, + "loss": 0.0004, + "num_input_tokens_seen": 54713240, + "step": 94305 + }, + { + "epoch": 14.046767947572237, + "grad_norm": 0.04288706183433533, + "learning_rate": 1.2324254209494405e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54715960, + "step": 94310 + }, + { + "epoch": 14.047512660113195, + "grad_norm": 0.004619448445737362, + "learning_rate": 1.2321453547593267e-05, + "loss": 0.0, + "num_input_tokens_seen": 54719032, + "step": 94315 + }, + { + "epoch": 14.048257372654156, + "grad_norm": 2.534564256668091, + "learning_rate": 1.2318653099883278e-05, + "loss": 0.0052, + "num_input_tokens_seen": 54722168, + "step": 94320 + }, + { + "epoch": 14.049002085195115, + "grad_norm": 0.0022241822443902493, + "learning_rate": 1.2315852866411767e-05, + "loss": 0.1346, + "num_input_tokens_seen": 54724984, + "step": 94325 + }, + { + "epoch": 14.049746797736073, + "grad_norm": 0.011303272098302841, + "learning_rate": 1.2313052847226018e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54727832, + "step": 94330 + }, + { + "epoch": 14.050491510277032, + "grad_norm": 0.004505950026214123, + "learning_rate": 1.2310253042373356e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54730744, + "step": 94335 + }, + { + "epoch": 14.051236222817993, + "grad_norm": 0.0044071851298213005, + "learning_rate": 1.2307453451901063e-05, + "loss": 0.0, + "num_input_tokens_seen": 54733496, + "step": 94340 + }, + { + "epoch": 14.051980935358952, + "grad_norm": 0.000941790291108191, + "learning_rate": 1.2304654075856452e-05, + "loss": 0.1252, + "num_input_tokens_seen": 54736344, + "step": 94345 + }, + { + "epoch": 14.05272564789991, + "grad_norm": 0.0004981184029020369, + "learning_rate": 1.2301854914286812e-05, + "loss": 0.0, + "num_input_tokens_seen": 54739480, + "step": 94350 + }, + { + "epoch": 14.053470360440869, + "grad_norm": 0.0019327359041199088, + "learning_rate": 1.2299055967239415e-05, + "loss": 0.0, + "num_input_tokens_seen": 54742264, + "step": 94355 + }, + { + "epoch": 14.05421507298183, + "grad_norm": 0.00010170057794312015, + "learning_rate": 1.2296257234761566e-05, + "loss": 0.0, + "num_input_tokens_seen": 54745080, + "step": 94360 + }, + { + "epoch": 14.054959785522788, + "grad_norm": 0.770203173160553, + "learning_rate": 1.2293458716900543e-05, + "loss": 0.0045, + "num_input_tokens_seen": 54748088, + "step": 94365 + }, + { + "epoch": 14.055704498063747, + "grad_norm": 0.0013543280074372888, + "learning_rate": 1.229066041370362e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54751064, + "step": 94370 + }, + { + "epoch": 14.056449210604706, + "grad_norm": 4.044289016746916e-05, + "learning_rate": 1.228786232521806e-05, + "loss": 0.1595, + "num_input_tokens_seen": 54754232, + "step": 94375 + }, + { + "epoch": 14.057193923145666, + "grad_norm": 3.462407039478421e-05, + "learning_rate": 1.2285064451491157e-05, + "loss": 0.0, + "num_input_tokens_seen": 54757240, + "step": 94380 + }, + { + "epoch": 14.057938635686625, + "grad_norm": 0.001394309801980853, + "learning_rate": 1.2282266792570158e-05, + "loss": 0.0138, + "num_input_tokens_seen": 54760120, + "step": 94385 + }, + { + "epoch": 14.058683348227584, + "grad_norm": 0.0003499985032249242, + "learning_rate": 1.2279469348502345e-05, + "loss": 0.0, + "num_input_tokens_seen": 54763032, + "step": 94390 + }, + { + "epoch": 14.059428060768543, + "grad_norm": 0.00015357829397544265, + "learning_rate": 1.227667211933497e-05, + "loss": 0.0, + "num_input_tokens_seen": 54765944, + "step": 94395 + }, + { + "epoch": 14.060172773309503, + "grad_norm": 0.00048452167538926005, + "learning_rate": 1.2273875105115275e-05, + "loss": 0.0, + "num_input_tokens_seen": 54768760, + "step": 94400 + }, + { + "epoch": 14.060917485850462, + "grad_norm": 0.0010137141216546297, + "learning_rate": 1.227107830589054e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54771608, + "step": 94405 + }, + { + "epoch": 14.06166219839142, + "grad_norm": 0.00038583422428928316, + "learning_rate": 1.2268281721707989e-05, + "loss": 0.1408, + "num_input_tokens_seen": 54774392, + "step": 94410 + }, + { + "epoch": 14.06240691093238, + "grad_norm": 0.01049975398927927, + "learning_rate": 1.2265485352614887e-05, + "loss": 0.0, + "num_input_tokens_seen": 54777208, + "step": 94415 + }, + { + "epoch": 14.06315162347334, + "grad_norm": 0.0009127813391387463, + "learning_rate": 1.226268919865846e-05, + "loss": 0.0, + "num_input_tokens_seen": 54779992, + "step": 94420 + }, + { + "epoch": 14.063896336014299, + "grad_norm": 0.007427705451846123, + "learning_rate": 1.225989325988596e-05, + "loss": 0.0011, + "num_input_tokens_seen": 54782872, + "step": 94425 + }, + { + "epoch": 14.064641048555258, + "grad_norm": 0.00018033644300885499, + "learning_rate": 1.2257097536344613e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54785656, + "step": 94430 + }, + { + "epoch": 14.065385761096216, + "grad_norm": 0.010631237179040909, + "learning_rate": 1.2254302028081657e-05, + "loss": 0.0918, + "num_input_tokens_seen": 54788472, + "step": 94435 + }, + { + "epoch": 14.066130473637177, + "grad_norm": 0.02959953062236309, + "learning_rate": 1.225150673514431e-05, + "loss": 0.0005, + "num_input_tokens_seen": 54791480, + "step": 94440 + }, + { + "epoch": 14.066875186178136, + "grad_norm": 0.000809684453997761, + "learning_rate": 1.2248711657579792e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54794392, + "step": 94445 + }, + { + "epoch": 14.067619898719094, + "grad_norm": 0.004021541215479374, + "learning_rate": 1.2245916795435342e-05, + "loss": 0.0005, + "num_input_tokens_seen": 54797304, + "step": 94450 + }, + { + "epoch": 14.068364611260053, + "grad_norm": 0.0009717812645249069, + "learning_rate": 1.2243122148758152e-05, + "loss": 0.0, + "num_input_tokens_seen": 54800152, + "step": 94455 + }, + { + "epoch": 14.069109323801014, + "grad_norm": 0.0009350801119580865, + "learning_rate": 1.224032771759546e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54802904, + "step": 94460 + }, + { + "epoch": 14.069854036341972, + "grad_norm": 5.028934478759766, + "learning_rate": 1.2237533501994452e-05, + "loss": 0.0081, + "num_input_tokens_seen": 54805784, + "step": 94465 + }, + { + "epoch": 14.070598748882931, + "grad_norm": 2.0969484467059374e-05, + "learning_rate": 1.2234739502002353e-05, + "loss": 0.0, + "num_input_tokens_seen": 54808920, + "step": 94470 + }, + { + "epoch": 14.07134346142389, + "grad_norm": 0.0032953780610114336, + "learning_rate": 1.2231945717666358e-05, + "loss": 0.0, + "num_input_tokens_seen": 54812152, + "step": 94475 + }, + { + "epoch": 14.07208817396485, + "grad_norm": 0.0005067794118076563, + "learning_rate": 1.2229152149033655e-05, + "loss": 0.0733, + "num_input_tokens_seen": 54814968, + "step": 94480 + }, + { + "epoch": 14.07283288650581, + "grad_norm": 0.0007462730864062905, + "learning_rate": 1.2226358796151452e-05, + "loss": 0.0, + "num_input_tokens_seen": 54817784, + "step": 94485 + }, + { + "epoch": 14.073577599046768, + "grad_norm": 0.11687115579843521, + "learning_rate": 1.2223565659066938e-05, + "loss": 0.0537, + "num_input_tokens_seen": 54820664, + "step": 94490 + }, + { + "epoch": 14.074322311587727, + "grad_norm": 0.005668212193995714, + "learning_rate": 1.2220772737827285e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54823448, + "step": 94495 + }, + { + "epoch": 14.075067024128685, + "grad_norm": 0.001678796368651092, + "learning_rate": 1.2217980032479701e-05, + "loss": 0.0, + "num_input_tokens_seen": 54826104, + "step": 94500 + }, + { + "epoch": 14.075811736669646, + "grad_norm": 2.311645948793739e-05, + "learning_rate": 1.221518754307135e-05, + "loss": 0.0, + "num_input_tokens_seen": 54829016, + "step": 94505 + }, + { + "epoch": 14.076556449210605, + "grad_norm": 0.01740029826760292, + "learning_rate": 1.2212395269649413e-05, + "loss": 0.0, + "num_input_tokens_seen": 54831832, + "step": 94510 + }, + { + "epoch": 14.077301161751564, + "grad_norm": 0.0038509531877934933, + "learning_rate": 1.220960321226105e-05, + "loss": 0.0003, + "num_input_tokens_seen": 54834968, + "step": 94515 + }, + { + "epoch": 14.078045874292522, + "grad_norm": 0.00824626162648201, + "learning_rate": 1.2206811370953453e-05, + "loss": 0.0879, + "num_input_tokens_seen": 54838520, + "step": 94520 + }, + { + "epoch": 14.078790586833483, + "grad_norm": 0.0003439126303419471, + "learning_rate": 1.2204019745773764e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54841368, + "step": 94525 + }, + { + "epoch": 14.079535299374442, + "grad_norm": 0.00012229268031660467, + "learning_rate": 1.2201228336769169e-05, + "loss": 0.0, + "num_input_tokens_seen": 54844120, + "step": 94530 + }, + { + "epoch": 14.0802800119154, + "grad_norm": 0.0003039908187929541, + "learning_rate": 1.2198437143986798e-05, + "loss": 0.019, + "num_input_tokens_seen": 54847000, + "step": 94535 + }, + { + "epoch": 14.081024724456359, + "grad_norm": 0.0014990185154601932, + "learning_rate": 1.2195646167473835e-05, + "loss": 0.0007, + "num_input_tokens_seen": 54849592, + "step": 94540 + }, + { + "epoch": 14.08176943699732, + "grad_norm": 0.007382526528090239, + "learning_rate": 1.2192855407277407e-05, + "loss": 0.0, + "num_input_tokens_seen": 54852248, + "step": 94545 + }, + { + "epoch": 14.082514149538278, + "grad_norm": 0.011006399989128113, + "learning_rate": 1.2190064863444675e-05, + "loss": 0.0, + "num_input_tokens_seen": 54854936, + "step": 94550 + }, + { + "epoch": 14.083258862079237, + "grad_norm": 0.1156897246837616, + "learning_rate": 1.2187274536022783e-05, + "loss": 0.0003, + "num_input_tokens_seen": 54857848, + "step": 94555 + }, + { + "epoch": 14.084003574620196, + "grad_norm": 0.0010929583804681897, + "learning_rate": 1.2184484425058863e-05, + "loss": 0.0224, + "num_input_tokens_seen": 54860920, + "step": 94560 + }, + { + "epoch": 14.084748287161156, + "grad_norm": 0.000151494808960706, + "learning_rate": 1.2181694530600052e-05, + "loss": 0.0, + "num_input_tokens_seen": 54863864, + "step": 94565 + }, + { + "epoch": 14.085492999702115, + "grad_norm": 0.0012735113268718123, + "learning_rate": 1.2178904852693476e-05, + "loss": 0.0042, + "num_input_tokens_seen": 54866712, + "step": 94570 + }, + { + "epoch": 14.086237712243074, + "grad_norm": 0.05242176353931427, + "learning_rate": 1.217611539138628e-05, + "loss": 0.0, + "num_input_tokens_seen": 54869560, + "step": 94575 + }, + { + "epoch": 14.086982424784033, + "grad_norm": 0.018435755744576454, + "learning_rate": 1.2173326146725575e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54872440, + "step": 94580 + }, + { + "epoch": 14.087727137324993, + "grad_norm": 0.00042511356878094375, + "learning_rate": 1.2170537118758496e-05, + "loss": 0.0, + "num_input_tokens_seen": 54875352, + "step": 94585 + }, + { + "epoch": 14.088471849865952, + "grad_norm": 0.00020274177950341254, + "learning_rate": 1.216774830753215e-05, + "loss": 0.0, + "num_input_tokens_seen": 54878456, + "step": 94590 + }, + { + "epoch": 14.08921656240691, + "grad_norm": 0.00770250940695405, + "learning_rate": 1.2164959713093649e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54881464, + "step": 94595 + }, + { + "epoch": 14.08996127494787, + "grad_norm": 0.0016891879495233297, + "learning_rate": 1.2162171335490115e-05, + "loss": 0.1252, + "num_input_tokens_seen": 54884248, + "step": 94600 + }, + { + "epoch": 14.09070598748883, + "grad_norm": 0.006455375347286463, + "learning_rate": 1.2159383174768641e-05, + "loss": 0.0, + "num_input_tokens_seen": 54887352, + "step": 94605 + }, + { + "epoch": 14.091450700029789, + "grad_norm": 0.0015772398328408599, + "learning_rate": 1.2156595230976348e-05, + "loss": 0.0, + "num_input_tokens_seen": 54890488, + "step": 94610 + }, + { + "epoch": 14.092195412570748, + "grad_norm": 0.0006227613775990903, + "learning_rate": 1.2153807504160313e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54893208, + "step": 94615 + }, + { + "epoch": 14.092940125111706, + "grad_norm": 0.006009286269545555, + "learning_rate": 1.2151019994367655e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54896376, + "step": 94620 + }, + { + "epoch": 14.093684837652667, + "grad_norm": 0.0011383647797629237, + "learning_rate": 1.2148232701645453e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54899416, + "step": 94625 + }, + { + "epoch": 14.094429550193626, + "grad_norm": 0.002851797267794609, + "learning_rate": 1.2145445626040801e-05, + "loss": 0.147, + "num_input_tokens_seen": 54902488, + "step": 94630 + }, + { + "epoch": 14.095174262734584, + "grad_norm": 0.04236926510930061, + "learning_rate": 1.2142658767600779e-05, + "loss": 0.0023, + "num_input_tokens_seen": 54905464, + "step": 94635 + }, + { + "epoch": 14.095918975275543, + "grad_norm": 0.002032322809100151, + "learning_rate": 1.213987212637246e-05, + "loss": 0.0014, + "num_input_tokens_seen": 54908088, + "step": 94640 + }, + { + "epoch": 14.096663687816504, + "grad_norm": 0.001773362630046904, + "learning_rate": 1.2137085702402939e-05, + "loss": 0.0, + "num_input_tokens_seen": 54911512, + "step": 94645 + }, + { + "epoch": 14.097408400357462, + "grad_norm": 0.00500947842374444, + "learning_rate": 1.2134299495739274e-05, + "loss": 0.0, + "num_input_tokens_seen": 54914328, + "step": 94650 + }, + { + "epoch": 14.098153112898421, + "grad_norm": 7.655943772988394e-05, + "learning_rate": 1.2131513506428552e-05, + "loss": 0.0, + "num_input_tokens_seen": 54917112, + "step": 94655 + }, + { + "epoch": 14.09889782543938, + "grad_norm": 0.00498817628249526, + "learning_rate": 1.2128727734517819e-05, + "loss": 0.0, + "num_input_tokens_seen": 54919992, + "step": 94660 + }, + { + "epoch": 14.099642537980339, + "grad_norm": 0.010075557976961136, + "learning_rate": 1.2125942180054161e-05, + "loss": 0.0, + "num_input_tokens_seen": 54922840, + "step": 94665 + }, + { + "epoch": 14.1003872505213, + "grad_norm": 0.0002544453600421548, + "learning_rate": 1.2123156843084624e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54925880, + "step": 94670 + }, + { + "epoch": 14.101131963062258, + "grad_norm": 0.0005954526131972671, + "learning_rate": 1.2120371723656257e-05, + "loss": 0.0, + "num_input_tokens_seen": 54928728, + "step": 94675 + }, + { + "epoch": 14.101876675603217, + "grad_norm": 0.001114431768655777, + "learning_rate": 1.2117586821816127e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54931704, + "step": 94680 + }, + { + "epoch": 14.102621388144176, + "grad_norm": 0.012984232977032661, + "learning_rate": 1.2114802137611266e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54934584, + "step": 94685 + }, + { + "epoch": 14.103366100685136, + "grad_norm": 0.0005417711799964309, + "learning_rate": 1.2112017671088737e-05, + "loss": 0.0, + "num_input_tokens_seen": 54937368, + "step": 94690 + }, + { + "epoch": 14.104110813226095, + "grad_norm": 5.930214881896973, + "learning_rate": 1.2109233422295568e-05, + "loss": 0.1173, + "num_input_tokens_seen": 54940056, + "step": 94695 + }, + { + "epoch": 14.104855525767054, + "grad_norm": 0.0032032716553658247, + "learning_rate": 1.2106449391278802e-05, + "loss": 0.0064, + "num_input_tokens_seen": 54943032, + "step": 94700 + }, + { + "epoch": 14.105600238308012, + "grad_norm": 0.0009444049210287631, + "learning_rate": 1.2103665578085458e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54946040, + "step": 94705 + }, + { + "epoch": 14.106344950848973, + "grad_norm": 1.239157973031979e-05, + "learning_rate": 1.2100881982762589e-05, + "loss": 0.0001, + "num_input_tokens_seen": 54948728, + "step": 94710 + }, + { + "epoch": 14.107089663389932, + "grad_norm": 0.00044347939547151327, + "learning_rate": 1.2098098605357205e-05, + "loss": 0.0, + "num_input_tokens_seen": 54951672, + "step": 94715 + }, + { + "epoch": 14.10783437593089, + "grad_norm": 23.592954635620117, + "learning_rate": 1.2095315445916323e-05, + "loss": 0.002, + "num_input_tokens_seen": 54954520, + "step": 94720 + }, + { + "epoch": 14.10857908847185, + "grad_norm": 0.0012166522210463881, + "learning_rate": 1.2092532504486981e-05, + "loss": 0.0, + "num_input_tokens_seen": 54957592, + "step": 94725 + }, + { + "epoch": 14.10932380101281, + "grad_norm": 0.010110379196703434, + "learning_rate": 1.2089749781116175e-05, + "loss": 0.0003, + "num_input_tokens_seen": 54960696, + "step": 94730 + }, + { + "epoch": 14.110068513553768, + "grad_norm": 0.0001432000135537237, + "learning_rate": 1.2086967275850936e-05, + "loss": 0.0, + "num_input_tokens_seen": 54963288, + "step": 94735 + }, + { + "epoch": 14.110813226094727, + "grad_norm": 0.005591929890215397, + "learning_rate": 1.2084184988738247e-05, + "loss": 0.0011, + "num_input_tokens_seen": 54966232, + "step": 94740 + }, + { + "epoch": 14.111557938635686, + "grad_norm": 0.00056054926244542, + "learning_rate": 1.2081402919825139e-05, + "loss": 0.0426, + "num_input_tokens_seen": 54969400, + "step": 94745 + }, + { + "epoch": 14.112302651176647, + "grad_norm": 0.0005637677968479693, + "learning_rate": 1.2078621069158596e-05, + "loss": 0.005, + "num_input_tokens_seen": 54972536, + "step": 94750 + }, + { + "epoch": 14.113047363717605, + "grad_norm": 0.0014907604781910777, + "learning_rate": 1.2075839436785611e-05, + "loss": 0.0, + "num_input_tokens_seen": 54975448, + "step": 94755 + }, + { + "epoch": 14.113792076258564, + "grad_norm": 0.00013692310312762856, + "learning_rate": 1.2073058022753189e-05, + "loss": 0.001, + "num_input_tokens_seen": 54978424, + "step": 94760 + }, + { + "epoch": 14.114536788799523, + "grad_norm": 0.006185346283018589, + "learning_rate": 1.2070276827108315e-05, + "loss": 0.0, + "num_input_tokens_seen": 54981272, + "step": 94765 + }, + { + "epoch": 14.115281501340483, + "grad_norm": 0.0006420259596779943, + "learning_rate": 1.2067495849897972e-05, + "loss": 0.0, + "num_input_tokens_seen": 54984216, + "step": 94770 + }, + { + "epoch": 14.116026213881442, + "grad_norm": 9.883459824777674e-06, + "learning_rate": 1.2064715091169135e-05, + "loss": 0.0089, + "num_input_tokens_seen": 54987320, + "step": 94775 + }, + { + "epoch": 14.1167709264224, + "grad_norm": 0.00011785375681938604, + "learning_rate": 1.2061934550968798e-05, + "loss": 0.0002, + "num_input_tokens_seen": 54990104, + "step": 94780 + }, + { + "epoch": 14.11751563896336, + "grad_norm": 0.00585566833615303, + "learning_rate": 1.2059154229343919e-05, + "loss": 0.0, + "num_input_tokens_seen": 54992792, + "step": 94785 + }, + { + "epoch": 14.11826035150432, + "grad_norm": 0.0005434556514956057, + "learning_rate": 1.2056374126341485e-05, + "loss": 0.0, + "num_input_tokens_seen": 54995800, + "step": 94790 + }, + { + "epoch": 14.119005064045279, + "grad_norm": 0.0005009131273254752, + "learning_rate": 1.2053594242008453e-05, + "loss": 0.0, + "num_input_tokens_seen": 54998872, + "step": 94795 + }, + { + "epoch": 14.119749776586238, + "grad_norm": 0.0030185459181666374, + "learning_rate": 1.205081457639178e-05, + "loss": 0.0, + "num_input_tokens_seen": 55001848, + "step": 94800 + }, + { + "epoch": 14.120494489127196, + "grad_norm": 4.758149225381203e-05, + "learning_rate": 1.2048035129538446e-05, + "loss": 0.0, + "num_input_tokens_seen": 55004504, + "step": 94805 + }, + { + "epoch": 14.121239201668157, + "grad_norm": 0.00046471558744087815, + "learning_rate": 1.2045255901495384e-05, + "loss": 0.0133, + "num_input_tokens_seen": 55007640, + "step": 94810 + }, + { + "epoch": 14.121983914209116, + "grad_norm": 0.00014300597831606865, + "learning_rate": 1.2042476892309565e-05, + "loss": 0.0, + "num_input_tokens_seen": 55010264, + "step": 94815 + }, + { + "epoch": 14.122728626750074, + "grad_norm": 0.0004948177956975996, + "learning_rate": 1.203969810202793e-05, + "loss": 0.0, + "num_input_tokens_seen": 55013144, + "step": 94820 + }, + { + "epoch": 14.123473339291033, + "grad_norm": 0.0004207390593364835, + "learning_rate": 1.2036919530697412e-05, + "loss": 0.0, + "num_input_tokens_seen": 55016152, + "step": 94825 + }, + { + "epoch": 14.124218051831992, + "grad_norm": 0.0002915702061727643, + "learning_rate": 1.2034141178364974e-05, + "loss": 0.0, + "num_input_tokens_seen": 55019064, + "step": 94830 + }, + { + "epoch": 14.124962764372953, + "grad_norm": 2.2641796022071503e-05, + "learning_rate": 1.2031363045077545e-05, + "loss": 0.0049, + "num_input_tokens_seen": 55021848, + "step": 94835 + }, + { + "epoch": 14.125707476913911, + "grad_norm": 0.00016214183415286243, + "learning_rate": 1.2028585130882056e-05, + "loss": 0.0, + "num_input_tokens_seen": 55024728, + "step": 94840 + }, + { + "epoch": 14.12645218945487, + "grad_norm": 1.0764534636109602e-05, + "learning_rate": 1.2025807435825426e-05, + "loss": 0.0, + "num_input_tokens_seen": 55027672, + "step": 94845 + }, + { + "epoch": 14.127196901995829, + "grad_norm": 0.0002905297151301056, + "learning_rate": 1.2023029959954603e-05, + "loss": 0.0, + "num_input_tokens_seen": 55030456, + "step": 94850 + }, + { + "epoch": 14.12794161453679, + "grad_norm": 7.510869181714952e-06, + "learning_rate": 1.2020252703316492e-05, + "loss": 0.0, + "num_input_tokens_seen": 55033528, + "step": 94855 + }, + { + "epoch": 14.128686327077748, + "grad_norm": 6.704562838422135e-05, + "learning_rate": 1.2017475665958028e-05, + "loss": 0.0, + "num_input_tokens_seen": 55036280, + "step": 94860 + }, + { + "epoch": 14.129431039618707, + "grad_norm": 7.632032065885141e-05, + "learning_rate": 1.201469884792611e-05, + "loss": 0.1376, + "num_input_tokens_seen": 55039384, + "step": 94865 + }, + { + "epoch": 14.130175752159666, + "grad_norm": 7.921523501863703e-05, + "learning_rate": 1.2011922249267662e-05, + "loss": 0.0, + "num_input_tokens_seen": 55042296, + "step": 94870 + }, + { + "epoch": 14.130920464700626, + "grad_norm": 0.011814836412668228, + "learning_rate": 1.2009145870029592e-05, + "loss": 0.0, + "num_input_tokens_seen": 55045144, + "step": 94875 + }, + { + "epoch": 14.131665177241585, + "grad_norm": 0.0031516491435468197, + "learning_rate": 1.200636971025879e-05, + "loss": 0.0, + "num_input_tokens_seen": 55047832, + "step": 94880 + }, + { + "epoch": 14.132409889782544, + "grad_norm": 0.00025545660173520446, + "learning_rate": 1.2003593770002169e-05, + "loss": 0.0, + "num_input_tokens_seen": 55050680, + "step": 94885 + }, + { + "epoch": 14.133154602323502, + "grad_norm": 4.806437937077135e-05, + "learning_rate": 1.2000818049306628e-05, + "loss": 0.0, + "num_input_tokens_seen": 55053656, + "step": 94890 + }, + { + "epoch": 14.133899314864463, + "grad_norm": 4.573581918521086e-06, + "learning_rate": 1.1998042548219052e-05, + "loss": 0.0, + "num_input_tokens_seen": 55056536, + "step": 94895 + }, + { + "epoch": 14.134644027405422, + "grad_norm": 0.000176854882738553, + "learning_rate": 1.1995267266786325e-05, + "loss": 0.0, + "num_input_tokens_seen": 55059256, + "step": 94900 + }, + { + "epoch": 14.13538873994638, + "grad_norm": 0.03917309641838074, + "learning_rate": 1.1992492205055347e-05, + "loss": 0.0, + "num_input_tokens_seen": 55062488, + "step": 94905 + }, + { + "epoch": 14.13613345248734, + "grad_norm": 0.00012293354666326195, + "learning_rate": 1.1989717363072986e-05, + "loss": 0.0, + "num_input_tokens_seen": 55065304, + "step": 94910 + }, + { + "epoch": 14.1368781650283, + "grad_norm": 0.0001506677217548713, + "learning_rate": 1.1986942740886135e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55068408, + "step": 94915 + }, + { + "epoch": 14.137622877569259, + "grad_norm": 4.38818424299825e-05, + "learning_rate": 1.198416833854166e-05, + "loss": 0.0, + "num_input_tokens_seen": 55071416, + "step": 94920 + }, + { + "epoch": 14.138367590110217, + "grad_norm": 0.02297975867986679, + "learning_rate": 1.1981394156086423e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55074264, + "step": 94925 + }, + { + "epoch": 14.139112302651176, + "grad_norm": 0.0013401646865531802, + "learning_rate": 1.197862019356731e-05, + "loss": 0.0, + "num_input_tokens_seen": 55077048, + "step": 94930 + }, + { + "epoch": 14.139857015192137, + "grad_norm": 8.256187438964844, + "learning_rate": 1.1975846451031167e-05, + "loss": 0.0012, + "num_input_tokens_seen": 55080088, + "step": 94935 + }, + { + "epoch": 14.140601727733095, + "grad_norm": 0.0003126701631117612, + "learning_rate": 1.1973072928524868e-05, + "loss": 0.0003, + "num_input_tokens_seen": 55083032, + "step": 94940 + }, + { + "epoch": 14.141346440274054, + "grad_norm": 0.004442583303898573, + "learning_rate": 1.1970299626095252e-05, + "loss": 0.0, + "num_input_tokens_seen": 55085944, + "step": 94945 + }, + { + "epoch": 14.142091152815013, + "grad_norm": 0.0071732476353645325, + "learning_rate": 1.1967526543789192e-05, + "loss": 0.0, + "num_input_tokens_seen": 55089016, + "step": 94950 + }, + { + "epoch": 14.142835865355973, + "grad_norm": 0.0013518333435058594, + "learning_rate": 1.1964753681653526e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55091672, + "step": 94955 + }, + { + "epoch": 14.143580577896932, + "grad_norm": 0.0014763528015464544, + "learning_rate": 1.1961981039735096e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55094488, + "step": 94960 + }, + { + "epoch": 14.14432529043789, + "grad_norm": 0.000470965460408479, + "learning_rate": 1.1959208618080747e-05, + "loss": 0.0, + "num_input_tokens_seen": 55097368, + "step": 94965 + }, + { + "epoch": 14.14507000297885, + "grad_norm": 0.00022397332941181958, + "learning_rate": 1.1956436416737304e-05, + "loss": 0.0089, + "num_input_tokens_seen": 55100184, + "step": 94970 + }, + { + "epoch": 14.14581471551981, + "grad_norm": 0.0037528022658079863, + "learning_rate": 1.1953664435751621e-05, + "loss": 0.0, + "num_input_tokens_seen": 55103096, + "step": 94975 + }, + { + "epoch": 14.146559428060769, + "grad_norm": 0.0006340639665722847, + "learning_rate": 1.1950892675170509e-05, + "loss": 0.0011, + "num_input_tokens_seen": 55105784, + "step": 94980 + }, + { + "epoch": 14.147304140601728, + "grad_norm": 1.003697525447933e-05, + "learning_rate": 1.194812113504081e-05, + "loss": 0.0, + "num_input_tokens_seen": 55108440, + "step": 94985 + }, + { + "epoch": 14.148048853142686, + "grad_norm": 0.0005757492035627365, + "learning_rate": 1.194534981540933e-05, + "loss": 0.0, + "num_input_tokens_seen": 55111096, + "step": 94990 + }, + { + "epoch": 14.148793565683647, + "grad_norm": 8.105205779429525e-05, + "learning_rate": 1.1942578716322905e-05, + "loss": 0.0, + "num_input_tokens_seen": 55114200, + "step": 94995 + }, + { + "epoch": 14.149538278224606, + "grad_norm": 3.373820072738454e-05, + "learning_rate": 1.1939807837828345e-05, + "loss": 0.0382, + "num_input_tokens_seen": 55117080, + "step": 95000 + }, + { + "epoch": 14.150282990765565, + "grad_norm": 6.451985245803371e-05, + "learning_rate": 1.1937037179972447e-05, + "loss": 0.001, + "num_input_tokens_seen": 55119544, + "step": 95005 + }, + { + "epoch": 14.151027703306523, + "grad_norm": 0.019476016983389854, + "learning_rate": 1.1934266742802039e-05, + "loss": 0.0, + "num_input_tokens_seen": 55122552, + "step": 95010 + }, + { + "epoch": 14.151772415847482, + "grad_norm": 0.00696785282343626, + "learning_rate": 1.1931496526363903e-05, + "loss": 0.0, + "num_input_tokens_seen": 55125560, + "step": 95015 + }, + { + "epoch": 14.152517128388443, + "grad_norm": 0.0015179577749222517, + "learning_rate": 1.1928726530704862e-05, + "loss": 0.0, + "num_input_tokens_seen": 55129080, + "step": 95020 + }, + { + "epoch": 14.153261840929401, + "grad_norm": 0.00028543174266815186, + "learning_rate": 1.1925956755871703e-05, + "loss": 0.0, + "num_input_tokens_seen": 55131832, + "step": 95025 + }, + { + "epoch": 14.15400655347036, + "grad_norm": 0.00031750902417115867, + "learning_rate": 1.1923187201911215e-05, + "loss": 0.0, + "num_input_tokens_seen": 55134552, + "step": 95030 + }, + { + "epoch": 14.154751266011319, + "grad_norm": 0.0010508825071156025, + "learning_rate": 1.1920417868870187e-05, + "loss": 0.0, + "num_input_tokens_seen": 55137656, + "step": 95035 + }, + { + "epoch": 14.15549597855228, + "grad_norm": 0.0014713130658492446, + "learning_rate": 1.1917648756795399e-05, + "loss": 0.0, + "num_input_tokens_seen": 55140344, + "step": 95040 + }, + { + "epoch": 14.156240691093238, + "grad_norm": 7.829114474589005e-05, + "learning_rate": 1.1914879865733647e-05, + "loss": 0.0, + "num_input_tokens_seen": 55143448, + "step": 95045 + }, + { + "epoch": 14.156985403634197, + "grad_norm": 0.07039415091276169, + "learning_rate": 1.1912111195731693e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55146072, + "step": 95050 + }, + { + "epoch": 14.157730116175156, + "grad_norm": 0.09448053687810898, + "learning_rate": 1.1909342746836325e-05, + "loss": 0.0, + "num_input_tokens_seen": 55148760, + "step": 95055 + }, + { + "epoch": 14.158474828716116, + "grad_norm": 0.0014937700470909476, + "learning_rate": 1.1906574519094299e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55151896, + "step": 95060 + }, + { + "epoch": 14.159219541257075, + "grad_norm": 0.0005040362011641264, + "learning_rate": 1.1903806512552395e-05, + "loss": 0.0, + "num_input_tokens_seen": 55154680, + "step": 95065 + }, + { + "epoch": 14.159964253798034, + "grad_norm": 3.0055098250159062e-05, + "learning_rate": 1.1901038727257366e-05, + "loss": 0.0264, + "num_input_tokens_seen": 55157848, + "step": 95070 + }, + { + "epoch": 14.160708966338992, + "grad_norm": 0.0006676979828625917, + "learning_rate": 1.189827116325598e-05, + "loss": 0.0764, + "num_input_tokens_seen": 55160952, + "step": 95075 + }, + { + "epoch": 14.161453678879953, + "grad_norm": 0.0023418497294187546, + "learning_rate": 1.1895503820594985e-05, + "loss": 0.0, + "num_input_tokens_seen": 55163864, + "step": 95080 + }, + { + "epoch": 14.162198391420912, + "grad_norm": 0.011862454004585743, + "learning_rate": 1.189273669932113e-05, + "loss": 0.003, + "num_input_tokens_seen": 55166872, + "step": 95085 + }, + { + "epoch": 14.16294310396187, + "grad_norm": 0.594710111618042, + "learning_rate": 1.1889969799481173e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55169912, + "step": 95090 + }, + { + "epoch": 14.16368781650283, + "grad_norm": 3.0978128052083775e-05, + "learning_rate": 1.1887203121121851e-05, + "loss": 0.0, + "num_input_tokens_seen": 55173176, + "step": 95095 + }, + { + "epoch": 14.16443252904379, + "grad_norm": 0.0037947536911815405, + "learning_rate": 1.1884436664289908e-05, + "loss": 0.0, + "num_input_tokens_seen": 55175896, + "step": 95100 + }, + { + "epoch": 14.165177241584749, + "grad_norm": 0.10040753334760666, + "learning_rate": 1.1881670429032066e-05, + "loss": 0.0, + "num_input_tokens_seen": 55178968, + "step": 95105 + }, + { + "epoch": 14.165921954125707, + "grad_norm": 0.0008298223838210106, + "learning_rate": 1.1878904415395078e-05, + "loss": 0.0, + "num_input_tokens_seen": 55181912, + "step": 95110 + }, + { + "epoch": 14.166666666666666, + "grad_norm": 0.00022415058629121631, + "learning_rate": 1.1876138623425667e-05, + "loss": 0.2438, + "num_input_tokens_seen": 55184696, + "step": 95115 + }, + { + "epoch": 14.167411379207627, + "grad_norm": 7.692470535403118e-05, + "learning_rate": 1.1873373053170545e-05, + "loss": 0.0, + "num_input_tokens_seen": 55187512, + "step": 95120 + }, + { + "epoch": 14.168156091748585, + "grad_norm": 5.167526069271844e-06, + "learning_rate": 1.187060770467645e-05, + "loss": 0.0, + "num_input_tokens_seen": 55190360, + "step": 95125 + }, + { + "epoch": 14.168900804289544, + "grad_norm": 0.0020777422469109297, + "learning_rate": 1.1867842577990087e-05, + "loss": 0.0, + "num_input_tokens_seen": 55193272, + "step": 95130 + }, + { + "epoch": 14.169645516830503, + "grad_norm": 0.0014700888423249125, + "learning_rate": 1.1865077673158188e-05, + "loss": 0.0, + "num_input_tokens_seen": 55196024, + "step": 95135 + }, + { + "epoch": 14.170390229371463, + "grad_norm": 0.00018077778804581612, + "learning_rate": 1.186231299022744e-05, + "loss": 0.0, + "num_input_tokens_seen": 55199160, + "step": 95140 + }, + { + "epoch": 14.171134941912422, + "grad_norm": 1.2787964344024658, + "learning_rate": 1.1859548529244571e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55203032, + "step": 95145 + }, + { + "epoch": 14.171879654453381, + "grad_norm": 8.373611490242183e-05, + "learning_rate": 1.1856784290256276e-05, + "loss": 0.0, + "num_input_tokens_seen": 55206008, + "step": 95150 + }, + { + "epoch": 14.17262436699434, + "grad_norm": 0.0003023573663085699, + "learning_rate": 1.1854020273309241e-05, + "loss": 0.0, + "num_input_tokens_seen": 55208920, + "step": 95155 + }, + { + "epoch": 14.1733690795353, + "grad_norm": 0.0010761702433228493, + "learning_rate": 1.1851256478450181e-05, + "loss": 0.0, + "num_input_tokens_seen": 55212056, + "step": 95160 + }, + { + "epoch": 14.174113792076259, + "grad_norm": 7.925274258013815e-05, + "learning_rate": 1.1848492905725781e-05, + "loss": 0.0, + "num_input_tokens_seen": 55214936, + "step": 95165 + }, + { + "epoch": 14.174858504617218, + "grad_norm": 0.0001385725918225944, + "learning_rate": 1.1845729555182728e-05, + "loss": 0.0, + "num_input_tokens_seen": 55217912, + "step": 95170 + }, + { + "epoch": 14.175603217158177, + "grad_norm": 0.0002273871941724792, + "learning_rate": 1.1842966426867694e-05, + "loss": 0.0, + "num_input_tokens_seen": 55220920, + "step": 95175 + }, + { + "epoch": 14.176347929699135, + "grad_norm": 0.018733952194452286, + "learning_rate": 1.1840203520827378e-05, + "loss": 0.0, + "num_input_tokens_seen": 55223608, + "step": 95180 + }, + { + "epoch": 14.177092642240096, + "grad_norm": 0.0025357771664857864, + "learning_rate": 1.183744083710844e-05, + "loss": 0.0, + "num_input_tokens_seen": 55226456, + "step": 95185 + }, + { + "epoch": 14.177837354781055, + "grad_norm": 5.451482502394356e-05, + "learning_rate": 1.1834678375757571e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55229528, + "step": 95190 + }, + { + "epoch": 14.178582067322013, + "grad_norm": 3.036061389138922e-05, + "learning_rate": 1.183191613682143e-05, + "loss": 0.0, + "num_input_tokens_seen": 55232600, + "step": 95195 + }, + { + "epoch": 14.179326779862972, + "grad_norm": 5.57934727112297e-05, + "learning_rate": 1.1829154120346673e-05, + "loss": 0.0, + "num_input_tokens_seen": 55235672, + "step": 95200 + }, + { + "epoch": 14.180071492403933, + "grad_norm": 0.0020102686248719692, + "learning_rate": 1.1826392326379981e-05, + "loss": 0.0, + "num_input_tokens_seen": 55238552, + "step": 95205 + }, + { + "epoch": 14.180816204944891, + "grad_norm": 0.0029587203171104193, + "learning_rate": 1.1823630754967991e-05, + "loss": 0.0, + "num_input_tokens_seen": 55241496, + "step": 95210 + }, + { + "epoch": 14.18156091748585, + "grad_norm": 0.00032429234124720097, + "learning_rate": 1.1820869406157378e-05, + "loss": 0.0, + "num_input_tokens_seen": 55244344, + "step": 95215 + }, + { + "epoch": 14.182305630026809, + "grad_norm": 8.846780838211998e-05, + "learning_rate": 1.181810827999478e-05, + "loss": 0.0, + "num_input_tokens_seen": 55247448, + "step": 95220 + }, + { + "epoch": 14.18305034256777, + "grad_norm": 6.889841461088508e-05, + "learning_rate": 1.1815347376526847e-05, + "loss": 0.0, + "num_input_tokens_seen": 55250200, + "step": 95225 + }, + { + "epoch": 14.183795055108728, + "grad_norm": 0.0005178017308935523, + "learning_rate": 1.181258669580021e-05, + "loss": 0.0, + "num_input_tokens_seen": 55253048, + "step": 95230 + }, + { + "epoch": 14.184539767649687, + "grad_norm": 0.038611847907304764, + "learning_rate": 1.1809826237861527e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55256216, + "step": 95235 + }, + { + "epoch": 14.185284480190646, + "grad_norm": 2.0318411770858802e-05, + "learning_rate": 1.1807066002757422e-05, + "loss": 0.0, + "num_input_tokens_seen": 55259000, + "step": 95240 + }, + { + "epoch": 14.186029192731606, + "grad_norm": 0.0001376289437757805, + "learning_rate": 1.180430599053452e-05, + "loss": 0.0, + "num_input_tokens_seen": 55261592, + "step": 95245 + }, + { + "epoch": 14.186773905272565, + "grad_norm": 0.03979703038930893, + "learning_rate": 1.1801546201239466e-05, + "loss": 0.0244, + "num_input_tokens_seen": 55264312, + "step": 95250 + }, + { + "epoch": 14.187518617813524, + "grad_norm": 3.2879011996556073e-05, + "learning_rate": 1.1798786634918868e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55267128, + "step": 95255 + }, + { + "epoch": 14.188263330354483, + "grad_norm": 2.6963318305206485e-05, + "learning_rate": 1.1796027291619358e-05, + "loss": 0.0, + "num_input_tokens_seen": 55269880, + "step": 95260 + }, + { + "epoch": 14.189008042895443, + "grad_norm": 0.00016341077571269125, + "learning_rate": 1.1793268171387539e-05, + "loss": 0.0, + "num_input_tokens_seen": 55272696, + "step": 95265 + }, + { + "epoch": 14.189752755436402, + "grad_norm": 8.644081390229985e-05, + "learning_rate": 1.1790509274270042e-05, + "loss": 0.0, + "num_input_tokens_seen": 55275544, + "step": 95270 + }, + { + "epoch": 14.19049746797736, + "grad_norm": 3.88928601751104e-05, + "learning_rate": 1.1787750600313465e-05, + "loss": 0.0, + "num_input_tokens_seen": 55278552, + "step": 95275 + }, + { + "epoch": 14.19124218051832, + "grad_norm": 0.0020416919142007828, + "learning_rate": 1.1784992149564403e-05, + "loss": 0.027, + "num_input_tokens_seen": 55281464, + "step": 95280 + }, + { + "epoch": 14.19198689305928, + "grad_norm": 0.0021005054004490376, + "learning_rate": 1.1782233922069478e-05, + "loss": 0.0, + "num_input_tokens_seen": 55284408, + "step": 95285 + }, + { + "epoch": 14.192731605600239, + "grad_norm": 2.8096830646973103e-05, + "learning_rate": 1.1779475917875278e-05, + "loss": 0.0, + "num_input_tokens_seen": 55287064, + "step": 95290 + }, + { + "epoch": 14.193476318141197, + "grad_norm": 4.4869670091429725e-05, + "learning_rate": 1.1776718137028392e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55289752, + "step": 95295 + }, + { + "epoch": 14.194221030682156, + "grad_norm": 0.0004284800961613655, + "learning_rate": 1.1773960579575408e-05, + "loss": 0.0, + "num_input_tokens_seen": 55292504, + "step": 95300 + }, + { + "epoch": 14.194965743223117, + "grad_norm": 0.00018245080718770623, + "learning_rate": 1.1771203245562924e-05, + "loss": 0.0, + "num_input_tokens_seen": 55295448, + "step": 95305 + }, + { + "epoch": 14.195710455764075, + "grad_norm": 0.0009809350594878197, + "learning_rate": 1.176844613503751e-05, + "loss": 0.0, + "num_input_tokens_seen": 55298520, + "step": 95310 + }, + { + "epoch": 14.196455168305034, + "grad_norm": 0.00012972368858754635, + "learning_rate": 1.1765689248045755e-05, + "loss": 0.0, + "num_input_tokens_seen": 55301304, + "step": 95315 + }, + { + "epoch": 14.197199880845993, + "grad_norm": 0.00024888766347430646, + "learning_rate": 1.1762932584634234e-05, + "loss": 0.0, + "num_input_tokens_seen": 55304056, + "step": 95320 + }, + { + "epoch": 14.197944593386953, + "grad_norm": 0.0003613273147493601, + "learning_rate": 1.1760176144849502e-05, + "loss": 0.0, + "num_input_tokens_seen": 55306808, + "step": 95325 + }, + { + "epoch": 14.198689305927912, + "grad_norm": 0.00016851149848662317, + "learning_rate": 1.1757419928738147e-05, + "loss": 0.0003, + "num_input_tokens_seen": 55309624, + "step": 95330 + }, + { + "epoch": 14.199434018468871, + "grad_norm": 0.00027904281159862876, + "learning_rate": 1.1754663936346713e-05, + "loss": 0.0, + "num_input_tokens_seen": 55312504, + "step": 95335 + }, + { + "epoch": 14.20017873100983, + "grad_norm": 0.00028137813205830753, + "learning_rate": 1.1751908167721782e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55315384, + "step": 95340 + }, + { + "epoch": 14.200923443550789, + "grad_norm": 0.00018872715008910745, + "learning_rate": 1.1749152622909884e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55318296, + "step": 95345 + }, + { + "epoch": 14.201668156091749, + "grad_norm": 0.0032535698264837265, + "learning_rate": 1.1746397301957598e-05, + "loss": 0.0, + "num_input_tokens_seen": 55320920, + "step": 95350 + }, + { + "epoch": 14.202412868632708, + "grad_norm": 1.4908519005985e-05, + "learning_rate": 1.174364220491146e-05, + "loss": 0.0, + "num_input_tokens_seen": 55324152, + "step": 95355 + }, + { + "epoch": 14.203157581173667, + "grad_norm": 0.00017246833886019886, + "learning_rate": 1.1740887331818009e-05, + "loss": 0.0, + "num_input_tokens_seen": 55327096, + "step": 95360 + }, + { + "epoch": 14.203902293714625, + "grad_norm": 0.0005278746830299497, + "learning_rate": 1.1738132682723797e-05, + "loss": 0.0, + "num_input_tokens_seen": 55330424, + "step": 95365 + }, + { + "epoch": 14.204647006255586, + "grad_norm": 1.2353039892332163e-05, + "learning_rate": 1.1735378257675338e-05, + "loss": 0.2906, + "num_input_tokens_seen": 55333464, + "step": 95370 + }, + { + "epoch": 14.205391718796545, + "grad_norm": 0.00013981098891235888, + "learning_rate": 1.1732624056719197e-05, + "loss": 0.0, + "num_input_tokens_seen": 55336184, + "step": 95375 + }, + { + "epoch": 14.206136431337503, + "grad_norm": 4.317973071010783e-05, + "learning_rate": 1.1729870079901875e-05, + "loss": 0.0, + "num_input_tokens_seen": 55339160, + "step": 95380 + }, + { + "epoch": 14.206881143878462, + "grad_norm": 20.51653289794922, + "learning_rate": 1.1727116327269924e-05, + "loss": 0.1564, + "num_input_tokens_seen": 55342008, + "step": 95385 + }, + { + "epoch": 14.207625856419423, + "grad_norm": 0.0003231251030229032, + "learning_rate": 1.172436279886984e-05, + "loss": 0.0, + "num_input_tokens_seen": 55344792, + "step": 95390 + }, + { + "epoch": 14.208370568960381, + "grad_norm": 0.01262758020311594, + "learning_rate": 1.1721609494748164e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55347800, + "step": 95395 + }, + { + "epoch": 14.20911528150134, + "grad_norm": 0.0005525814485736191, + "learning_rate": 1.1718856414951402e-05, + "loss": 0.0, + "num_input_tokens_seen": 55350712, + "step": 95400 + }, + { + "epoch": 14.209859994042299, + "grad_norm": 0.00014761617057956755, + "learning_rate": 1.1716103559526051e-05, + "loss": 0.027, + "num_input_tokens_seen": 55353368, + "step": 95405 + }, + { + "epoch": 14.21060470658326, + "grad_norm": 7.807743531884626e-05, + "learning_rate": 1.1713350928518639e-05, + "loss": 0.0, + "num_input_tokens_seen": 55355992, + "step": 95410 + }, + { + "epoch": 14.211349419124218, + "grad_norm": 0.00022620086383540183, + "learning_rate": 1.171059852197565e-05, + "loss": 0.0, + "num_input_tokens_seen": 55358744, + "step": 95415 + }, + { + "epoch": 14.212094131665177, + "grad_norm": 0.08368048071861267, + "learning_rate": 1.1707846339943601e-05, + "loss": 0.0, + "num_input_tokens_seen": 55361944, + "step": 95420 + }, + { + "epoch": 14.212838844206136, + "grad_norm": 72.07325744628906, + "learning_rate": 1.1705094382468979e-05, + "loss": 0.1005, + "num_input_tokens_seen": 55364536, + "step": 95425 + }, + { + "epoch": 14.213583556747096, + "grad_norm": 0.0006557218730449677, + "learning_rate": 1.1702342649598274e-05, + "loss": 0.0, + "num_input_tokens_seen": 55367320, + "step": 95430 + }, + { + "epoch": 14.214328269288055, + "grad_norm": 73.50096893310547, + "learning_rate": 1.1699591141377967e-05, + "loss": 0.0057, + "num_input_tokens_seen": 55370456, + "step": 95435 + }, + { + "epoch": 14.215072981829014, + "grad_norm": 0.00018856274255085737, + "learning_rate": 1.1696839857854558e-05, + "loss": 0.1159, + "num_input_tokens_seen": 55373240, + "step": 95440 + }, + { + "epoch": 14.215817694369973, + "grad_norm": 0.0005921709234826267, + "learning_rate": 1.169408879907452e-05, + "loss": 0.0, + "num_input_tokens_seen": 55376120, + "step": 95445 + }, + { + "epoch": 14.216562406910933, + "grad_norm": 0.0002221129834651947, + "learning_rate": 1.1691337965084321e-05, + "loss": 0.0, + "num_input_tokens_seen": 55379352, + "step": 95450 + }, + { + "epoch": 14.217307119451892, + "grad_norm": 0.00035286572529003024, + "learning_rate": 1.1688587355930444e-05, + "loss": 0.0, + "num_input_tokens_seen": 55382264, + "step": 95455 + }, + { + "epoch": 14.21805183199285, + "grad_norm": 0.08437441289424896, + "learning_rate": 1.168583697165935e-05, + "loss": 0.0, + "num_input_tokens_seen": 55385176, + "step": 95460 + }, + { + "epoch": 14.21879654453381, + "grad_norm": 2.3440061340807006e-05, + "learning_rate": 1.1683086812317517e-05, + "loss": 0.0878, + "num_input_tokens_seen": 55388152, + "step": 95465 + }, + { + "epoch": 14.21954125707477, + "grad_norm": 0.00018544351041782647, + "learning_rate": 1.1680336877951387e-05, + "loss": 0.0707, + "num_input_tokens_seen": 55391064, + "step": 95470 + }, + { + "epoch": 14.220285969615729, + "grad_norm": 6.658403435721993e-05, + "learning_rate": 1.1677587168607437e-05, + "loss": 0.0, + "num_input_tokens_seen": 55393592, + "step": 95475 + }, + { + "epoch": 14.221030682156687, + "grad_norm": 0.010401260107755661, + "learning_rate": 1.1674837684332113e-05, + "loss": 0.0141, + "num_input_tokens_seen": 55396472, + "step": 95480 + }, + { + "epoch": 14.221775394697646, + "grad_norm": 1.522857928648591e-05, + "learning_rate": 1.1672088425171854e-05, + "loss": 0.0, + "num_input_tokens_seen": 55399288, + "step": 95485 + }, + { + "epoch": 14.222520107238607, + "grad_norm": 0.0002192427491536364, + "learning_rate": 1.1669339391173122e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55402360, + "step": 95490 + }, + { + "epoch": 14.223264819779565, + "grad_norm": 0.38361644744873047, + "learning_rate": 1.1666590582382355e-05, + "loss": 0.0012, + "num_input_tokens_seen": 55405560, + "step": 95495 + }, + { + "epoch": 14.224009532320524, + "grad_norm": 0.0009353235363960266, + "learning_rate": 1.166384199884599e-05, + "loss": 0.0, + "num_input_tokens_seen": 55408664, + "step": 95500 + }, + { + "epoch": 14.224754244861483, + "grad_norm": 0.0005301455385051668, + "learning_rate": 1.1661093640610445e-05, + "loss": 0.0885, + "num_input_tokens_seen": 55411384, + "step": 95505 + }, + { + "epoch": 14.225498957402444, + "grad_norm": 0.011256048455834389, + "learning_rate": 1.1658345507722182e-05, + "loss": 0.0, + "num_input_tokens_seen": 55414424, + "step": 95510 + }, + { + "epoch": 14.226243669943402, + "grad_norm": 0.0018108583753928542, + "learning_rate": 1.1655597600227597e-05, + "loss": 0.0, + "num_input_tokens_seen": 55417240, + "step": 95515 + }, + { + "epoch": 14.226988382484361, + "grad_norm": 0.000443410623120144, + "learning_rate": 1.1652849918173139e-05, + "loss": 0.0, + "num_input_tokens_seen": 55420120, + "step": 95520 + }, + { + "epoch": 14.22773309502532, + "grad_norm": 0.0005157412961125374, + "learning_rate": 1.165010246160522e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55423288, + "step": 95525 + }, + { + "epoch": 14.228477807566279, + "grad_norm": 0.021365460008382797, + "learning_rate": 1.1647355230570237e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55426072, + "step": 95530 + }, + { + "epoch": 14.229222520107239, + "grad_norm": 0.00012658454943448305, + "learning_rate": 1.1644608225114629e-05, + "loss": 0.0, + "num_input_tokens_seen": 55428920, + "step": 95535 + }, + { + "epoch": 14.229967232648198, + "grad_norm": 1.1872835784743074e-05, + "learning_rate": 1.164186144528478e-05, + "loss": 0.0195, + "num_input_tokens_seen": 55431960, + "step": 95540 + }, + { + "epoch": 14.230711945189157, + "grad_norm": 0.0003699319495353848, + "learning_rate": 1.1639114891127114e-05, + "loss": 0.0, + "num_input_tokens_seen": 55435000, + "step": 95545 + }, + { + "epoch": 14.231456657730115, + "grad_norm": 0.00498080113902688, + "learning_rate": 1.1636368562688024e-05, + "loss": 0.0, + "num_input_tokens_seen": 55437848, + "step": 95550 + }, + { + "epoch": 14.232201370271076, + "grad_norm": 8.65151309967041, + "learning_rate": 1.1633622460013904e-05, + "loss": 0.0735, + "num_input_tokens_seen": 55441080, + "step": 95555 + }, + { + "epoch": 14.232946082812035, + "grad_norm": 3.265556370024569e-05, + "learning_rate": 1.163087658315114e-05, + "loss": 0.001, + "num_input_tokens_seen": 55444184, + "step": 95560 + }, + { + "epoch": 14.233690795352993, + "grad_norm": 6.988986569922417e-05, + "learning_rate": 1.1628130932146137e-05, + "loss": 0.0, + "num_input_tokens_seen": 55447192, + "step": 95565 + }, + { + "epoch": 14.234435507893952, + "grad_norm": 0.0015633349539712071, + "learning_rate": 1.1625385507045272e-05, + "loss": 0.0, + "num_input_tokens_seen": 55450168, + "step": 95570 + }, + { + "epoch": 14.235180220434913, + "grad_norm": 0.0017982709687203169, + "learning_rate": 1.1622640307894913e-05, + "loss": 0.0, + "num_input_tokens_seen": 55453144, + "step": 95575 + }, + { + "epoch": 14.235924932975871, + "grad_norm": 0.005933789536356926, + "learning_rate": 1.1619895334741463e-05, + "loss": 0.0, + "num_input_tokens_seen": 55456184, + "step": 95580 + }, + { + "epoch": 14.23666964551683, + "grad_norm": 0.0008025276474654675, + "learning_rate": 1.161715058763127e-05, + "loss": 0.0, + "num_input_tokens_seen": 55458904, + "step": 95585 + }, + { + "epoch": 14.237414358057789, + "grad_norm": 0.0008580884896218777, + "learning_rate": 1.1614406066610728e-05, + "loss": 0.004, + "num_input_tokens_seen": 55461976, + "step": 95590 + }, + { + "epoch": 14.23815907059875, + "grad_norm": 1.5186312339210417e-05, + "learning_rate": 1.1611661771726181e-05, + "loss": 0.0, + "num_input_tokens_seen": 55464984, + "step": 95595 + }, + { + "epoch": 14.238903783139708, + "grad_norm": 0.05301426723599434, + "learning_rate": 1.1608917703024009e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55467896, + "step": 95600 + }, + { + "epoch": 14.239648495680667, + "grad_norm": 0.0003303525154478848, + "learning_rate": 1.1606173860550562e-05, + "loss": 0.0023, + "num_input_tokens_seen": 55470680, + "step": 95605 + }, + { + "epoch": 14.240393208221626, + "grad_norm": 0.004105263855308294, + "learning_rate": 1.1603430244352187e-05, + "loss": 0.0188, + "num_input_tokens_seen": 55473592, + "step": 95610 + }, + { + "epoch": 14.241137920762586, + "grad_norm": 0.002611634088680148, + "learning_rate": 1.160068685447525e-05, + "loss": 0.4815, + "num_input_tokens_seen": 55476408, + "step": 95615 + }, + { + "epoch": 14.241882633303545, + "grad_norm": 0.00013325150939635932, + "learning_rate": 1.1597943690966092e-05, + "loss": 0.0, + "num_input_tokens_seen": 55479128, + "step": 95620 + }, + { + "epoch": 14.242627345844504, + "grad_norm": 0.0019257249077782035, + "learning_rate": 1.1595200753871055e-05, + "loss": 0.0, + "num_input_tokens_seen": 55482168, + "step": 95625 + }, + { + "epoch": 14.243372058385463, + "grad_norm": 0.0006104461499489844, + "learning_rate": 1.1592458043236468e-05, + "loss": 0.0, + "num_input_tokens_seen": 55485016, + "step": 95630 + }, + { + "epoch": 14.244116770926423, + "grad_norm": 0.00014670411474071443, + "learning_rate": 1.1589715559108682e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55487896, + "step": 95635 + }, + { + "epoch": 14.244861483467382, + "grad_norm": 0.00021450438362080604, + "learning_rate": 1.1586973301534024e-05, + "loss": 0.0005, + "num_input_tokens_seen": 55490584, + "step": 95640 + }, + { + "epoch": 14.24560619600834, + "grad_norm": 0.037418752908706665, + "learning_rate": 1.158423127055881e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55493592, + "step": 95645 + }, + { + "epoch": 14.2463509085493, + "grad_norm": 0.26559239625930786, + "learning_rate": 1.1581489466229381e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55496376, + "step": 95650 + }, + { + "epoch": 14.24709562109026, + "grad_norm": 0.000565165828447789, + "learning_rate": 1.1578747888592043e-05, + "loss": 0.0037, + "num_input_tokens_seen": 55499576, + "step": 95655 + }, + { + "epoch": 14.247840333631219, + "grad_norm": 1.4854024648666382, + "learning_rate": 1.1576006537693127e-05, + "loss": 0.0023, + "num_input_tokens_seen": 55502264, + "step": 95660 + }, + { + "epoch": 14.248585046172177, + "grad_norm": 10.465691566467285, + "learning_rate": 1.1573265413578926e-05, + "loss": 0.1132, + "num_input_tokens_seen": 55505016, + "step": 95665 + }, + { + "epoch": 14.249329758713136, + "grad_norm": 0.0004562257381621748, + "learning_rate": 1.1570524516295773e-05, + "loss": 0.0, + "num_input_tokens_seen": 55507736, + "step": 95670 + }, + { + "epoch": 14.250074471254097, + "grad_norm": 0.00018111000827047974, + "learning_rate": 1.1567783845889946e-05, + "loss": 0.0, + "num_input_tokens_seen": 55510488, + "step": 95675 + }, + { + "epoch": 14.250819183795056, + "grad_norm": 0.07084474712610245, + "learning_rate": 1.1565043402407768e-05, + "loss": 0.0003, + "num_input_tokens_seen": 55513400, + "step": 95680 + }, + { + "epoch": 14.251563896336014, + "grad_norm": 0.0019561653025448322, + "learning_rate": 1.1562303185895528e-05, + "loss": 0.0, + "num_input_tokens_seen": 55516568, + "step": 95685 + }, + { + "epoch": 14.252308608876973, + "grad_norm": 0.0005611935630440712, + "learning_rate": 1.155956319639952e-05, + "loss": 0.0, + "num_input_tokens_seen": 55519512, + "step": 95690 + }, + { + "epoch": 14.253053321417934, + "grad_norm": 0.009587288834154606, + "learning_rate": 1.155682343396603e-05, + "loss": 0.0, + "num_input_tokens_seen": 55522232, + "step": 95695 + }, + { + "epoch": 14.253798033958892, + "grad_norm": 0.009470186196267605, + "learning_rate": 1.1554083898641335e-05, + "loss": 0.0, + "num_input_tokens_seen": 55525368, + "step": 95700 + }, + { + "epoch": 14.254542746499851, + "grad_norm": 0.0006220112554728985, + "learning_rate": 1.1551344590471739e-05, + "loss": 0.0, + "num_input_tokens_seen": 55528184, + "step": 95705 + }, + { + "epoch": 14.25528745904081, + "grad_norm": 0.0736134722828865, + "learning_rate": 1.1548605509503496e-05, + "loss": 0.0, + "num_input_tokens_seen": 55531128, + "step": 95710 + }, + { + "epoch": 14.256032171581769, + "grad_norm": 9.024945029523224e-05, + "learning_rate": 1.15458666557829e-05, + "loss": 0.0, + "num_input_tokens_seen": 55533816, + "step": 95715 + }, + { + "epoch": 14.25677688412273, + "grad_norm": 0.03024846687912941, + "learning_rate": 1.1543128029356215e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55536888, + "step": 95720 + }, + { + "epoch": 14.257521596663688, + "grad_norm": 0.24233144521713257, + "learning_rate": 1.1540389630269693e-05, + "loss": 0.0029, + "num_input_tokens_seen": 55539832, + "step": 95725 + }, + { + "epoch": 14.258266309204647, + "grad_norm": 0.0035835120361298323, + "learning_rate": 1.153765145856962e-05, + "loss": 0.0, + "num_input_tokens_seen": 55542872, + "step": 95730 + }, + { + "epoch": 14.259011021745605, + "grad_norm": 0.00014017561625223607, + "learning_rate": 1.1534913514302232e-05, + "loss": 0.0, + "num_input_tokens_seen": 55545720, + "step": 95735 + }, + { + "epoch": 14.259755734286566, + "grad_norm": 0.000338612066116184, + "learning_rate": 1.1532175797513806e-05, + "loss": 0.2656, + "num_input_tokens_seen": 55548664, + "step": 95740 + }, + { + "epoch": 14.260500446827525, + "grad_norm": 0.05456757918000221, + "learning_rate": 1.152943830825057e-05, + "loss": 0.0, + "num_input_tokens_seen": 55551608, + "step": 95745 + }, + { + "epoch": 14.261245159368483, + "grad_norm": 4.43748795078136e-05, + "learning_rate": 1.1526701046558794e-05, + "loss": 0.0, + "num_input_tokens_seen": 55554168, + "step": 95750 + }, + { + "epoch": 14.261989871909442, + "grad_norm": 0.0019429989624768496, + "learning_rate": 1.1523964012484712e-05, + "loss": 0.0, + "num_input_tokens_seen": 55557272, + "step": 95755 + }, + { + "epoch": 14.262734584450403, + "grad_norm": 0.0017443104879930615, + "learning_rate": 1.1521227206074559e-05, + "loss": 0.1356, + "num_input_tokens_seen": 55560344, + "step": 95760 + }, + { + "epoch": 14.263479296991362, + "grad_norm": 0.0009917186107486486, + "learning_rate": 1.1518490627374572e-05, + "loss": 0.0016, + "num_input_tokens_seen": 55563224, + "step": 95765 + }, + { + "epoch": 14.26422400953232, + "grad_norm": 0.005352272652089596, + "learning_rate": 1.151575427643098e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55566232, + "step": 95770 + }, + { + "epoch": 14.264968722073279, + "grad_norm": 0.02433205582201481, + "learning_rate": 1.1513018153290018e-05, + "loss": 0.0, + "num_input_tokens_seen": 55569016, + "step": 95775 + }, + { + "epoch": 14.26571343461424, + "grad_norm": 0.0010578951332718134, + "learning_rate": 1.15102822579979e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55571832, + "step": 95780 + }, + { + "epoch": 14.266458147155198, + "grad_norm": 0.0026797037571668625, + "learning_rate": 1.1507546590600862e-05, + "loss": 0.0027, + "num_input_tokens_seen": 55574552, + "step": 95785 + }, + { + "epoch": 14.267202859696157, + "grad_norm": 4.763038635253906, + "learning_rate": 1.15048111511451e-05, + "loss": 0.0346, + "num_input_tokens_seen": 55577624, + "step": 95790 + }, + { + "epoch": 14.267947572237116, + "grad_norm": 0.000927519693505019, + "learning_rate": 1.1502075939676852e-05, + "loss": 0.0, + "num_input_tokens_seen": 55580536, + "step": 95795 + }, + { + "epoch": 14.268692284778076, + "grad_norm": 0.0015280373627319932, + "learning_rate": 1.1499340956242307e-05, + "loss": 0.0, + "num_input_tokens_seen": 55583096, + "step": 95800 + }, + { + "epoch": 14.269436997319035, + "grad_norm": 0.0004875824088230729, + "learning_rate": 1.1496606200887669e-05, + "loss": 0.0, + "num_input_tokens_seen": 55586040, + "step": 95805 + }, + { + "epoch": 14.270181709859994, + "grad_norm": 0.00041633652290329337, + "learning_rate": 1.1493871673659155e-05, + "loss": 0.0, + "num_input_tokens_seen": 55588728, + "step": 95810 + }, + { + "epoch": 14.270926422400953, + "grad_norm": 0.0006895428523421288, + "learning_rate": 1.1491137374602939e-05, + "loss": 0.0, + "num_input_tokens_seen": 55591544, + "step": 95815 + }, + { + "epoch": 14.271671134941913, + "grad_norm": 0.07126939296722412, + "learning_rate": 1.1488403303765239e-05, + "loss": 0.0, + "num_input_tokens_seen": 55594776, + "step": 95820 + }, + { + "epoch": 14.272415847482872, + "grad_norm": 6.639181810896844e-05, + "learning_rate": 1.1485669461192233e-05, + "loss": 0.0, + "num_input_tokens_seen": 55597944, + "step": 95825 + }, + { + "epoch": 14.27316056002383, + "grad_norm": 0.0016393245896324515, + "learning_rate": 1.1482935846930104e-05, + "loss": 0.0, + "num_input_tokens_seen": 55600568, + "step": 95830 + }, + { + "epoch": 14.27390527256479, + "grad_norm": 206.90029907226562, + "learning_rate": 1.148020246102503e-05, + "loss": 0.0207, + "num_input_tokens_seen": 55603448, + "step": 95835 + }, + { + "epoch": 14.27464998510575, + "grad_norm": 0.0007990718586370349, + "learning_rate": 1.14774693035232e-05, + "loss": 0.0, + "num_input_tokens_seen": 55606488, + "step": 95840 + }, + { + "epoch": 14.275394697646709, + "grad_norm": 4.449845790863037, + "learning_rate": 1.1474736374470785e-05, + "loss": 0.0653, + "num_input_tokens_seen": 55609400, + "step": 95845 + }, + { + "epoch": 14.276139410187668, + "grad_norm": 0.0014845089754089713, + "learning_rate": 1.1472003673913942e-05, + "loss": 0.0, + "num_input_tokens_seen": 55612440, + "step": 95850 + }, + { + "epoch": 14.276884122728626, + "grad_norm": 0.0010753301903605461, + "learning_rate": 1.1469271201898857e-05, + "loss": 0.1969, + "num_input_tokens_seen": 55615352, + "step": 95855 + }, + { + "epoch": 14.277628835269585, + "grad_norm": 0.003376435022801161, + "learning_rate": 1.1466538958471673e-05, + "loss": 0.1938, + "num_input_tokens_seen": 55618488, + "step": 95860 + }, + { + "epoch": 14.278373547810546, + "grad_norm": 0.0005128848133608699, + "learning_rate": 1.1463806943678571e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55621432, + "step": 95865 + }, + { + "epoch": 14.279118260351504, + "grad_norm": 11.11430835723877, + "learning_rate": 1.1461075157565681e-05, + "loss": 0.0437, + "num_input_tokens_seen": 55623960, + "step": 95870 + }, + { + "epoch": 14.279862972892463, + "grad_norm": 3.2210171222686768, + "learning_rate": 1.1458343600179175e-05, + "loss": 0.0111, + "num_input_tokens_seen": 55626712, + "step": 95875 + }, + { + "epoch": 14.280607685433422, + "grad_norm": 0.03441581502556801, + "learning_rate": 1.1455612271565192e-05, + "loss": 0.0, + "num_input_tokens_seen": 55629784, + "step": 95880 + }, + { + "epoch": 14.281352397974382, + "grad_norm": 0.0015482996823266149, + "learning_rate": 1.1452881171769872e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55632856, + "step": 95885 + }, + { + "epoch": 14.282097110515341, + "grad_norm": 0.0007583288243040442, + "learning_rate": 1.145015030083935e-05, + "loss": 0.0145, + "num_input_tokens_seen": 55636184, + "step": 95890 + }, + { + "epoch": 14.2828418230563, + "grad_norm": 0.00436432333663106, + "learning_rate": 1.1447419658819775e-05, + "loss": 0.0, + "num_input_tokens_seen": 55638936, + "step": 95895 + }, + { + "epoch": 14.283586535597259, + "grad_norm": 0.0001289439242100343, + "learning_rate": 1.1444689245757268e-05, + "loss": 0.0, + "num_input_tokens_seen": 55641752, + "step": 95900 + }, + { + "epoch": 14.28433124813822, + "grad_norm": 0.00026053344481624663, + "learning_rate": 1.1441959061697952e-05, + "loss": 0.2875, + "num_input_tokens_seen": 55644472, + "step": 95905 + }, + { + "epoch": 14.285075960679178, + "grad_norm": 0.0029990707989782095, + "learning_rate": 1.1439229106687969e-05, + "loss": 0.016, + "num_input_tokens_seen": 55647384, + "step": 95910 + }, + { + "epoch": 14.285820673220137, + "grad_norm": 0.0005073858192190528, + "learning_rate": 1.1436499380773416e-05, + "loss": 0.0, + "num_input_tokens_seen": 55650200, + "step": 95915 + }, + { + "epoch": 14.286565385761095, + "grad_norm": 0.024778762832283974, + "learning_rate": 1.1433769884000429e-05, + "loss": 0.0, + "num_input_tokens_seen": 55653368, + "step": 95920 + }, + { + "epoch": 14.287310098302056, + "grad_norm": 0.00012030984362354502, + "learning_rate": 1.1431040616415114e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55656152, + "step": 95925 + }, + { + "epoch": 14.288054810843015, + "grad_norm": 6.931792449904606e-05, + "learning_rate": 1.1428311578063566e-05, + "loss": 0.0008, + "num_input_tokens_seen": 55659032, + "step": 95930 + }, + { + "epoch": 14.288799523383974, + "grad_norm": 0.0014897397486492991, + "learning_rate": 1.142558276899191e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55661848, + "step": 95935 + }, + { + "epoch": 14.289544235924932, + "grad_norm": 0.00030955165857449174, + "learning_rate": 1.142285418924623e-05, + "loss": 0.0, + "num_input_tokens_seen": 55664568, + "step": 95940 + }, + { + "epoch": 14.290288948465893, + "grad_norm": 0.008715123869478703, + "learning_rate": 1.1420125838872633e-05, + "loss": 0.0099, + "num_input_tokens_seen": 55667352, + "step": 95945 + }, + { + "epoch": 14.291033661006852, + "grad_norm": 0.005235776770859957, + "learning_rate": 1.1417397717917213e-05, + "loss": 0.0, + "num_input_tokens_seen": 55670328, + "step": 95950 + }, + { + "epoch": 14.29177837354781, + "grad_norm": 0.001353503088466823, + "learning_rate": 1.1414669826426053e-05, + "loss": 0.0, + "num_input_tokens_seen": 55673624, + "step": 95955 + }, + { + "epoch": 14.292523086088769, + "grad_norm": 0.0010270824423059821, + "learning_rate": 1.1411942164445228e-05, + "loss": 0.0, + "num_input_tokens_seen": 55676408, + "step": 95960 + }, + { + "epoch": 14.29326779862973, + "grad_norm": 0.0003795077500399202, + "learning_rate": 1.140921473202084e-05, + "loss": 0.0, + "num_input_tokens_seen": 55679224, + "step": 95965 + }, + { + "epoch": 14.294012511170688, + "grad_norm": 7.3260345458984375, + "learning_rate": 1.1406487529198956e-05, + "loss": 0.0007, + "num_input_tokens_seen": 55682360, + "step": 95970 + }, + { + "epoch": 14.294757223711647, + "grad_norm": 0.0005599628202617168, + "learning_rate": 1.1403760556025638e-05, + "loss": 0.0018, + "num_input_tokens_seen": 55684952, + "step": 95975 + }, + { + "epoch": 14.295501936252606, + "grad_norm": 0.002020573941990733, + "learning_rate": 1.140103381254698e-05, + "loss": 0.0, + "num_input_tokens_seen": 55687800, + "step": 95980 + }, + { + "epoch": 14.296246648793566, + "grad_norm": 46.57481384277344, + "learning_rate": 1.1398307298809022e-05, + "loss": 0.1626, + "num_input_tokens_seen": 55690584, + "step": 95985 + }, + { + "epoch": 14.296991361334525, + "grad_norm": 0.002164000179618597, + "learning_rate": 1.1395581014857848e-05, + "loss": 0.0, + "num_input_tokens_seen": 55693880, + "step": 95990 + }, + { + "epoch": 14.297736073875484, + "grad_norm": 4.0045633795671165e-05, + "learning_rate": 1.1392854960739497e-05, + "loss": 0.0, + "num_input_tokens_seen": 55696792, + "step": 95995 + }, + { + "epoch": 14.298480786416443, + "grad_norm": 0.0007412019767798483, + "learning_rate": 1.1390129136500041e-05, + "loss": 0.0, + "num_input_tokens_seen": 55699416, + "step": 96000 + }, + { + "epoch": 14.299225498957403, + "grad_norm": 0.006688308902084827, + "learning_rate": 1.138740354218552e-05, + "loss": 0.0, + "num_input_tokens_seen": 55702840, + "step": 96005 + }, + { + "epoch": 14.299970211498362, + "grad_norm": 0.0002934114309027791, + "learning_rate": 1.1384678177841973e-05, + "loss": 0.0, + "num_input_tokens_seen": 55705656, + "step": 96010 + }, + { + "epoch": 14.30071492403932, + "grad_norm": 0.0011552737560123205, + "learning_rate": 1.1381953043515459e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55708248, + "step": 96015 + }, + { + "epoch": 14.30145963658028, + "grad_norm": 0.0010372824035584927, + "learning_rate": 1.1379228139252007e-05, + "loss": 0.0, + "num_input_tokens_seen": 55710968, + "step": 96020 + }, + { + "epoch": 14.30220434912124, + "grad_norm": 0.0005465740105137229, + "learning_rate": 1.1376503465097651e-05, + "loss": 0.0, + "num_input_tokens_seen": 55713944, + "step": 96025 + }, + { + "epoch": 14.302949061662199, + "grad_norm": 6.650965690612793, + "learning_rate": 1.1373779021098415e-05, + "loss": 0.2699, + "num_input_tokens_seen": 55716728, + "step": 96030 + }, + { + "epoch": 14.303693774203158, + "grad_norm": 0.00010753275273600593, + "learning_rate": 1.1371054807300344e-05, + "loss": 0.1099, + "num_input_tokens_seen": 55719416, + "step": 96035 + }, + { + "epoch": 14.304438486744116, + "grad_norm": 0.00010150900925509632, + "learning_rate": 1.1368330823749441e-05, + "loss": 0.0, + "num_input_tokens_seen": 55722328, + "step": 96040 + }, + { + "epoch": 14.305183199285075, + "grad_norm": 0.00034981584758497775, + "learning_rate": 1.1365607070491741e-05, + "loss": 0.0004, + "num_input_tokens_seen": 55725304, + "step": 96045 + }, + { + "epoch": 14.305927911826036, + "grad_norm": 0.00041885089012794197, + "learning_rate": 1.1362883547573252e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55728088, + "step": 96050 + }, + { + "epoch": 14.306672624366994, + "grad_norm": 18.299354553222656, + "learning_rate": 1.1360160255039976e-05, + "loss": 0.0619, + "num_input_tokens_seen": 55731064, + "step": 96055 + }, + { + "epoch": 14.307417336907953, + "grad_norm": 0.023910081014037132, + "learning_rate": 1.1357437192937943e-05, + "loss": 0.0002, + "num_input_tokens_seen": 55733464, + "step": 96060 + }, + { + "epoch": 14.308162049448912, + "grad_norm": 0.0007296619587577879, + "learning_rate": 1.1354714361313128e-05, + "loss": 0.0, + "num_input_tokens_seen": 55736536, + "step": 96065 + }, + { + "epoch": 14.308906761989872, + "grad_norm": 0.01725458912551403, + "learning_rate": 1.1351991760211558e-05, + "loss": 0.0, + "num_input_tokens_seen": 55739480, + "step": 96070 + }, + { + "epoch": 14.309651474530831, + "grad_norm": 0.001068954123184085, + "learning_rate": 1.1349269389679203e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55742520, + "step": 96075 + }, + { + "epoch": 14.31039618707179, + "grad_norm": 0.0008881168323569, + "learning_rate": 1.1346547249762082e-05, + "loss": 0.0, + "num_input_tokens_seen": 55745272, + "step": 96080 + }, + { + "epoch": 14.311140899612749, + "grad_norm": 0.0004295527469366789, + "learning_rate": 1.1343825340506167e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55748408, + "step": 96085 + }, + { + "epoch": 14.31188561215371, + "grad_norm": 0.0034482385963201523, + "learning_rate": 1.1341103661957441e-05, + "loss": 0.0, + "num_input_tokens_seen": 55751448, + "step": 96090 + }, + { + "epoch": 14.312630324694668, + "grad_norm": 0.000480220012832433, + "learning_rate": 1.1338382214161888e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55754136, + "step": 96095 + }, + { + "epoch": 14.313375037235627, + "grad_norm": 0.0001223089057020843, + "learning_rate": 1.1335660997165473e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55757048, + "step": 96100 + }, + { + "epoch": 14.314119749776586, + "grad_norm": 0.003825259627774358, + "learning_rate": 1.133294001101419e-05, + "loss": 0.0645, + "num_input_tokens_seen": 55760088, + "step": 96105 + }, + { + "epoch": 14.314864462317546, + "grad_norm": 0.0005196130368858576, + "learning_rate": 1.1330219255753983e-05, + "loss": 0.0, + "num_input_tokens_seen": 55762936, + "step": 96110 + }, + { + "epoch": 14.315609174858505, + "grad_norm": 5.7373261370230466e-05, + "learning_rate": 1.1327498731430835e-05, + "loss": 0.0, + "num_input_tokens_seen": 55765976, + "step": 96115 + }, + { + "epoch": 14.316353887399464, + "grad_norm": 0.5685502290725708, + "learning_rate": 1.1324778438090694e-05, + "loss": 0.001, + "num_input_tokens_seen": 55768888, + "step": 96120 + }, + { + "epoch": 14.317098599940422, + "grad_norm": 44.57240676879883, + "learning_rate": 1.132205837577953e-05, + "loss": 0.0265, + "num_input_tokens_seen": 55771672, + "step": 96125 + }, + { + "epoch": 14.317843312481383, + "grad_norm": 0.018949037417769432, + "learning_rate": 1.131933854454329e-05, + "loss": 0.0042, + "num_input_tokens_seen": 55774424, + "step": 96130 + }, + { + "epoch": 14.318588025022342, + "grad_norm": 0.0029219137504696846, + "learning_rate": 1.131661894442791e-05, + "loss": 0.0011, + "num_input_tokens_seen": 55777240, + "step": 96135 + }, + { + "epoch": 14.3193327375633, + "grad_norm": 0.0035705710761249065, + "learning_rate": 1.1313899575479355e-05, + "loss": 0.0, + "num_input_tokens_seen": 55779832, + "step": 96140 + }, + { + "epoch": 14.32007745010426, + "grad_norm": 0.00019407496438361704, + "learning_rate": 1.1311180437743549e-05, + "loss": 0.0, + "num_input_tokens_seen": 55783064, + "step": 96145 + }, + { + "epoch": 14.32082216264522, + "grad_norm": 0.05277414992451668, + "learning_rate": 1.1308461531266442e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55785848, + "step": 96150 + }, + { + "epoch": 14.321566875186178, + "grad_norm": 0.01136741042137146, + "learning_rate": 1.1305742856093964e-05, + "loss": 0.0, + "num_input_tokens_seen": 55788536, + "step": 96155 + }, + { + "epoch": 14.322311587727137, + "grad_norm": 0.0006344551220536232, + "learning_rate": 1.1303024412272046e-05, + "loss": 0.0, + "num_input_tokens_seen": 55791448, + "step": 96160 + }, + { + "epoch": 14.323056300268096, + "grad_norm": 499.6778564453125, + "learning_rate": 1.1300306199846605e-05, + "loss": 0.2802, + "num_input_tokens_seen": 55794392, + "step": 96165 + }, + { + "epoch": 14.323801012809056, + "grad_norm": 0.0029376137536019087, + "learning_rate": 1.1297588218863561e-05, + "loss": 0.0, + "num_input_tokens_seen": 55797208, + "step": 96170 + }, + { + "epoch": 14.324545725350015, + "grad_norm": 0.0007400038884952664, + "learning_rate": 1.1294870469368846e-05, + "loss": 0.0, + "num_input_tokens_seen": 55800088, + "step": 96175 + }, + { + "epoch": 14.325290437890974, + "grad_norm": 0.0036660160403698683, + "learning_rate": 1.1292152951408356e-05, + "loss": 0.0065, + "num_input_tokens_seen": 55802872, + "step": 96180 + }, + { + "epoch": 14.326035150431933, + "grad_norm": 0.0032034392934292555, + "learning_rate": 1.1289435665028016e-05, + "loss": 0.0, + "num_input_tokens_seen": 55805848, + "step": 96185 + }, + { + "epoch": 14.326779862972893, + "grad_norm": 0.009747308678925037, + "learning_rate": 1.1286718610273719e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55808568, + "step": 96190 + }, + { + "epoch": 14.327524575513852, + "grad_norm": 0.0001947565033333376, + "learning_rate": 1.1284001787191381e-05, + "loss": 0.0, + "num_input_tokens_seen": 55811640, + "step": 96195 + }, + { + "epoch": 14.32826928805481, + "grad_norm": 0.0003569418331608176, + "learning_rate": 1.1281285195826884e-05, + "loss": 0.0011, + "num_input_tokens_seen": 55814392, + "step": 96200 + }, + { + "epoch": 14.32901400059577, + "grad_norm": 2.2480430603027344, + "learning_rate": 1.1278568836226142e-05, + "loss": 0.0172, + "num_input_tokens_seen": 55817432, + "step": 96205 + }, + { + "epoch": 14.32975871313673, + "grad_norm": 0.0010580015368759632, + "learning_rate": 1.1275852708435033e-05, + "loss": 0.0, + "num_input_tokens_seen": 55820120, + "step": 96210 + }, + { + "epoch": 14.330503425677689, + "grad_norm": 4.237997055053711, + "learning_rate": 1.127313681249944e-05, + "loss": 0.0273, + "num_input_tokens_seen": 55823096, + "step": 96215 + }, + { + "epoch": 14.331248138218648, + "grad_norm": 0.004175985231995583, + "learning_rate": 1.1270421148465245e-05, + "loss": 0.0, + "num_input_tokens_seen": 55826104, + "step": 96220 + }, + { + "epoch": 14.331992850759606, + "grad_norm": 36.237125396728516, + "learning_rate": 1.1267705716378338e-05, + "loss": 0.0533, + "num_input_tokens_seen": 55829112, + "step": 96225 + }, + { + "epoch": 14.332737563300565, + "grad_norm": 0.000157900620251894, + "learning_rate": 1.1264990516284585e-05, + "loss": 0.0, + "num_input_tokens_seen": 55831864, + "step": 96230 + }, + { + "epoch": 14.333482275841526, + "grad_norm": 0.0004059978818986565, + "learning_rate": 1.126227554822985e-05, + "loss": 0.0, + "num_input_tokens_seen": 55834776, + "step": 96235 + }, + { + "epoch": 14.334226988382484, + "grad_norm": 0.0018205581000074744, + "learning_rate": 1.1259560812260014e-05, + "loss": 0.0, + "num_input_tokens_seen": 55837784, + "step": 96240 + }, + { + "epoch": 14.334971700923443, + "grad_norm": 0.0003788137109950185, + "learning_rate": 1.1256846308420935e-05, + "loss": 0.0, + "num_input_tokens_seen": 55840888, + "step": 96245 + }, + { + "epoch": 14.335716413464402, + "grad_norm": 0.008723946288228035, + "learning_rate": 1.125413203675846e-05, + "loss": 0.0, + "num_input_tokens_seen": 55843768, + "step": 96250 + }, + { + "epoch": 14.336461126005362, + "grad_norm": 0.5247492790222168, + "learning_rate": 1.1251417997318464e-05, + "loss": 0.0, + "num_input_tokens_seen": 55846936, + "step": 96255 + }, + { + "epoch": 14.337205838546321, + "grad_norm": 0.022414423525333405, + "learning_rate": 1.1248704190146778e-05, + "loss": 0.0, + "num_input_tokens_seen": 55849656, + "step": 96260 + }, + { + "epoch": 14.33795055108728, + "grad_norm": 0.0013578588841482997, + "learning_rate": 1.1245990615289264e-05, + "loss": 0.0, + "num_input_tokens_seen": 55852440, + "step": 96265 + }, + { + "epoch": 14.338695263628239, + "grad_norm": 0.0015172765124589205, + "learning_rate": 1.1243277272791755e-05, + "loss": 0.0, + "num_input_tokens_seen": 55855288, + "step": 96270 + }, + { + "epoch": 14.3394399761692, + "grad_norm": 0.00033839387469924986, + "learning_rate": 1.1240564162700101e-05, + "loss": 0.0, + "num_input_tokens_seen": 55858200, + "step": 96275 + }, + { + "epoch": 14.340184688710158, + "grad_norm": 9.426162432646379e-05, + "learning_rate": 1.1237851285060133e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55861080, + "step": 96280 + }, + { + "epoch": 14.340929401251117, + "grad_norm": 0.0001290565705858171, + "learning_rate": 1.123513863991768e-05, + "loss": 0.0, + "num_input_tokens_seen": 55863896, + "step": 96285 + }, + { + "epoch": 14.341674113792076, + "grad_norm": 0.001873877365142107, + "learning_rate": 1.1232426227318568e-05, + "loss": 0.0, + "num_input_tokens_seen": 55867064, + "step": 96290 + }, + { + "epoch": 14.342418826333036, + "grad_norm": 0.012967577204108238, + "learning_rate": 1.1229714047308615e-05, + "loss": 0.0, + "num_input_tokens_seen": 55870136, + "step": 96295 + }, + { + "epoch": 14.343163538873995, + "grad_norm": 1.089712142944336, + "learning_rate": 1.1227002099933657e-05, + "loss": 0.2774, + "num_input_tokens_seen": 55873144, + "step": 96300 + }, + { + "epoch": 14.343908251414954, + "grad_norm": 0.00015039046411402524, + "learning_rate": 1.1224290385239488e-05, + "loss": 0.0, + "num_input_tokens_seen": 55875896, + "step": 96305 + }, + { + "epoch": 14.344652963955912, + "grad_norm": 0.003711041994392872, + "learning_rate": 1.1221578903271943e-05, + "loss": 0.0022, + "num_input_tokens_seen": 55878648, + "step": 96310 + }, + { + "epoch": 14.345397676496873, + "grad_norm": 0.00010968273272737861, + "learning_rate": 1.1218867654076812e-05, + "loss": 0.0, + "num_input_tokens_seen": 55881432, + "step": 96315 + }, + { + "epoch": 14.346142389037832, + "grad_norm": 3.6200828617438674e-05, + "learning_rate": 1.1216156637699909e-05, + "loss": 0.0, + "num_input_tokens_seen": 55884248, + "step": 96320 + }, + { + "epoch": 14.34688710157879, + "grad_norm": 0.00018712635210249573, + "learning_rate": 1.1213445854187035e-05, + "loss": 0.0, + "num_input_tokens_seen": 55887096, + "step": 96325 + }, + { + "epoch": 14.34763181411975, + "grad_norm": 0.012984168715775013, + "learning_rate": 1.1210735303583972e-05, + "loss": 0.001, + "num_input_tokens_seen": 55889912, + "step": 96330 + }, + { + "epoch": 14.34837652666071, + "grad_norm": 0.000605311244726181, + "learning_rate": 1.1208024985936527e-05, + "loss": 0.0, + "num_input_tokens_seen": 55893080, + "step": 96335 + }, + { + "epoch": 14.349121239201668, + "grad_norm": 0.0030503252055495977, + "learning_rate": 1.1205314901290475e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55895928, + "step": 96340 + }, + { + "epoch": 14.349865951742627, + "grad_norm": 0.00029854150488972664, + "learning_rate": 1.120260504969162e-05, + "loss": 0.0005, + "num_input_tokens_seen": 55898904, + "step": 96345 + }, + { + "epoch": 14.350610664283586, + "grad_norm": 0.004343483597040176, + "learning_rate": 1.1199895431185726e-05, + "loss": 0.1377, + "num_input_tokens_seen": 55901816, + "step": 96350 + }, + { + "epoch": 14.351355376824547, + "grad_norm": 0.0010213027708232403, + "learning_rate": 1.1197186045818572e-05, + "loss": 0.0, + "num_input_tokens_seen": 55904568, + "step": 96355 + }, + { + "epoch": 14.352100089365505, + "grad_norm": 0.0030538898427039385, + "learning_rate": 1.1194476893635924e-05, + "loss": 0.0262, + "num_input_tokens_seen": 55907448, + "step": 96360 + }, + { + "epoch": 14.352844801906464, + "grad_norm": 0.00012797670206055045, + "learning_rate": 1.1191767974683567e-05, + "loss": 0.0, + "num_input_tokens_seen": 55910392, + "step": 96365 + }, + { + "epoch": 14.353589514447423, + "grad_norm": 0.00015838864783290774, + "learning_rate": 1.1189059289007256e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55913464, + "step": 96370 + }, + { + "epoch": 14.354334226988382, + "grad_norm": 0.0020790603011846542, + "learning_rate": 1.118635083665274e-05, + "loss": 0.0, + "num_input_tokens_seen": 55916280, + "step": 96375 + }, + { + "epoch": 14.355078939529342, + "grad_norm": 0.019674332812428474, + "learning_rate": 1.1183642617665799e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55919032, + "step": 96380 + }, + { + "epoch": 14.3558236520703, + "grad_norm": 0.00014769710833206773, + "learning_rate": 1.1180934632092163e-05, + "loss": 0.0, + "num_input_tokens_seen": 55921944, + "step": 96385 + }, + { + "epoch": 14.35656836461126, + "grad_norm": 0.021037332713603973, + "learning_rate": 1.11782268799776e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55924856, + "step": 96390 + }, + { + "epoch": 14.357313077152218, + "grad_norm": 1.7567938566207886, + "learning_rate": 1.1175519361367837e-05, + "loss": 0.0003, + "num_input_tokens_seen": 55927704, + "step": 96395 + }, + { + "epoch": 14.358057789693179, + "grad_norm": 0.0027100234292447567, + "learning_rate": 1.1172812076308634e-05, + "loss": 0.0, + "num_input_tokens_seen": 55930456, + "step": 96400 + }, + { + "epoch": 14.358802502234138, + "grad_norm": 6.156685412861407e-05, + "learning_rate": 1.1170105024845718e-05, + "loss": 0.0, + "num_input_tokens_seen": 55933400, + "step": 96405 + }, + { + "epoch": 14.359547214775096, + "grad_norm": 4.293010351830162e-05, + "learning_rate": 1.1167398207024812e-05, + "loss": 0.0, + "num_input_tokens_seen": 55936152, + "step": 96410 + }, + { + "epoch": 14.360291927316055, + "grad_norm": 0.0003471198433544487, + "learning_rate": 1.1164691622891662e-05, + "loss": 0.0006, + "num_input_tokens_seen": 55939256, + "step": 96415 + }, + { + "epoch": 14.361036639857016, + "grad_norm": 0.0018579980824142694, + "learning_rate": 1.1161985272491986e-05, + "loss": 0.2188, + "num_input_tokens_seen": 55941912, + "step": 96420 + }, + { + "epoch": 14.361781352397974, + "grad_norm": 0.00015010641072876751, + "learning_rate": 1.1159279155871507e-05, + "loss": 0.0, + "num_input_tokens_seen": 55944856, + "step": 96425 + }, + { + "epoch": 14.362526064938933, + "grad_norm": 3.5006029065698385e-05, + "learning_rate": 1.115657327307593e-05, + "loss": 0.0008, + "num_input_tokens_seen": 55947704, + "step": 96430 + }, + { + "epoch": 14.363270777479892, + "grad_norm": 0.002356933895498514, + "learning_rate": 1.1153867624150986e-05, + "loss": 0.0, + "num_input_tokens_seen": 55950808, + "step": 96435 + }, + { + "epoch": 14.364015490020853, + "grad_norm": 0.00014583846495952457, + "learning_rate": 1.1151162209142362e-05, + "loss": 0.0, + "num_input_tokens_seen": 55953848, + "step": 96440 + }, + { + "epoch": 14.364760202561811, + "grad_norm": 4.328707291278988e-06, + "learning_rate": 1.1148457028095794e-05, + "loss": 0.0654, + "num_input_tokens_seen": 55957016, + "step": 96445 + }, + { + "epoch": 14.36550491510277, + "grad_norm": 3.9312806620728225e-05, + "learning_rate": 1.1145752081056961e-05, + "loss": 0.0, + "num_input_tokens_seen": 55960024, + "step": 96450 + }, + { + "epoch": 14.366249627643729, + "grad_norm": 0.04181855171918869, + "learning_rate": 1.114304736807156e-05, + "loss": 0.0, + "num_input_tokens_seen": 55963224, + "step": 96455 + }, + { + "epoch": 14.36699434018469, + "grad_norm": 0.00026559524121694267, + "learning_rate": 1.1140342889185299e-05, + "loss": 0.0, + "num_input_tokens_seen": 55966072, + "step": 96460 + }, + { + "epoch": 14.367739052725648, + "grad_norm": 0.001381883048452437, + "learning_rate": 1.1137638644443846e-05, + "loss": 0.0, + "num_input_tokens_seen": 55969016, + "step": 96465 + }, + { + "epoch": 14.368483765266607, + "grad_norm": 0.00010295305401086807, + "learning_rate": 1.113493463389291e-05, + "loss": 0.0, + "num_input_tokens_seen": 55971896, + "step": 96470 + }, + { + "epoch": 14.369228477807566, + "grad_norm": 4.158287265454419e-05, + "learning_rate": 1.1132230857578155e-05, + "loss": 0.0, + "num_input_tokens_seen": 55974936, + "step": 96475 + }, + { + "epoch": 14.369973190348526, + "grad_norm": 0.0024142644833773375, + "learning_rate": 1.1129527315545272e-05, + "loss": 0.0, + "num_input_tokens_seen": 55977752, + "step": 96480 + }, + { + "epoch": 14.370717902889485, + "grad_norm": 4.0728777094045654e-05, + "learning_rate": 1.1126824007839927e-05, + "loss": 0.0, + "num_input_tokens_seen": 55980440, + "step": 96485 + }, + { + "epoch": 14.371462615430444, + "grad_norm": 0.0017113551730290055, + "learning_rate": 1.1124120934507792e-05, + "loss": 0.0, + "num_input_tokens_seen": 55983160, + "step": 96490 + }, + { + "epoch": 14.372207327971402, + "grad_norm": 8.049807547649834e-06, + "learning_rate": 1.112141809559453e-05, + "loss": 0.0, + "num_input_tokens_seen": 55986136, + "step": 96495 + }, + { + "epoch": 14.372952040512363, + "grad_norm": 0.0008287274395115674, + "learning_rate": 1.1118715491145795e-05, + "loss": 0.0, + "num_input_tokens_seen": 55988728, + "step": 96500 + }, + { + "epoch": 14.373696753053322, + "grad_norm": 0.0004202626587357372, + "learning_rate": 1.1116013121207261e-05, + "loss": 0.0, + "num_input_tokens_seen": 55991512, + "step": 96505 + }, + { + "epoch": 14.37444146559428, + "grad_norm": 0.00028137091430835426, + "learning_rate": 1.1113310985824566e-05, + "loss": 0.0, + "num_input_tokens_seen": 55994328, + "step": 96510 + }, + { + "epoch": 14.37518617813524, + "grad_norm": 0.0009499358711764216, + "learning_rate": 1.1110609085043378e-05, + "loss": 0.0001, + "num_input_tokens_seen": 55997464, + "step": 96515 + }, + { + "epoch": 14.3759308906762, + "grad_norm": 0.0007413018611259758, + "learning_rate": 1.1107907418909324e-05, + "loss": 0.0, + "num_input_tokens_seen": 56000184, + "step": 96520 + }, + { + "epoch": 14.376675603217159, + "grad_norm": 0.00020055443746969104, + "learning_rate": 1.1105205987468064e-05, + "loss": 0.0, + "num_input_tokens_seen": 56003096, + "step": 96525 + }, + { + "epoch": 14.377420315758117, + "grad_norm": 0.00040188845014199615, + "learning_rate": 1.1102504790765225e-05, + "loss": 0.0, + "num_input_tokens_seen": 56006008, + "step": 96530 + }, + { + "epoch": 14.378165028299076, + "grad_norm": 0.00017150006897281855, + "learning_rate": 1.1099803828846437e-05, + "loss": 0.1376, + "num_input_tokens_seen": 56008952, + "step": 96535 + }, + { + "epoch": 14.378909740840037, + "grad_norm": 0.008099976927042007, + "learning_rate": 1.1097103101757342e-05, + "loss": 0.0, + "num_input_tokens_seen": 56011672, + "step": 96540 + }, + { + "epoch": 14.379654453380995, + "grad_norm": 1.625135337235406e-05, + "learning_rate": 1.1094402609543561e-05, + "loss": 0.0, + "num_input_tokens_seen": 56014456, + "step": 96545 + }, + { + "epoch": 14.380399165921954, + "grad_norm": 12.119423866271973, + "learning_rate": 1.1091702352250704e-05, + "loss": 0.0151, + "num_input_tokens_seen": 56017464, + "step": 96550 + }, + { + "epoch": 14.381143878462913, + "grad_norm": 11.74055290222168, + "learning_rate": 1.108900232992441e-05, + "loss": 0.0129, + "num_input_tokens_seen": 56020440, + "step": 96555 + }, + { + "epoch": 14.381888591003872, + "grad_norm": 0.00028747066971845925, + "learning_rate": 1.1086302542610285e-05, + "loss": 0.0, + "num_input_tokens_seen": 56023480, + "step": 96560 + }, + { + "epoch": 14.382633303544832, + "grad_norm": 9.9882461654488e-05, + "learning_rate": 1.1083602990353928e-05, + "loss": 0.0, + "num_input_tokens_seen": 56026264, + "step": 96565 + }, + { + "epoch": 14.383378016085791, + "grad_norm": 0.0156976580619812, + "learning_rate": 1.1080903673200962e-05, + "loss": 0.0, + "num_input_tokens_seen": 56029208, + "step": 96570 + }, + { + "epoch": 14.38412272862675, + "grad_norm": 0.0007205933216027915, + "learning_rate": 1.107820459119698e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56032056, + "step": 96575 + }, + { + "epoch": 14.384867441167708, + "grad_norm": 0.00012229947606101632, + "learning_rate": 1.1075505744387577e-05, + "loss": 0.0018, + "num_input_tokens_seen": 56035000, + "step": 96580 + }, + { + "epoch": 14.385612153708669, + "grad_norm": 0.00029287749202921987, + "learning_rate": 1.1072807132818358e-05, + "loss": 0.0, + "num_input_tokens_seen": 56037848, + "step": 96585 + }, + { + "epoch": 14.386356866249628, + "grad_norm": 0.934413731098175, + "learning_rate": 1.10701087565349e-05, + "loss": 0.0046, + "num_input_tokens_seen": 56040536, + "step": 96590 + }, + { + "epoch": 14.387101578790586, + "grad_norm": 2.2591328161070123e-05, + "learning_rate": 1.1067410615582808e-05, + "loss": 0.0, + "num_input_tokens_seen": 56043512, + "step": 96595 + }, + { + "epoch": 14.387846291331545, + "grad_norm": 0.000162658398039639, + "learning_rate": 1.106471271000764e-05, + "loss": 0.0, + "num_input_tokens_seen": 56046360, + "step": 96600 + }, + { + "epoch": 14.388591003872506, + "grad_norm": 0.00020933256018906832, + "learning_rate": 1.1062015039854997e-05, + "loss": 0.0, + "num_input_tokens_seen": 56049240, + "step": 96605 + }, + { + "epoch": 14.389335716413465, + "grad_norm": 0.00048802417586557567, + "learning_rate": 1.1059317605170447e-05, + "loss": 0.0, + "num_input_tokens_seen": 56052120, + "step": 96610 + }, + { + "epoch": 14.390080428954423, + "grad_norm": 0.000322657055221498, + "learning_rate": 1.1056620405999558e-05, + "loss": 0.0, + "num_input_tokens_seen": 56055096, + "step": 96615 + }, + { + "epoch": 14.390825141495382, + "grad_norm": 7.638788520125672e-05, + "learning_rate": 1.1053923442387892e-05, + "loss": 0.0, + "num_input_tokens_seen": 56058232, + "step": 96620 + }, + { + "epoch": 14.391569854036343, + "grad_norm": 0.00022196654754225165, + "learning_rate": 1.1051226714381008e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56061080, + "step": 96625 + }, + { + "epoch": 14.392314566577301, + "grad_norm": 1.568511470395606e-05, + "learning_rate": 1.1048530222024481e-05, + "loss": 0.0, + "num_input_tokens_seen": 56064120, + "step": 96630 + }, + { + "epoch": 14.39305927911826, + "grad_norm": 0.0001134579797508195, + "learning_rate": 1.1045833965363847e-05, + "loss": 0.0, + "num_input_tokens_seen": 56067032, + "step": 96635 + }, + { + "epoch": 14.393803991659219, + "grad_norm": 0.0004119714430999011, + "learning_rate": 1.1043137944444673e-05, + "loss": 0.0, + "num_input_tokens_seen": 56070200, + "step": 96640 + }, + { + "epoch": 14.39454870420018, + "grad_norm": 2.3926473659230396e-05, + "learning_rate": 1.1040442159312491e-05, + "loss": 0.0, + "num_input_tokens_seen": 56073112, + "step": 96645 + }, + { + "epoch": 14.395293416741138, + "grad_norm": 4.3098545575048774e-05, + "learning_rate": 1.1037746610012861e-05, + "loss": 0.0, + "num_input_tokens_seen": 56075768, + "step": 96650 + }, + { + "epoch": 14.396038129282097, + "grad_norm": 0.000609555805567652, + "learning_rate": 1.1035051296591309e-05, + "loss": 0.0, + "num_input_tokens_seen": 56078776, + "step": 96655 + }, + { + "epoch": 14.396782841823056, + "grad_norm": 0.0006346279988065362, + "learning_rate": 1.1032356219093365e-05, + "loss": 0.0, + "num_input_tokens_seen": 56081592, + "step": 96660 + }, + { + "epoch": 14.397527554364016, + "grad_norm": 0.00012209788837935776, + "learning_rate": 1.1029661377564576e-05, + "loss": 0.0, + "num_input_tokens_seen": 56084600, + "step": 96665 + }, + { + "epoch": 14.398272266904975, + "grad_norm": 0.07912632077932358, + "learning_rate": 1.1026966772050448e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56087416, + "step": 96670 + }, + { + "epoch": 14.399016979445934, + "grad_norm": 3.6166940844850615e-05, + "learning_rate": 1.1024272402596526e-05, + "loss": 0.0, + "num_input_tokens_seen": 56090168, + "step": 96675 + }, + { + "epoch": 14.399761691986892, + "grad_norm": 0.0001462690852349624, + "learning_rate": 1.1021578269248314e-05, + "loss": 0.0, + "num_input_tokens_seen": 56092952, + "step": 96680 + }, + { + "epoch": 14.400506404527853, + "grad_norm": 105.3722915649414, + "learning_rate": 1.1018884372051333e-05, + "loss": 0.2129, + "num_input_tokens_seen": 56095800, + "step": 96685 + }, + { + "epoch": 14.401251117068812, + "grad_norm": 0.00019286609312985092, + "learning_rate": 1.1016190711051092e-05, + "loss": 0.0, + "num_input_tokens_seen": 56098712, + "step": 96690 + }, + { + "epoch": 14.40199582960977, + "grad_norm": 12.34611988067627, + "learning_rate": 1.1013497286293085e-05, + "loss": 0.0011, + "num_input_tokens_seen": 56101624, + "step": 96695 + }, + { + "epoch": 14.40274054215073, + "grad_norm": 0.00015486570191569626, + "learning_rate": 1.1010804097822836e-05, + "loss": 0.0, + "num_input_tokens_seen": 56104600, + "step": 96700 + }, + { + "epoch": 14.40348525469169, + "grad_norm": 0.02246270701289177, + "learning_rate": 1.1008111145685824e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56107416, + "step": 96705 + }, + { + "epoch": 14.404229967232649, + "grad_norm": 0.0004669188929256052, + "learning_rate": 1.1005418429927563e-05, + "loss": 0.0, + "num_input_tokens_seen": 56110392, + "step": 96710 + }, + { + "epoch": 14.404974679773607, + "grad_norm": 6.263704563025385e-05, + "learning_rate": 1.1002725950593525e-05, + "loss": 0.0, + "num_input_tokens_seen": 56113112, + "step": 96715 + }, + { + "epoch": 14.405719392314566, + "grad_norm": 0.00046574094449169934, + "learning_rate": 1.1000033707729216e-05, + "loss": 0.0, + "num_input_tokens_seen": 56116216, + "step": 96720 + }, + { + "epoch": 14.406464104855527, + "grad_norm": 0.0008312097052112222, + "learning_rate": 1.0997341701380099e-05, + "loss": 0.0, + "num_input_tokens_seen": 56119128, + "step": 96725 + }, + { + "epoch": 14.407208817396485, + "grad_norm": 7.344246114371344e-05, + "learning_rate": 1.0994649931591669e-05, + "loss": 0.0, + "num_input_tokens_seen": 56121848, + "step": 96730 + }, + { + "epoch": 14.407953529937444, + "grad_norm": 0.002058842685073614, + "learning_rate": 1.0991958398409396e-05, + "loss": 0.0, + "num_input_tokens_seen": 56124856, + "step": 96735 + }, + { + "epoch": 14.408698242478403, + "grad_norm": 0.0007363589247688651, + "learning_rate": 1.0989267101878742e-05, + "loss": 0.0, + "num_input_tokens_seen": 56127576, + "step": 96740 + }, + { + "epoch": 14.409442955019362, + "grad_norm": 0.00011491039185784757, + "learning_rate": 1.0986576042045186e-05, + "loss": 0.0, + "num_input_tokens_seen": 56130456, + "step": 96745 + }, + { + "epoch": 14.410187667560322, + "grad_norm": 0.00045552855590358377, + "learning_rate": 1.0983885218954187e-05, + "loss": 0.0, + "num_input_tokens_seen": 56133400, + "step": 96750 + }, + { + "epoch": 14.410932380101281, + "grad_norm": 0.0004416750161908567, + "learning_rate": 1.0981194632651201e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56136280, + "step": 96755 + }, + { + "epoch": 14.41167709264224, + "grad_norm": 6.613240111619234e-05, + "learning_rate": 1.0978504283181674e-05, + "loss": 0.0, + "num_input_tokens_seen": 56139352, + "step": 96760 + }, + { + "epoch": 14.412421805183198, + "grad_norm": 0.04501212388277054, + "learning_rate": 1.0975814170591076e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56142520, + "step": 96765 + }, + { + "epoch": 14.413166517724159, + "grad_norm": 9.957054135156795e-05, + "learning_rate": 1.0973124294924843e-05, + "loss": 0.0, + "num_input_tokens_seen": 56145208, + "step": 96770 + }, + { + "epoch": 14.413911230265118, + "grad_norm": 0.0014138190308585763, + "learning_rate": 1.0970434656228412e-05, + "loss": 0.0, + "num_input_tokens_seen": 56148440, + "step": 96775 + }, + { + "epoch": 14.414655942806077, + "grad_norm": 0.008600585162639618, + "learning_rate": 1.0967745254547238e-05, + "loss": 0.0, + "num_input_tokens_seen": 56151320, + "step": 96780 + }, + { + "epoch": 14.415400655347035, + "grad_norm": 0.014030758291482925, + "learning_rate": 1.0965056089926734e-05, + "loss": 0.0, + "num_input_tokens_seen": 56154072, + "step": 96785 + }, + { + "epoch": 14.416145367887996, + "grad_norm": 0.006310196127742529, + "learning_rate": 1.0962367162412354e-05, + "loss": 0.1439, + "num_input_tokens_seen": 56156952, + "step": 96790 + }, + { + "epoch": 14.416890080428955, + "grad_norm": 4.710362434387207, + "learning_rate": 1.0959678472049502e-05, + "loss": 0.0052, + "num_input_tokens_seen": 56159480, + "step": 96795 + }, + { + "epoch": 14.417634792969913, + "grad_norm": 0.00013383421173784882, + "learning_rate": 1.0956990018883625e-05, + "loss": 0.0, + "num_input_tokens_seen": 56162456, + "step": 96800 + }, + { + "epoch": 14.418379505510872, + "grad_norm": 0.0001332680694758892, + "learning_rate": 1.0954301802960118e-05, + "loss": 0.0, + "num_input_tokens_seen": 56165368, + "step": 96805 + }, + { + "epoch": 14.419124218051833, + "grad_norm": 3.656594708445482e-05, + "learning_rate": 1.0951613824324417e-05, + "loss": 0.0, + "num_input_tokens_seen": 56168408, + "step": 96810 + }, + { + "epoch": 14.419868930592791, + "grad_norm": 0.0006134359864518046, + "learning_rate": 1.0948926083021921e-05, + "loss": 0.0, + "num_input_tokens_seen": 56171224, + "step": 96815 + }, + { + "epoch": 14.42061364313375, + "grad_norm": 0.00021022898727096617, + "learning_rate": 1.0946238579098036e-05, + "loss": 0.0, + "num_input_tokens_seen": 56173912, + "step": 96820 + }, + { + "epoch": 14.421358355674709, + "grad_norm": 9.39193632802926e-05, + "learning_rate": 1.0943551312598172e-05, + "loss": 0.0, + "num_input_tokens_seen": 56176504, + "step": 96825 + }, + { + "epoch": 14.42210306821567, + "grad_norm": 0.0029029077850282192, + "learning_rate": 1.0940864283567708e-05, + "loss": 0.0, + "num_input_tokens_seen": 56179320, + "step": 96830 + }, + { + "epoch": 14.422847780756628, + "grad_norm": 0.0005559810670092702, + "learning_rate": 1.0938177492052064e-05, + "loss": 0.0794, + "num_input_tokens_seen": 56182296, + "step": 96835 + }, + { + "epoch": 14.423592493297587, + "grad_norm": 2.2361426090355963e-05, + "learning_rate": 1.093549093809661e-05, + "loss": 0.0, + "num_input_tokens_seen": 56185208, + "step": 96840 + }, + { + "epoch": 14.424337205838546, + "grad_norm": 0.00012464646715670824, + "learning_rate": 1.0932804621746751e-05, + "loss": 0.2032, + "num_input_tokens_seen": 56188024, + "step": 96845 + }, + { + "epoch": 14.425081918379506, + "grad_norm": 2.1913339878665283e-05, + "learning_rate": 1.0930118543047862e-05, + "loss": 0.0207, + "num_input_tokens_seen": 56190968, + "step": 96850 + }, + { + "epoch": 14.425826630920465, + "grad_norm": 0.00048566190525889397, + "learning_rate": 1.0927432702045309e-05, + "loss": 0.0, + "num_input_tokens_seen": 56193688, + "step": 96855 + }, + { + "epoch": 14.426571343461424, + "grad_norm": 8.579302084399387e-05, + "learning_rate": 1.0924747098784488e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56196536, + "step": 96860 + }, + { + "epoch": 14.427316056002383, + "grad_norm": 0.0004197127418592572, + "learning_rate": 1.0922061733310751e-05, + "loss": 0.1315, + "num_input_tokens_seen": 56199640, + "step": 96865 + }, + { + "epoch": 14.428060768543343, + "grad_norm": 0.0007027167011983693, + "learning_rate": 1.0919376605669481e-05, + "loss": 0.0008, + "num_input_tokens_seen": 56202456, + "step": 96870 + }, + { + "epoch": 14.428805481084302, + "grad_norm": 0.0013756006956100464, + "learning_rate": 1.0916691715906034e-05, + "loss": 0.0, + "num_input_tokens_seen": 56205336, + "step": 96875 + }, + { + "epoch": 14.42955019362526, + "grad_norm": 0.00023223856987897307, + "learning_rate": 1.0914007064065754e-05, + "loss": 0.0, + "num_input_tokens_seen": 56208184, + "step": 96880 + }, + { + "epoch": 14.43029490616622, + "grad_norm": 0.00014220357115846127, + "learning_rate": 1.091132265019402e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56211096, + "step": 96885 + }, + { + "epoch": 14.43103961870718, + "grad_norm": 0.0008324861410073936, + "learning_rate": 1.0908638474336172e-05, + "loss": 0.0, + "num_input_tokens_seen": 56213624, + "step": 96890 + }, + { + "epoch": 14.431784331248139, + "grad_norm": 0.0007916170870885253, + "learning_rate": 1.0905954536537551e-05, + "loss": 0.0, + "num_input_tokens_seen": 56216536, + "step": 96895 + }, + { + "epoch": 14.432529043789097, + "grad_norm": 0.00032473678584210575, + "learning_rate": 1.0903270836843499e-05, + "loss": 0.0, + "num_input_tokens_seen": 56219064, + "step": 96900 + }, + { + "epoch": 14.433273756330056, + "grad_norm": 0.0027431901544332504, + "learning_rate": 1.0900587375299365e-05, + "loss": 0.0006, + "num_input_tokens_seen": 56221944, + "step": 96905 + }, + { + "epoch": 14.434018468871017, + "grad_norm": 0.0005576997646130621, + "learning_rate": 1.0897904151950469e-05, + "loss": 0.0, + "num_input_tokens_seen": 56224952, + "step": 96910 + }, + { + "epoch": 14.434763181411975, + "grad_norm": 0.00011362494842614979, + "learning_rate": 1.089522116684216e-05, + "loss": 0.0, + "num_input_tokens_seen": 56228056, + "step": 96915 + }, + { + "epoch": 14.435507893952934, + "grad_norm": 0.0005063116550445557, + "learning_rate": 1.0892538420019744e-05, + "loss": 0.0, + "num_input_tokens_seen": 56230904, + "step": 96920 + }, + { + "epoch": 14.436252606493893, + "grad_norm": 0.0003806002496276051, + "learning_rate": 1.0889855911528562e-05, + "loss": 0.0, + "num_input_tokens_seen": 56233816, + "step": 96925 + }, + { + "epoch": 14.436997319034852, + "grad_norm": 0.0013707184698432684, + "learning_rate": 1.0887173641413923e-05, + "loss": 0.0, + "num_input_tokens_seen": 56236984, + "step": 96930 + }, + { + "epoch": 14.437742031575812, + "grad_norm": 0.0014358869520947337, + "learning_rate": 1.0884491609721133e-05, + "loss": 0.0007, + "num_input_tokens_seen": 56239928, + "step": 96935 + }, + { + "epoch": 14.438486744116771, + "grad_norm": 0.00018575166177470237, + "learning_rate": 1.088180981649552e-05, + "loss": 0.0, + "num_input_tokens_seen": 56242712, + "step": 96940 + }, + { + "epoch": 14.43923145665773, + "grad_norm": 0.0021967801731079817, + "learning_rate": 1.0879128261782382e-05, + "loss": 0.0, + "num_input_tokens_seen": 56245560, + "step": 96945 + }, + { + "epoch": 14.439976169198689, + "grad_norm": 0.0010040415218099952, + "learning_rate": 1.0876446945627019e-05, + "loss": 0.0, + "num_input_tokens_seen": 56248440, + "step": 96950 + }, + { + "epoch": 14.440720881739649, + "grad_norm": 0.06380322575569153, + "learning_rate": 1.0873765868074723e-05, + "loss": 0.0, + "num_input_tokens_seen": 56251224, + "step": 96955 + }, + { + "epoch": 14.441465594280608, + "grad_norm": 0.0005814892938360572, + "learning_rate": 1.0871085029170802e-05, + "loss": 0.0, + "num_input_tokens_seen": 56254296, + "step": 96960 + }, + { + "epoch": 14.442210306821567, + "grad_norm": 0.003315537003800273, + "learning_rate": 1.0868404428960532e-05, + "loss": 0.0, + "num_input_tokens_seen": 56257112, + "step": 96965 + }, + { + "epoch": 14.442955019362525, + "grad_norm": 4.1991923353634775e-05, + "learning_rate": 1.0865724067489214e-05, + "loss": 0.0, + "num_input_tokens_seen": 56259896, + "step": 96970 + }, + { + "epoch": 14.443699731903486, + "grad_norm": 0.024518122896552086, + "learning_rate": 1.0863043944802123e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56262616, + "step": 96975 + }, + { + "epoch": 14.444444444444445, + "grad_norm": 1.5012956857681274, + "learning_rate": 1.0860364060944527e-05, + "loss": 0.0021, + "num_input_tokens_seen": 56265592, + "step": 96980 + }, + { + "epoch": 14.445189156985403, + "grad_norm": 0.00027652495191432536, + "learning_rate": 1.0857684415961721e-05, + "loss": 0.0, + "num_input_tokens_seen": 56268920, + "step": 96985 + }, + { + "epoch": 14.445933869526362, + "grad_norm": 0.0019519239431247115, + "learning_rate": 1.0855005009898953e-05, + "loss": 0.0, + "num_input_tokens_seen": 56271896, + "step": 96990 + }, + { + "epoch": 14.446678582067323, + "grad_norm": 0.011409715749323368, + "learning_rate": 1.0852325842801506e-05, + "loss": 0.0, + "num_input_tokens_seen": 56274680, + "step": 96995 + }, + { + "epoch": 14.447423294608281, + "grad_norm": 1.706007719039917, + "learning_rate": 1.0849646914714628e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56277432, + "step": 97000 + }, + { + "epoch": 14.44816800714924, + "grad_norm": 0.00016842735931277275, + "learning_rate": 1.0846968225683591e-05, + "loss": 0.0, + "num_input_tokens_seen": 56280600, + "step": 97005 + }, + { + "epoch": 14.448912719690199, + "grad_norm": 8.151819201884791e-05, + "learning_rate": 1.0844289775753645e-05, + "loss": 0.0005, + "num_input_tokens_seen": 56283384, + "step": 97010 + }, + { + "epoch": 14.44965743223116, + "grad_norm": 4.185438956483267e-05, + "learning_rate": 1.084161156497003e-05, + "loss": 0.0, + "num_input_tokens_seen": 56286232, + "step": 97015 + }, + { + "epoch": 14.450402144772118, + "grad_norm": 0.00011352646833984181, + "learning_rate": 1.0838933593378e-05, + "loss": 0.0, + "num_input_tokens_seen": 56288824, + "step": 97020 + }, + { + "epoch": 14.451146857313077, + "grad_norm": 0.0006683003739453852, + "learning_rate": 1.0836255861022788e-05, + "loss": 0.0, + "num_input_tokens_seen": 56291832, + "step": 97025 + }, + { + "epoch": 14.451891569854036, + "grad_norm": 2.029083225352224e-05, + "learning_rate": 1.0833578367949646e-05, + "loss": 0.0, + "num_input_tokens_seen": 56295128, + "step": 97030 + }, + { + "epoch": 14.452636282394996, + "grad_norm": 0.14282740652561188, + "learning_rate": 1.0830901114203786e-05, + "loss": 0.0004, + "num_input_tokens_seen": 56297880, + "step": 97035 + }, + { + "epoch": 14.453380994935955, + "grad_norm": 0.10901832580566406, + "learning_rate": 1.0828224099830464e-05, + "loss": 0.0, + "num_input_tokens_seen": 56300472, + "step": 97040 + }, + { + "epoch": 14.454125707476914, + "grad_norm": 611.4129638671875, + "learning_rate": 1.0825547324874883e-05, + "loss": 0.1128, + "num_input_tokens_seen": 56303512, + "step": 97045 + }, + { + "epoch": 14.454870420017873, + "grad_norm": 0.00010828422091435641, + "learning_rate": 1.0822870789382283e-05, + "loss": 0.0, + "num_input_tokens_seen": 56306456, + "step": 97050 + }, + { + "epoch": 14.455615132558833, + "grad_norm": 9.447614502278157e-06, + "learning_rate": 1.082019449339787e-05, + "loss": 0.0, + "num_input_tokens_seen": 56309560, + "step": 97055 + }, + { + "epoch": 14.456359845099792, + "grad_norm": 0.0004339801671449095, + "learning_rate": 1.0817518436966852e-05, + "loss": 0.0, + "num_input_tokens_seen": 56312344, + "step": 97060 + }, + { + "epoch": 14.45710455764075, + "grad_norm": 0.00012906870688311756, + "learning_rate": 1.0814842620134456e-05, + "loss": 0.0, + "num_input_tokens_seen": 56315160, + "step": 97065 + }, + { + "epoch": 14.45784927018171, + "grad_norm": 0.12455279380083084, + "learning_rate": 1.0812167042945864e-05, + "loss": 0.0, + "num_input_tokens_seen": 56317944, + "step": 97070 + }, + { + "epoch": 14.458593982722668, + "grad_norm": 5.96616264374461e-05, + "learning_rate": 1.08094917054463e-05, + "loss": 0.0, + "num_input_tokens_seen": 56320856, + "step": 97075 + }, + { + "epoch": 14.459338695263629, + "grad_norm": 0.00023703780607320368, + "learning_rate": 1.0806816607680954e-05, + "loss": 0.0, + "num_input_tokens_seen": 56323896, + "step": 97080 + }, + { + "epoch": 14.460083407804587, + "grad_norm": 7.255312812048942e-05, + "learning_rate": 1.0804141749695012e-05, + "loss": 0.0, + "num_input_tokens_seen": 56326808, + "step": 97085 + }, + { + "epoch": 14.460828120345546, + "grad_norm": 0.006862129550427198, + "learning_rate": 1.0801467131533669e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56329784, + "step": 97090 + }, + { + "epoch": 14.461572832886505, + "grad_norm": 0.02923564426600933, + "learning_rate": 1.0798792753242099e-05, + "loss": 0.0, + "num_input_tokens_seen": 56332376, + "step": 97095 + }, + { + "epoch": 14.462317545427466, + "grad_norm": 6.581960042240098e-05, + "learning_rate": 1.0796118614865503e-05, + "loss": 0.0588, + "num_input_tokens_seen": 56335512, + "step": 97100 + }, + { + "epoch": 14.463062257968424, + "grad_norm": 0.0004137355135753751, + "learning_rate": 1.0793444716449033e-05, + "loss": 0.0, + "num_input_tokens_seen": 56338520, + "step": 97105 + }, + { + "epoch": 14.463806970509383, + "grad_norm": 0.00020689937809947878, + "learning_rate": 1.0790771058037889e-05, + "loss": 0.0, + "num_input_tokens_seen": 56341560, + "step": 97110 + }, + { + "epoch": 14.464551683050342, + "grad_norm": 3.508450390654616e-05, + "learning_rate": 1.0788097639677216e-05, + "loss": 0.2156, + "num_input_tokens_seen": 56344504, + "step": 97115 + }, + { + "epoch": 14.465296395591302, + "grad_norm": 0.005532002076506615, + "learning_rate": 1.0785424461412197e-05, + "loss": 0.0, + "num_input_tokens_seen": 56347448, + "step": 97120 + }, + { + "epoch": 14.466041108132261, + "grad_norm": 6.411971569061279, + "learning_rate": 1.0782751523287977e-05, + "loss": 0.073, + "num_input_tokens_seen": 56350232, + "step": 97125 + }, + { + "epoch": 14.46678582067322, + "grad_norm": 0.12495335936546326, + "learning_rate": 1.0780078825349729e-05, + "loss": 0.0853, + "num_input_tokens_seen": 56353144, + "step": 97130 + }, + { + "epoch": 14.467530533214179, + "grad_norm": 0.0010530464351177216, + "learning_rate": 1.0777406367642595e-05, + "loss": 0.0, + "num_input_tokens_seen": 56355832, + "step": 97135 + }, + { + "epoch": 14.46827524575514, + "grad_norm": 0.00018613530846778303, + "learning_rate": 1.0774734150211718e-05, + "loss": 0.0, + "num_input_tokens_seen": 56358488, + "step": 97140 + }, + { + "epoch": 14.469019958296098, + "grad_norm": 0.9155115485191345, + "learning_rate": 1.077206217310226e-05, + "loss": 0.1474, + "num_input_tokens_seen": 56361528, + "step": 97145 + }, + { + "epoch": 14.469764670837057, + "grad_norm": 0.0025598020292818546, + "learning_rate": 1.0769390436359348e-05, + "loss": 0.0, + "num_input_tokens_seen": 56364664, + "step": 97150 + }, + { + "epoch": 14.470509383378015, + "grad_norm": 0.020626822486519814, + "learning_rate": 1.0766718940028123e-05, + "loss": 0.0, + "num_input_tokens_seen": 56367416, + "step": 97155 + }, + { + "epoch": 14.471254095918976, + "grad_norm": 1.8112823454430327e-05, + "learning_rate": 1.0764047684153705e-05, + "loss": 0.3878, + "num_input_tokens_seen": 56370072, + "step": 97160 + }, + { + "epoch": 14.471998808459935, + "grad_norm": 0.0013122978853061795, + "learning_rate": 1.0761376668781244e-05, + "loss": 0.0, + "num_input_tokens_seen": 56373080, + "step": 97165 + }, + { + "epoch": 14.472743521000893, + "grad_norm": 0.0005250665126368403, + "learning_rate": 1.0758705893955843e-05, + "loss": 0.0, + "num_input_tokens_seen": 56375960, + "step": 97170 + }, + { + "epoch": 14.473488233541852, + "grad_norm": 15.277353286743164, + "learning_rate": 1.0756035359722639e-05, + "loss": 0.0191, + "num_input_tokens_seen": 56379128, + "step": 97175 + }, + { + "epoch": 14.474232946082813, + "grad_norm": 0.002824692288413644, + "learning_rate": 1.0753365066126741e-05, + "loss": 0.0, + "num_input_tokens_seen": 56382072, + "step": 97180 + }, + { + "epoch": 14.474977658623772, + "grad_norm": 0.0003704246773850173, + "learning_rate": 1.0750695013213251e-05, + "loss": 0.0, + "num_input_tokens_seen": 56384824, + "step": 97185 + }, + { + "epoch": 14.47572237116473, + "grad_norm": 0.00010557129280641675, + "learning_rate": 1.0748025201027298e-05, + "loss": 0.0, + "num_input_tokens_seen": 56388280, + "step": 97190 + }, + { + "epoch": 14.476467083705689, + "grad_norm": 1.905690078274347e-05, + "learning_rate": 1.0745355629613965e-05, + "loss": 0.0, + "num_input_tokens_seen": 56391000, + "step": 97195 + }, + { + "epoch": 14.47721179624665, + "grad_norm": 0.17129546403884888, + "learning_rate": 1.0742686299018368e-05, + "loss": 0.0, + "num_input_tokens_seen": 56393848, + "step": 97200 + }, + { + "epoch": 14.477956508787608, + "grad_norm": 11.369583129882812, + "learning_rate": 1.0740017209285597e-05, + "loss": 0.1036, + "num_input_tokens_seen": 56396536, + "step": 97205 + }, + { + "epoch": 14.478701221328567, + "grad_norm": 0.0017694131238386035, + "learning_rate": 1.0737348360460733e-05, + "loss": 0.0, + "num_input_tokens_seen": 56399256, + "step": 97210 + }, + { + "epoch": 14.479445933869526, + "grad_norm": 1.617581619939301e-05, + "learning_rate": 1.073467975258888e-05, + "loss": 0.0, + "num_input_tokens_seen": 56402008, + "step": 97215 + }, + { + "epoch": 14.480190646410486, + "grad_norm": 0.002962603699415922, + "learning_rate": 1.0732011385715116e-05, + "loss": 0.0, + "num_input_tokens_seen": 56404952, + "step": 97220 + }, + { + "epoch": 14.480935358951445, + "grad_norm": 0.0009159796172752976, + "learning_rate": 1.0729343259884516e-05, + "loss": 0.0, + "num_input_tokens_seen": 56408312, + "step": 97225 + }, + { + "epoch": 14.481680071492404, + "grad_norm": 0.004837993532419205, + "learning_rate": 1.0726675375142151e-05, + "loss": 0.0, + "num_input_tokens_seen": 56411096, + "step": 97230 + }, + { + "epoch": 14.482424784033363, + "grad_norm": 0.000743427372071892, + "learning_rate": 1.0724007731533107e-05, + "loss": 0.0, + "num_input_tokens_seen": 56413816, + "step": 97235 + }, + { + "epoch": 14.483169496574323, + "grad_norm": 0.00019117385090794414, + "learning_rate": 1.072134032910243e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56416664, + "step": 97240 + }, + { + "epoch": 14.483914209115282, + "grad_norm": 0.00014216633280739188, + "learning_rate": 1.071867316789521e-05, + "loss": 0.0, + "num_input_tokens_seen": 56419512, + "step": 97245 + }, + { + "epoch": 14.48465892165624, + "grad_norm": 0.00023736782895866781, + "learning_rate": 1.0716006247956481e-05, + "loss": 0.0, + "num_input_tokens_seen": 56422424, + "step": 97250 + }, + { + "epoch": 14.4854036341972, + "grad_norm": 3.976112202508375e-05, + "learning_rate": 1.0713339569331318e-05, + "loss": 0.0, + "num_input_tokens_seen": 56425496, + "step": 97255 + }, + { + "epoch": 14.486148346738158, + "grad_norm": 0.00023864074319135398, + "learning_rate": 1.0710673132064764e-05, + "loss": 0.0, + "num_input_tokens_seen": 56428504, + "step": 97260 + }, + { + "epoch": 14.486893059279119, + "grad_norm": 0.02011956460773945, + "learning_rate": 1.0708006936201853e-05, + "loss": 0.0, + "num_input_tokens_seen": 56431352, + "step": 97265 + }, + { + "epoch": 14.487637771820078, + "grad_norm": 0.020158717408776283, + "learning_rate": 1.0705340981787648e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56434040, + "step": 97270 + }, + { + "epoch": 14.488382484361036, + "grad_norm": 4.360917955636978e-05, + "learning_rate": 1.070267526886718e-05, + "loss": 0.0666, + "num_input_tokens_seen": 56436792, + "step": 97275 + }, + { + "epoch": 14.489127196901995, + "grad_norm": 2.8811135052819736e-05, + "learning_rate": 1.0700009797485483e-05, + "loss": 0.0008, + "num_input_tokens_seen": 56439608, + "step": 97280 + }, + { + "epoch": 14.489871909442956, + "grad_norm": 0.0012390818446874619, + "learning_rate": 1.0697344567687575e-05, + "loss": 0.0, + "num_input_tokens_seen": 56442328, + "step": 97285 + }, + { + "epoch": 14.490616621983914, + "grad_norm": 0.001417804043740034, + "learning_rate": 1.0694679579518508e-05, + "loss": 0.0, + "num_input_tokens_seen": 56445400, + "step": 97290 + }, + { + "epoch": 14.491361334524873, + "grad_norm": 0.0009472024976275861, + "learning_rate": 1.0692014833023283e-05, + "loss": 0.0129, + "num_input_tokens_seen": 56448504, + "step": 97295 + }, + { + "epoch": 14.492106047065832, + "grad_norm": 0.00018419134721625596, + "learning_rate": 1.0689350328246922e-05, + "loss": 0.0329, + "num_input_tokens_seen": 56451160, + "step": 97300 + }, + { + "epoch": 14.492850759606792, + "grad_norm": 0.19395701587200165, + "learning_rate": 1.068668606523445e-05, + "loss": 0.0299, + "num_input_tokens_seen": 56453720, + "step": 97305 + }, + { + "epoch": 14.493595472147751, + "grad_norm": 0.0007070049759931862, + "learning_rate": 1.0684022044030861e-05, + "loss": 0.0, + "num_input_tokens_seen": 56456664, + "step": 97310 + }, + { + "epoch": 14.49434018468871, + "grad_norm": 0.00019209070887882262, + "learning_rate": 1.0681358264681176e-05, + "loss": 0.0, + "num_input_tokens_seen": 56459384, + "step": 97315 + }, + { + "epoch": 14.495084897229669, + "grad_norm": 0.00031150277936831117, + "learning_rate": 1.0678694727230384e-05, + "loss": 0.0, + "num_input_tokens_seen": 56462520, + "step": 97320 + }, + { + "epoch": 14.49582960977063, + "grad_norm": 8.883758709998801e-05, + "learning_rate": 1.0676031431723497e-05, + "loss": 0.0, + "num_input_tokens_seen": 56465304, + "step": 97325 + }, + { + "epoch": 14.496574322311588, + "grad_norm": 0.0023424869868904352, + "learning_rate": 1.0673368378205492e-05, + "loss": 0.0, + "num_input_tokens_seen": 56467960, + "step": 97330 + }, + { + "epoch": 14.497319034852547, + "grad_norm": 0.0002517763350624591, + "learning_rate": 1.0670705566721376e-05, + "loss": 0.0, + "num_input_tokens_seen": 56471128, + "step": 97335 + }, + { + "epoch": 14.498063747393505, + "grad_norm": 4.984138740837807e-06, + "learning_rate": 1.0668042997316126e-05, + "loss": 0.0, + "num_input_tokens_seen": 56473944, + "step": 97340 + }, + { + "epoch": 14.498808459934466, + "grad_norm": 0.00039582382305525243, + "learning_rate": 1.0665380670034725e-05, + "loss": 0.0257, + "num_input_tokens_seen": 56476760, + "step": 97345 + }, + { + "epoch": 14.499553172475425, + "grad_norm": 0.0008459406672045588, + "learning_rate": 1.0662718584922145e-05, + "loss": 0.0007, + "num_input_tokens_seen": 56479736, + "step": 97350 + }, + { + "epoch": 14.500297885016384, + "grad_norm": 7.544297841377556e-05, + "learning_rate": 1.0660056742023355e-05, + "loss": 0.1668, + "num_input_tokens_seen": 56482776, + "step": 97355 + }, + { + "epoch": 14.501042597557342, + "grad_norm": 1.9804678231594153e-05, + "learning_rate": 1.0657395141383342e-05, + "loss": 0.0079, + "num_input_tokens_seen": 56486168, + "step": 97360 + }, + { + "epoch": 14.501787310098303, + "grad_norm": 5.69395961065311e-05, + "learning_rate": 1.0654733783047052e-05, + "loss": 0.0, + "num_input_tokens_seen": 56489016, + "step": 97365 + }, + { + "epoch": 14.502532022639262, + "grad_norm": 0.0004364679625723511, + "learning_rate": 1.0652072667059462e-05, + "loss": 0.2409, + "num_input_tokens_seen": 56491928, + "step": 97370 + }, + { + "epoch": 14.50327673518022, + "grad_norm": 0.0006286298739723861, + "learning_rate": 1.0649411793465525e-05, + "loss": 0.0, + "num_input_tokens_seen": 56494840, + "step": 97375 + }, + { + "epoch": 14.504021447721179, + "grad_norm": 0.5964275598526001, + "learning_rate": 1.0646751162310178e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56497944, + "step": 97380 + }, + { + "epoch": 14.50476616026214, + "grad_norm": 0.00010276511602569371, + "learning_rate": 1.0644090773638394e-05, + "loss": 0.0, + "num_input_tokens_seen": 56500792, + "step": 97385 + }, + { + "epoch": 14.505510872803098, + "grad_norm": 0.0005068235914222896, + "learning_rate": 1.0641430627495094e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56504024, + "step": 97390 + }, + { + "epoch": 14.506255585344057, + "grad_norm": 0.36927223205566406, + "learning_rate": 1.063877072392524e-05, + "loss": 0.0261, + "num_input_tokens_seen": 56506776, + "step": 97395 + }, + { + "epoch": 14.507000297885016, + "grad_norm": 0.00017897193902172148, + "learning_rate": 1.063611106297375e-05, + "loss": 0.3603, + "num_input_tokens_seen": 56509592, + "step": 97400 + }, + { + "epoch": 14.507745010425975, + "grad_norm": 0.005935434252023697, + "learning_rate": 1.0633451644685572e-05, + "loss": 0.0, + "num_input_tokens_seen": 56512312, + "step": 97405 + }, + { + "epoch": 14.508489722966935, + "grad_norm": 0.0002099145349347964, + "learning_rate": 1.063079246910563e-05, + "loss": 0.0, + "num_input_tokens_seen": 56515352, + "step": 97410 + }, + { + "epoch": 14.509234435507894, + "grad_norm": 0.0006815956439822912, + "learning_rate": 1.0628133536278842e-05, + "loss": 0.0, + "num_input_tokens_seen": 56518168, + "step": 97415 + }, + { + "epoch": 14.509979148048853, + "grad_norm": 5.477006561704911e-05, + "learning_rate": 1.0625474846250134e-05, + "loss": 0.0007, + "num_input_tokens_seen": 56521144, + "step": 97420 + }, + { + "epoch": 14.510723860589813, + "grad_norm": 94.25749969482422, + "learning_rate": 1.062281639906441e-05, + "loss": 0.26, + "num_input_tokens_seen": 56523864, + "step": 97425 + }, + { + "epoch": 14.511468573130772, + "grad_norm": 4.2845898860832676e-05, + "learning_rate": 1.0620158194766597e-05, + "loss": 0.0, + "num_input_tokens_seen": 56526456, + "step": 97430 + }, + { + "epoch": 14.51221328567173, + "grad_norm": 0.00033932377118617296, + "learning_rate": 1.0617500233401587e-05, + "loss": 0.0375, + "num_input_tokens_seen": 56529112, + "step": 97435 + }, + { + "epoch": 14.51295799821269, + "grad_norm": 0.000988850137218833, + "learning_rate": 1.0614842515014303e-05, + "loss": 0.1252, + "num_input_tokens_seen": 56531832, + "step": 97440 + }, + { + "epoch": 14.513702710753648, + "grad_norm": 0.003462241729721427, + "learning_rate": 1.0612185039649625e-05, + "loss": 0.0, + "num_input_tokens_seen": 56534680, + "step": 97445 + }, + { + "epoch": 14.514447423294609, + "grad_norm": 4.125473424210213e-06, + "learning_rate": 1.0609527807352469e-05, + "loss": 0.0, + "num_input_tokens_seen": 56537432, + "step": 97450 + }, + { + "epoch": 14.515192135835568, + "grad_norm": 4.2693678551586345e-05, + "learning_rate": 1.060687081816771e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56540280, + "step": 97455 + }, + { + "epoch": 14.515936848376526, + "grad_norm": 0.0005918437382206321, + "learning_rate": 1.0604214072140233e-05, + "loss": 0.0, + "num_input_tokens_seen": 56543480, + "step": 97460 + }, + { + "epoch": 14.516681560917485, + "grad_norm": 0.0023674832191318274, + "learning_rate": 1.0601557569314941e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56546424, + "step": 97465 + }, + { + "epoch": 14.517426273458446, + "grad_norm": 0.004437402822077274, + "learning_rate": 1.0598901309736686e-05, + "loss": 0.0, + "num_input_tokens_seen": 56549496, + "step": 97470 + }, + { + "epoch": 14.518170985999404, + "grad_norm": 7.164695853134617e-05, + "learning_rate": 1.0596245293450368e-05, + "loss": 0.0004, + "num_input_tokens_seen": 56552344, + "step": 97475 + }, + { + "epoch": 14.518915698540363, + "grad_norm": 8.111685929179657e-06, + "learning_rate": 1.0593589520500846e-05, + "loss": 0.0, + "num_input_tokens_seen": 56555320, + "step": 97480 + }, + { + "epoch": 14.519660411081322, + "grad_norm": 0.0008297700551338494, + "learning_rate": 1.0590933990932989e-05, + "loss": 0.4008, + "num_input_tokens_seen": 56558232, + "step": 97485 + }, + { + "epoch": 14.520405123622282, + "grad_norm": 0.00026086362777277827, + "learning_rate": 1.0588278704791646e-05, + "loss": 0.0, + "num_input_tokens_seen": 56561368, + "step": 97490 + }, + { + "epoch": 14.521149836163241, + "grad_norm": 1.7114021829911508e-05, + "learning_rate": 1.0585623662121696e-05, + "loss": 0.0, + "num_input_tokens_seen": 56563992, + "step": 97495 + }, + { + "epoch": 14.5218945487042, + "grad_norm": 9.49953609961085e-05, + "learning_rate": 1.0582968862967984e-05, + "loss": 0.0122, + "num_input_tokens_seen": 56566968, + "step": 97500 + }, + { + "epoch": 14.522639261245159, + "grad_norm": 0.00012043459719279781, + "learning_rate": 1.0580314307375352e-05, + "loss": 0.0, + "num_input_tokens_seen": 56569784, + "step": 97505 + }, + { + "epoch": 14.52338397378612, + "grad_norm": 0.00021744870173279196, + "learning_rate": 1.0577659995388664e-05, + "loss": 0.0, + "num_input_tokens_seen": 56572408, + "step": 97510 + }, + { + "epoch": 14.524128686327078, + "grad_norm": 0.00815147627145052, + "learning_rate": 1.0575005927052743e-05, + "loss": 0.0, + "num_input_tokens_seen": 56575352, + "step": 97515 + }, + { + "epoch": 14.524873398868037, + "grad_norm": 0.005301278550177813, + "learning_rate": 1.0572352102412442e-05, + "loss": 0.0, + "num_input_tokens_seen": 56578424, + "step": 97520 + }, + { + "epoch": 14.525618111408996, + "grad_norm": 0.0025902297347784042, + "learning_rate": 1.0569698521512583e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56581496, + "step": 97525 + }, + { + "epoch": 14.526362823949956, + "grad_norm": 0.0008477800874970853, + "learning_rate": 1.0567045184398009e-05, + "loss": 0.0, + "num_input_tokens_seen": 56584088, + "step": 97530 + }, + { + "epoch": 14.527107536490915, + "grad_norm": 0.00031248482991941273, + "learning_rate": 1.0564392091113537e-05, + "loss": 0.0, + "num_input_tokens_seen": 56587160, + "step": 97535 + }, + { + "epoch": 14.527852249031874, + "grad_norm": 0.0005751943681389093, + "learning_rate": 1.056173924170398e-05, + "loss": 0.0347, + "num_input_tokens_seen": 56589912, + "step": 97540 + }, + { + "epoch": 14.528596961572832, + "grad_norm": 0.0004980321391485631, + "learning_rate": 1.0559086636214174e-05, + "loss": 0.2594, + "num_input_tokens_seen": 56592792, + "step": 97545 + }, + { + "epoch": 14.529341674113793, + "grad_norm": 0.0003783975844271481, + "learning_rate": 1.0556434274688923e-05, + "loss": 0.0, + "num_input_tokens_seen": 56595896, + "step": 97550 + }, + { + "epoch": 14.530086386654752, + "grad_norm": 3.7507554679905297e-06, + "learning_rate": 1.0553782157173034e-05, + "loss": 0.0, + "num_input_tokens_seen": 56598776, + "step": 97555 + }, + { + "epoch": 14.53083109919571, + "grad_norm": 0.0003148977120872587, + "learning_rate": 1.0551130283711302e-05, + "loss": 0.0, + "num_input_tokens_seen": 56601720, + "step": 97560 + }, + { + "epoch": 14.53157581173667, + "grad_norm": 0.00010659374675014988, + "learning_rate": 1.0548478654348551e-05, + "loss": 0.0007, + "num_input_tokens_seen": 56604440, + "step": 97565 + }, + { + "epoch": 14.53232052427763, + "grad_norm": 0.0001134490521508269, + "learning_rate": 1.0545827269129554e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56607352, + "step": 97570 + }, + { + "epoch": 14.533065236818588, + "grad_norm": 0.00020361282804515213, + "learning_rate": 1.0543176128099126e-05, + "loss": 0.0, + "num_input_tokens_seen": 56610264, + "step": 97575 + }, + { + "epoch": 14.533809949359547, + "grad_norm": 1.5405230442411266e-05, + "learning_rate": 1.0540525231302043e-05, + "loss": 0.0, + "num_input_tokens_seen": 56613176, + "step": 97580 + }, + { + "epoch": 14.534554661900506, + "grad_norm": 25.77923011779785, + "learning_rate": 1.053787457878308e-05, + "loss": 0.0883, + "num_input_tokens_seen": 56615800, + "step": 97585 + }, + { + "epoch": 14.535299374441465, + "grad_norm": 0.0005194924306124449, + "learning_rate": 1.0535224170587038e-05, + "loss": 0.0, + "num_input_tokens_seen": 56619160, + "step": 97590 + }, + { + "epoch": 14.536044086982425, + "grad_norm": 0.0007107568089850247, + "learning_rate": 1.0532574006758673e-05, + "loss": 0.2375, + "num_input_tokens_seen": 56622168, + "step": 97595 + }, + { + "epoch": 14.536788799523384, + "grad_norm": 0.00015207605611067265, + "learning_rate": 1.0529924087342774e-05, + "loss": 0.0, + "num_input_tokens_seen": 56625016, + "step": 97600 + }, + { + "epoch": 14.537533512064343, + "grad_norm": 0.0003738752275239676, + "learning_rate": 1.05272744123841e-05, + "loss": 0.0, + "num_input_tokens_seen": 56627864, + "step": 97605 + }, + { + "epoch": 14.538278224605303, + "grad_norm": 0.00040701127727515996, + "learning_rate": 1.0524624981927416e-05, + "loss": 0.0, + "num_input_tokens_seen": 56630808, + "step": 97610 + }, + { + "epoch": 14.539022937146262, + "grad_norm": 0.0002752139698714018, + "learning_rate": 1.0521975796017483e-05, + "loss": 0.0, + "num_input_tokens_seen": 56633688, + "step": 97615 + }, + { + "epoch": 14.53976764968722, + "grad_norm": 0.004969279747456312, + "learning_rate": 1.0519326854699043e-05, + "loss": 0.0, + "num_input_tokens_seen": 56636504, + "step": 97620 + }, + { + "epoch": 14.54051236222818, + "grad_norm": 4.505858214542968e-06, + "learning_rate": 1.0516678158016868e-05, + "loss": 0.0226, + "num_input_tokens_seen": 56639224, + "step": 97625 + }, + { + "epoch": 14.541257074769138, + "grad_norm": 0.0007506767869926989, + "learning_rate": 1.0514029706015687e-05, + "loss": 0.0, + "num_input_tokens_seen": 56642040, + "step": 97630 + }, + { + "epoch": 14.542001787310099, + "grad_norm": 0.016351642087101936, + "learning_rate": 1.051138149874026e-05, + "loss": 0.0035, + "num_input_tokens_seen": 56645208, + "step": 97635 + }, + { + "epoch": 14.542746499851058, + "grad_norm": 0.005204117391258478, + "learning_rate": 1.0508733536235307e-05, + "loss": 0.0, + "num_input_tokens_seen": 56648024, + "step": 97640 + }, + { + "epoch": 14.543491212392016, + "grad_norm": 0.0006341403350234032, + "learning_rate": 1.0506085818545582e-05, + "loss": 0.0049, + "num_input_tokens_seen": 56650584, + "step": 97645 + }, + { + "epoch": 14.544235924932975, + "grad_norm": 0.0005588832427747548, + "learning_rate": 1.0503438345715798e-05, + "loss": 0.0, + "num_input_tokens_seen": 56653400, + "step": 97650 + }, + { + "epoch": 14.544980637473936, + "grad_norm": 0.004084722138941288, + "learning_rate": 1.0500791117790699e-05, + "loss": 0.0, + "num_input_tokens_seen": 56656056, + "step": 97655 + }, + { + "epoch": 14.545725350014894, + "grad_norm": 0.34299421310424805, + "learning_rate": 1.0498144134814996e-05, + "loss": 0.0, + "num_input_tokens_seen": 56659064, + "step": 97660 + }, + { + "epoch": 14.546470062555853, + "grad_norm": 2.3833272280171514e-05, + "learning_rate": 1.04954973968334e-05, + "loss": 0.0, + "num_input_tokens_seen": 56661880, + "step": 97665 + }, + { + "epoch": 14.547214775096812, + "grad_norm": 0.017300963401794434, + "learning_rate": 1.0492850903890644e-05, + "loss": 0.0, + "num_input_tokens_seen": 56664664, + "step": 97670 + }, + { + "epoch": 14.547959487637772, + "grad_norm": 9.07898138393648e-05, + "learning_rate": 1.0490204656031427e-05, + "loss": 0.0024, + "num_input_tokens_seen": 56667928, + "step": 97675 + }, + { + "epoch": 14.548704200178731, + "grad_norm": 0.0001684085582382977, + "learning_rate": 1.0487558653300455e-05, + "loss": 0.0, + "num_input_tokens_seen": 56670808, + "step": 97680 + }, + { + "epoch": 14.54944891271969, + "grad_norm": 0.00024152836704161018, + "learning_rate": 1.0484912895742422e-05, + "loss": 0.0, + "num_input_tokens_seen": 56673592, + "step": 97685 + }, + { + "epoch": 14.550193625260649, + "grad_norm": 0.0006522748153656721, + "learning_rate": 1.0482267383402041e-05, + "loss": 0.0, + "num_input_tokens_seen": 56676504, + "step": 97690 + }, + { + "epoch": 14.55093833780161, + "grad_norm": 0.00019286129099782556, + "learning_rate": 1.0479622116323997e-05, + "loss": 0.0, + "num_input_tokens_seen": 56679320, + "step": 97695 + }, + { + "epoch": 14.551683050342568, + "grad_norm": 0.003797098295763135, + "learning_rate": 1.047697709455297e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56682168, + "step": 97700 + }, + { + "epoch": 14.552427762883527, + "grad_norm": 0.00014371340512298048, + "learning_rate": 1.0474332318133664e-05, + "loss": 0.0, + "num_input_tokens_seen": 56684984, + "step": 97705 + }, + { + "epoch": 14.553172475424486, + "grad_norm": 28.773752212524414, + "learning_rate": 1.0471687787110743e-05, + "loss": 0.0098, + "num_input_tokens_seen": 56687896, + "step": 97710 + }, + { + "epoch": 14.553917187965446, + "grad_norm": 0.0003879367432091385, + "learning_rate": 1.04690435015289e-05, + "loss": 0.0, + "num_input_tokens_seen": 56690808, + "step": 97715 + }, + { + "epoch": 14.554661900506405, + "grad_norm": 9.312112524639815e-06, + "learning_rate": 1.0466399461432785e-05, + "loss": 0.0014, + "num_input_tokens_seen": 56693624, + "step": 97720 + }, + { + "epoch": 14.555406613047364, + "grad_norm": 0.003546071471646428, + "learning_rate": 1.0463755666867093e-05, + "loss": 0.0, + "num_input_tokens_seen": 56696568, + "step": 97725 + }, + { + "epoch": 14.556151325588322, + "grad_norm": 0.001587101723998785, + "learning_rate": 1.0461112117876464e-05, + "loss": 0.0051, + "num_input_tokens_seen": 56699352, + "step": 97730 + }, + { + "epoch": 14.556896038129283, + "grad_norm": 0.00013561677769757807, + "learning_rate": 1.0458468814505578e-05, + "loss": 0.0097, + "num_input_tokens_seen": 56702232, + "step": 97735 + }, + { + "epoch": 14.557640750670242, + "grad_norm": 8.65380497998558e-05, + "learning_rate": 1.0455825756799084e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56704984, + "step": 97740 + }, + { + "epoch": 14.5583854632112, + "grad_norm": 4.241399801685475e-05, + "learning_rate": 1.0453182944801631e-05, + "loss": 0.0, + "num_input_tokens_seen": 56707864, + "step": 97745 + }, + { + "epoch": 14.55913017575216, + "grad_norm": 0.00033734863973222673, + "learning_rate": 1.045054037855787e-05, + "loss": 0.0429, + "num_input_tokens_seen": 56711032, + "step": 97750 + }, + { + "epoch": 14.55987488829312, + "grad_norm": 0.0003188200353179127, + "learning_rate": 1.0447898058112427e-05, + "loss": 0.0, + "num_input_tokens_seen": 56713592, + "step": 97755 + }, + { + "epoch": 14.560619600834078, + "grad_norm": 0.0007034618174657226, + "learning_rate": 1.0445255983509969e-05, + "loss": 0.0, + "num_input_tokens_seen": 56716984, + "step": 97760 + }, + { + "epoch": 14.561364313375037, + "grad_norm": 0.0010820470051839948, + "learning_rate": 1.0442614154795107e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56719736, + "step": 97765 + }, + { + "epoch": 14.562109025915996, + "grad_norm": 0.00011427600111346692, + "learning_rate": 1.0439972572012496e-05, + "loss": 0.0, + "num_input_tokens_seen": 56722712, + "step": 97770 + }, + { + "epoch": 14.562853738456955, + "grad_norm": 18.12849998474121, + "learning_rate": 1.0437331235206737e-05, + "loss": 0.0753, + "num_input_tokens_seen": 56725560, + "step": 97775 + }, + { + "epoch": 14.563598450997915, + "grad_norm": 0.07321394234895706, + "learning_rate": 1.043469014442248e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56728600, + "step": 97780 + }, + { + "epoch": 14.564343163538874, + "grad_norm": 128.20223999023438, + "learning_rate": 1.0432049299704324e-05, + "loss": 0.1594, + "num_input_tokens_seen": 56731224, + "step": 97785 + }, + { + "epoch": 14.565087876079833, + "grad_norm": 0.005220710299909115, + "learning_rate": 1.0429408701096884e-05, + "loss": 0.0, + "num_input_tokens_seen": 56734040, + "step": 97790 + }, + { + "epoch": 14.565832588620792, + "grad_norm": 8.52816883707419e-05, + "learning_rate": 1.0426768348644782e-05, + "loss": 0.0, + "num_input_tokens_seen": 56736952, + "step": 97795 + }, + { + "epoch": 14.566577301161752, + "grad_norm": 0.0007734220125712454, + "learning_rate": 1.042412824239261e-05, + "loss": 0.0, + "num_input_tokens_seen": 56739928, + "step": 97800 + }, + { + "epoch": 14.56732201370271, + "grad_norm": 0.0001291215739911422, + "learning_rate": 1.0421488382384986e-05, + "loss": 0.0733, + "num_input_tokens_seen": 56742584, + "step": 97805 + }, + { + "epoch": 14.56806672624367, + "grad_norm": 0.00013272451178636402, + "learning_rate": 1.0418848768666498e-05, + "loss": 0.0, + "num_input_tokens_seen": 56745592, + "step": 97810 + }, + { + "epoch": 14.568811438784628, + "grad_norm": 7.294499664567411e-05, + "learning_rate": 1.041620940128174e-05, + "loss": 0.0, + "num_input_tokens_seen": 56748280, + "step": 97815 + }, + { + "epoch": 14.569556151325589, + "grad_norm": 0.0025013939011842012, + "learning_rate": 1.04135702802753e-05, + "loss": 0.0, + "num_input_tokens_seen": 56751000, + "step": 97820 + }, + { + "epoch": 14.570300863866548, + "grad_norm": 2.2363401512848213e-05, + "learning_rate": 1.041093140569176e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56753688, + "step": 97825 + }, + { + "epoch": 14.571045576407506, + "grad_norm": 1.5256361621140968e-05, + "learning_rate": 1.0408292777575712e-05, + "loss": 0.0, + "num_input_tokens_seen": 56756600, + "step": 97830 + }, + { + "epoch": 14.571790288948465, + "grad_norm": 0.00011615278344834223, + "learning_rate": 1.040565439597172e-05, + "loss": 0.0071, + "num_input_tokens_seen": 56759640, + "step": 97835 + }, + { + "epoch": 14.572535001489426, + "grad_norm": 0.0012348233722150326, + "learning_rate": 1.0403016260924373e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56762424, + "step": 97840 + }, + { + "epoch": 14.573279714030384, + "grad_norm": 0.0034929756075143814, + "learning_rate": 1.040037837247822e-05, + "loss": 0.0, + "num_input_tokens_seen": 56765496, + "step": 97845 + }, + { + "epoch": 14.574024426571343, + "grad_norm": 0.00015836887178011239, + "learning_rate": 1.0397740730677845e-05, + "loss": 0.0016, + "num_input_tokens_seen": 56768440, + "step": 97850 + }, + { + "epoch": 14.574769139112302, + "grad_norm": 0.004133644979447126, + "learning_rate": 1.0395103335567794e-05, + "loss": 0.0, + "num_input_tokens_seen": 56771448, + "step": 97855 + }, + { + "epoch": 14.575513851653263, + "grad_norm": 0.0033480452839285135, + "learning_rate": 1.0392466187192634e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56774456, + "step": 97860 + }, + { + "epoch": 14.576258564194221, + "grad_norm": 0.007394765038043261, + "learning_rate": 1.0389829285596914e-05, + "loss": 0.0, + "num_input_tokens_seen": 56777304, + "step": 97865 + }, + { + "epoch": 14.57700327673518, + "grad_norm": 0.0003721936955116689, + "learning_rate": 1.0387192630825168e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56780248, + "step": 97870 + }, + { + "epoch": 14.577747989276139, + "grad_norm": 0.00011055687355110422, + "learning_rate": 1.038455622292196e-05, + "loss": 0.0, + "num_input_tokens_seen": 56783000, + "step": 97875 + }, + { + "epoch": 14.5784927018171, + "grad_norm": 5.751637218054384e-05, + "learning_rate": 1.0381920061931818e-05, + "loss": 0.0, + "num_input_tokens_seen": 56786008, + "step": 97880 + }, + { + "epoch": 14.579237414358058, + "grad_norm": 0.00012100161256967112, + "learning_rate": 1.0379284147899281e-05, + "loss": 0.0, + "num_input_tokens_seen": 56789112, + "step": 97885 + }, + { + "epoch": 14.579982126899017, + "grad_norm": 5.92595424677711e-05, + "learning_rate": 1.0376648480868872e-05, + "loss": 0.0852, + "num_input_tokens_seen": 56791960, + "step": 97890 + }, + { + "epoch": 14.580726839439976, + "grad_norm": 2.4264712919830345e-05, + "learning_rate": 1.0374013060885133e-05, + "loss": 0.0, + "num_input_tokens_seen": 56795192, + "step": 97895 + }, + { + "epoch": 14.581471551980936, + "grad_norm": 0.0037154625169932842, + "learning_rate": 1.0371377887992575e-05, + "loss": 0.0, + "num_input_tokens_seen": 56798072, + "step": 97900 + }, + { + "epoch": 14.582216264521895, + "grad_norm": 0.003568165237084031, + "learning_rate": 1.036874296223571e-05, + "loss": 0.0, + "num_input_tokens_seen": 56800760, + "step": 97905 + }, + { + "epoch": 14.582960977062854, + "grad_norm": 0.00014669187658000737, + "learning_rate": 1.0366108283659077e-05, + "loss": 0.0, + "num_input_tokens_seen": 56803544, + "step": 97910 + }, + { + "epoch": 14.583705689603812, + "grad_norm": 2.195478737121448e-05, + "learning_rate": 1.0363473852307157e-05, + "loss": 0.0, + "num_input_tokens_seen": 56806168, + "step": 97915 + }, + { + "epoch": 14.584450402144771, + "grad_norm": 3.611548527260311e-05, + "learning_rate": 1.0360839668224481e-05, + "loss": 0.2252, + "num_input_tokens_seen": 56808920, + "step": 97920 + }, + { + "epoch": 14.585195114685732, + "grad_norm": 4.158838783041574e-05, + "learning_rate": 1.0358205731455531e-05, + "loss": 0.2438, + "num_input_tokens_seen": 56811736, + "step": 97925 + }, + { + "epoch": 14.58593982722669, + "grad_norm": 0.00019160445663146675, + "learning_rate": 1.0355572042044823e-05, + "loss": 0.0, + "num_input_tokens_seen": 56814648, + "step": 97930 + }, + { + "epoch": 14.58668453976765, + "grad_norm": 0.0009082689066417515, + "learning_rate": 1.0352938600036843e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56817592, + "step": 97935 + }, + { + "epoch": 14.58742925230861, + "grad_norm": 3.205991015420295e-05, + "learning_rate": 1.0350305405476076e-05, + "loss": 0.1346, + "num_input_tokens_seen": 56820504, + "step": 97940 + }, + { + "epoch": 14.588173964849569, + "grad_norm": 0.000941678648814559, + "learning_rate": 1.0347672458407012e-05, + "loss": 0.0, + "num_input_tokens_seen": 56823512, + "step": 97945 + }, + { + "epoch": 14.588918677390527, + "grad_norm": 0.001119564170949161, + "learning_rate": 1.034503975887412e-05, + "loss": 0.0, + "num_input_tokens_seen": 56826648, + "step": 97950 + }, + { + "epoch": 14.589663389931486, + "grad_norm": 0.004476461559534073, + "learning_rate": 1.0342407306921894e-05, + "loss": 0.0, + "num_input_tokens_seen": 56829688, + "step": 97955 + }, + { + "epoch": 14.590408102472445, + "grad_norm": 1.3609836059913505e-05, + "learning_rate": 1.0339775102594793e-05, + "loss": 0.0048, + "num_input_tokens_seen": 56832536, + "step": 97960 + }, + { + "epoch": 14.591152815013405, + "grad_norm": 0.0005432376055978239, + "learning_rate": 1.0337143145937301e-05, + "loss": 0.0025, + "num_input_tokens_seen": 56835672, + "step": 97965 + }, + { + "epoch": 14.591897527554364, + "grad_norm": 9.55202485783957e-05, + "learning_rate": 1.0334511436993863e-05, + "loss": 0.0, + "num_input_tokens_seen": 56838712, + "step": 97970 + }, + { + "epoch": 14.592642240095323, + "grad_norm": 5.100240741739981e-05, + "learning_rate": 1.0331879975808956e-05, + "loss": 0.0, + "num_input_tokens_seen": 56841976, + "step": 97975 + }, + { + "epoch": 14.593386952636282, + "grad_norm": 0.00359134073369205, + "learning_rate": 1.032924876242703e-05, + "loss": 0.0426, + "num_input_tokens_seen": 56844952, + "step": 97980 + }, + { + "epoch": 14.594131665177242, + "grad_norm": 0.0031035759020596743, + "learning_rate": 1.0326617796892527e-05, + "loss": 0.0, + "num_input_tokens_seen": 56847800, + "step": 97985 + }, + { + "epoch": 14.594876377718201, + "grad_norm": 0.00027017397223971784, + "learning_rate": 1.0323987079249911e-05, + "loss": 0.0, + "num_input_tokens_seen": 56850680, + "step": 97990 + }, + { + "epoch": 14.59562109025916, + "grad_norm": 9.260809747502208e-05, + "learning_rate": 1.0321356609543608e-05, + "loss": 0.0, + "num_input_tokens_seen": 56853720, + "step": 97995 + }, + { + "epoch": 14.596365802800118, + "grad_norm": 0.001418125000782311, + "learning_rate": 1.0318726387818078e-05, + "loss": 0.0, + "num_input_tokens_seen": 56856728, + "step": 98000 + }, + { + "epoch": 14.597110515341079, + "grad_norm": 3.136045052087866e-05, + "learning_rate": 1.0316096414117744e-05, + "loss": 0.0, + "num_input_tokens_seen": 56859512, + "step": 98005 + }, + { + "epoch": 14.597855227882038, + "grad_norm": 1.7129945263150148e-05, + "learning_rate": 1.0313466688487034e-05, + "loss": 0.0, + "num_input_tokens_seen": 56862168, + "step": 98010 + }, + { + "epoch": 14.598599940422996, + "grad_norm": 0.0006687802379019558, + "learning_rate": 1.031083721097037e-05, + "loss": 0.0, + "num_input_tokens_seen": 56864888, + "step": 98015 + }, + { + "epoch": 14.599344652963955, + "grad_norm": 0.0008715165895409882, + "learning_rate": 1.0308207981612191e-05, + "loss": 0.0, + "num_input_tokens_seen": 56867608, + "step": 98020 + }, + { + "epoch": 14.600089365504916, + "grad_norm": 2.5266555894631892e-05, + "learning_rate": 1.0305579000456907e-05, + "loss": 0.04, + "num_input_tokens_seen": 56870744, + "step": 98025 + }, + { + "epoch": 14.600834078045875, + "grad_norm": 6.519490852952003e-05, + "learning_rate": 1.0302950267548922e-05, + "loss": 0.0, + "num_input_tokens_seen": 56873496, + "step": 98030 + }, + { + "epoch": 14.601578790586833, + "grad_norm": 0.000152915992657654, + "learning_rate": 1.0300321782932663e-05, + "loss": 0.0, + "num_input_tokens_seen": 56876376, + "step": 98035 + }, + { + "epoch": 14.602323503127792, + "grad_norm": 0.0037229834124445915, + "learning_rate": 1.0297693546652518e-05, + "loss": 0.119, + "num_input_tokens_seen": 56879416, + "step": 98040 + }, + { + "epoch": 14.603068215668753, + "grad_norm": 0.0004197224916424602, + "learning_rate": 1.0295065558752905e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56882328, + "step": 98045 + }, + { + "epoch": 14.603812928209711, + "grad_norm": 5.319570118444972e-05, + "learning_rate": 1.0292437819278208e-05, + "loss": 0.0, + "num_input_tokens_seen": 56885112, + "step": 98050 + }, + { + "epoch": 14.60455764075067, + "grad_norm": 1.3908466826251242e-05, + "learning_rate": 1.0289810328272836e-05, + "loss": 0.0, + "num_input_tokens_seen": 56887640, + "step": 98055 + }, + { + "epoch": 14.605302353291629, + "grad_norm": 0.35519325733184814, + "learning_rate": 1.0287183085781165e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56890488, + "step": 98060 + }, + { + "epoch": 14.60604706583259, + "grad_norm": 4.7515339247183874e-05, + "learning_rate": 1.0284556091847575e-05, + "loss": 0.0, + "num_input_tokens_seen": 56893336, + "step": 98065 + }, + { + "epoch": 14.606791778373548, + "grad_norm": 0.00112597004044801, + "learning_rate": 1.028192934651646e-05, + "loss": 0.0, + "num_input_tokens_seen": 56896504, + "step": 98070 + }, + { + "epoch": 14.607536490914507, + "grad_norm": 0.0061631337739527225, + "learning_rate": 1.0279302849832192e-05, + "loss": 0.2522, + "num_input_tokens_seen": 56899320, + "step": 98075 + }, + { + "epoch": 14.608281203455466, + "grad_norm": 0.01716005429625511, + "learning_rate": 1.027667660183914e-05, + "loss": 0.0, + "num_input_tokens_seen": 56902040, + "step": 98080 + }, + { + "epoch": 14.609025915996426, + "grad_norm": 0.004973652772605419, + "learning_rate": 1.0274050602581667e-05, + "loss": 0.0, + "num_input_tokens_seen": 56905400, + "step": 98085 + }, + { + "epoch": 14.609770628537385, + "grad_norm": 0.007520140614360571, + "learning_rate": 1.0271424852104147e-05, + "loss": 0.0, + "num_input_tokens_seen": 56908408, + "step": 98090 + }, + { + "epoch": 14.610515341078344, + "grad_norm": 0.0017170760547742248, + "learning_rate": 1.0268799350450928e-05, + "loss": 0.0, + "num_input_tokens_seen": 56911288, + "step": 98095 + }, + { + "epoch": 14.611260053619302, + "grad_norm": 0.0005387964774854481, + "learning_rate": 1.026617409766638e-05, + "loss": 0.0, + "num_input_tokens_seen": 56914232, + "step": 98100 + }, + { + "epoch": 14.612004766160261, + "grad_norm": 3.535593350534327e-05, + "learning_rate": 1.0263549093794847e-05, + "loss": 0.0, + "num_input_tokens_seen": 56916888, + "step": 98105 + }, + { + "epoch": 14.612749478701222, + "grad_norm": 3.4683289527893066, + "learning_rate": 1.0260924338880665e-05, + "loss": 0.0014, + "num_input_tokens_seen": 56920056, + "step": 98110 + }, + { + "epoch": 14.61349419124218, + "grad_norm": 0.0005829876754432917, + "learning_rate": 1.0258299832968196e-05, + "loss": 0.0001, + "num_input_tokens_seen": 56923192, + "step": 98115 + }, + { + "epoch": 14.61423890378314, + "grad_norm": 0.00044877786422148347, + "learning_rate": 1.0255675576101759e-05, + "loss": 0.0002, + "num_input_tokens_seen": 56926232, + "step": 98120 + }, + { + "epoch": 14.6149836163241, + "grad_norm": 0.00018392354832030833, + "learning_rate": 1.0253051568325705e-05, + "loss": 0.0, + "num_input_tokens_seen": 56929112, + "step": 98125 + }, + { + "epoch": 14.615728328865059, + "grad_norm": 0.000808656623121351, + "learning_rate": 1.0250427809684349e-05, + "loss": 0.0013, + "num_input_tokens_seen": 56931928, + "step": 98130 + }, + { + "epoch": 14.616473041406017, + "grad_norm": 0.00030259593040682375, + "learning_rate": 1.0247804300222034e-05, + "loss": 0.0, + "num_input_tokens_seen": 56934776, + "step": 98135 + }, + { + "epoch": 14.617217753946976, + "grad_norm": 0.00015813237405382097, + "learning_rate": 1.0245181039983068e-05, + "loss": 0.0, + "num_input_tokens_seen": 56937528, + "step": 98140 + }, + { + "epoch": 14.617962466487935, + "grad_norm": 9.61237856245134e-06, + "learning_rate": 1.0242558029011776e-05, + "loss": 0.0, + "num_input_tokens_seen": 56940184, + "step": 98145 + }, + { + "epoch": 14.618707179028895, + "grad_norm": 0.00025760781136341393, + "learning_rate": 1.0239935267352466e-05, + "loss": 0.0, + "num_input_tokens_seen": 56943032, + "step": 98150 + }, + { + "epoch": 14.619451891569854, + "grad_norm": 0.00039168779039755464, + "learning_rate": 1.023731275504944e-05, + "loss": 0.0, + "num_input_tokens_seen": 56946040, + "step": 98155 + }, + { + "epoch": 14.620196604110813, + "grad_norm": 0.0013047513784840703, + "learning_rate": 1.023469049214702e-05, + "loss": 0.0, + "num_input_tokens_seen": 56948856, + "step": 98160 + }, + { + "epoch": 14.620941316651772, + "grad_norm": 0.00028324127197265625, + "learning_rate": 1.0232068478689488e-05, + "loss": 0.0, + "num_input_tokens_seen": 56951992, + "step": 98165 + }, + { + "epoch": 14.621686029192732, + "grad_norm": 1.7017835489241406e-05, + "learning_rate": 1.0229446714721158e-05, + "loss": 0.0733, + "num_input_tokens_seen": 56954872, + "step": 98170 + }, + { + "epoch": 14.622430741733691, + "grad_norm": 7.424295472446829e-05, + "learning_rate": 1.0226825200286306e-05, + "loss": 0.0, + "num_input_tokens_seen": 56957496, + "step": 98175 + }, + { + "epoch": 14.62317545427465, + "grad_norm": 0.001071388483978808, + "learning_rate": 1.0224203935429235e-05, + "loss": 0.0, + "num_input_tokens_seen": 56960408, + "step": 98180 + }, + { + "epoch": 14.623920166815608, + "grad_norm": 0.000254948710789904, + "learning_rate": 1.0221582920194223e-05, + "loss": 0.0, + "num_input_tokens_seen": 56963288, + "step": 98185 + }, + { + "epoch": 14.624664879356569, + "grad_norm": 38.38688659667969, + "learning_rate": 1.0218962154625535e-05, + "loss": 0.0883, + "num_input_tokens_seen": 56966360, + "step": 98190 + }, + { + "epoch": 14.625409591897528, + "grad_norm": 0.0010510667925700545, + "learning_rate": 1.021634163876747e-05, + "loss": 0.0, + "num_input_tokens_seen": 56969208, + "step": 98195 + }, + { + "epoch": 14.626154304438487, + "grad_norm": 0.015447823330760002, + "learning_rate": 1.0213721372664279e-05, + "loss": 0.0, + "num_input_tokens_seen": 56972152, + "step": 98200 + }, + { + "epoch": 14.626899016979445, + "grad_norm": 213.19300842285156, + "learning_rate": 1.0211101356360245e-05, + "loss": 0.1906, + "num_input_tokens_seen": 56974872, + "step": 98205 + }, + { + "epoch": 14.627643729520406, + "grad_norm": 0.0004774483386427164, + "learning_rate": 1.0208481589899623e-05, + "loss": 0.0, + "num_input_tokens_seen": 56977816, + "step": 98210 + }, + { + "epoch": 14.628388442061365, + "grad_norm": 0.0006987348315306008, + "learning_rate": 1.0205862073326673e-05, + "loss": 0.0, + "num_input_tokens_seen": 56980568, + "step": 98215 + }, + { + "epoch": 14.629133154602323, + "grad_norm": 0.000321605330100283, + "learning_rate": 1.0203242806685645e-05, + "loss": 0.0, + "num_input_tokens_seen": 56983256, + "step": 98220 + }, + { + "epoch": 14.629877867143282, + "grad_norm": 0.002061152597889304, + "learning_rate": 1.0200623790020782e-05, + "loss": 0.0, + "num_input_tokens_seen": 56986392, + "step": 98225 + }, + { + "epoch": 14.630622579684243, + "grad_norm": 4.1542774852132425e-05, + "learning_rate": 1.0198005023376347e-05, + "loss": 0.0, + "num_input_tokens_seen": 56989176, + "step": 98230 + }, + { + "epoch": 14.631367292225201, + "grad_norm": 0.009869691915810108, + "learning_rate": 1.0195386506796567e-05, + "loss": 0.0088, + "num_input_tokens_seen": 56991992, + "step": 98235 + }, + { + "epoch": 14.63211200476616, + "grad_norm": 0.00014747878594789654, + "learning_rate": 1.0192768240325693e-05, + "loss": 0.0, + "num_input_tokens_seen": 56994840, + "step": 98240 + }, + { + "epoch": 14.632856717307119, + "grad_norm": 0.004405087325721979, + "learning_rate": 1.019015022400794e-05, + "loss": 0.0071, + "num_input_tokens_seen": 56997688, + "step": 98245 + }, + { + "epoch": 14.63360142984808, + "grad_norm": 0.0001298283168580383, + "learning_rate": 1.0187532457887557e-05, + "loss": 0.0362, + "num_input_tokens_seen": 57000536, + "step": 98250 + }, + { + "epoch": 14.634346142389038, + "grad_norm": 0.0007374947890639305, + "learning_rate": 1.018491494200875e-05, + "loss": 0.0, + "num_input_tokens_seen": 57003128, + "step": 98255 + }, + { + "epoch": 14.635090854929997, + "grad_norm": 0.005125587806105614, + "learning_rate": 1.0182297676415755e-05, + "loss": 0.0, + "num_input_tokens_seen": 57006232, + "step": 98260 + }, + { + "epoch": 14.635835567470956, + "grad_norm": 0.0021760829258710146, + "learning_rate": 1.0179680661152782e-05, + "loss": 0.0, + "num_input_tokens_seen": 57008856, + "step": 98265 + }, + { + "epoch": 14.636580280011916, + "grad_norm": 0.00802637729793787, + "learning_rate": 1.0177063896264042e-05, + "loss": 0.0, + "num_input_tokens_seen": 57011512, + "step": 98270 + }, + { + "epoch": 14.637324992552875, + "grad_norm": 0.00010233745706500486, + "learning_rate": 1.0174447381793739e-05, + "loss": 0.0, + "num_input_tokens_seen": 57014264, + "step": 98275 + }, + { + "epoch": 14.638069705093834, + "grad_norm": 3.817649121629074e-05, + "learning_rate": 1.0171831117786074e-05, + "loss": 0.0, + "num_input_tokens_seen": 57017048, + "step": 98280 + }, + { + "epoch": 14.638814417634793, + "grad_norm": 0.0005212044343352318, + "learning_rate": 1.016921510428526e-05, + "loss": 0.1082, + "num_input_tokens_seen": 57019736, + "step": 98285 + }, + { + "epoch": 14.639559130175751, + "grad_norm": 20.87258529663086, + "learning_rate": 1.0166599341335473e-05, + "loss": 0.0772, + "num_input_tokens_seen": 57022392, + "step": 98290 + }, + { + "epoch": 14.640303842716712, + "grad_norm": 0.00019863533088937402, + "learning_rate": 1.0163983828980922e-05, + "loss": 0.0, + "num_input_tokens_seen": 57025240, + "step": 98295 + }, + { + "epoch": 14.64104855525767, + "grad_norm": 0.0008558606496080756, + "learning_rate": 1.016136856726579e-05, + "loss": 0.1534, + "num_input_tokens_seen": 57028376, + "step": 98300 + }, + { + "epoch": 14.64179326779863, + "grad_norm": 0.001994734862819314, + "learning_rate": 1.015875355623424e-05, + "loss": 0.0001, + "num_input_tokens_seen": 57031128, + "step": 98305 + }, + { + "epoch": 14.642537980339588, + "grad_norm": 0.0027698318008333445, + "learning_rate": 1.0156138795930479e-05, + "loss": 0.0, + "num_input_tokens_seen": 57033976, + "step": 98310 + }, + { + "epoch": 14.643282692880549, + "grad_norm": 0.00010959306382574141, + "learning_rate": 1.0153524286398656e-05, + "loss": 0.0, + "num_input_tokens_seen": 57036728, + "step": 98315 + }, + { + "epoch": 14.644027405421507, + "grad_norm": 0.0028593491297215223, + "learning_rate": 1.0150910027682958e-05, + "loss": 0.0, + "num_input_tokens_seen": 57039448, + "step": 98320 + }, + { + "epoch": 14.644772117962466, + "grad_norm": 5.437785875983536e-05, + "learning_rate": 1.0148296019827535e-05, + "loss": 0.0, + "num_input_tokens_seen": 57042520, + "step": 98325 + }, + { + "epoch": 14.645516830503425, + "grad_norm": 0.0012292497558519244, + "learning_rate": 1.0145682262876566e-05, + "loss": 0.0, + "num_input_tokens_seen": 57045592, + "step": 98330 + }, + { + "epoch": 14.646261543044385, + "grad_norm": 0.0004583481349982321, + "learning_rate": 1.0143068756874197e-05, + "loss": 0.0, + "num_input_tokens_seen": 57048280, + "step": 98335 + }, + { + "epoch": 14.647006255585344, + "grad_norm": 2.6441792215337045e-05, + "learning_rate": 1.0140455501864583e-05, + "loss": 0.0, + "num_input_tokens_seen": 57051192, + "step": 98340 + }, + { + "epoch": 14.647750968126303, + "grad_norm": 2.8117756301071495e-06, + "learning_rate": 1.013784249789187e-05, + "loss": 0.0005, + "num_input_tokens_seen": 57054072, + "step": 98345 + }, + { + "epoch": 14.648495680667262, + "grad_norm": 0.0005474170902743936, + "learning_rate": 1.013522974500019e-05, + "loss": 0.0, + "num_input_tokens_seen": 57057016, + "step": 98350 + }, + { + "epoch": 14.649240393208222, + "grad_norm": 0.00033016130328178406, + "learning_rate": 1.013261724323371e-05, + "loss": 0.0, + "num_input_tokens_seen": 57059896, + "step": 98355 + }, + { + "epoch": 14.649985105749181, + "grad_norm": 0.010732071474194527, + "learning_rate": 1.0130004992636541e-05, + "loss": 0.0002, + "num_input_tokens_seen": 57063032, + "step": 98360 + }, + { + "epoch": 14.65072981829014, + "grad_norm": 0.0003523805062286556, + "learning_rate": 1.0127392993252832e-05, + "loss": 0.0, + "num_input_tokens_seen": 57065816, + "step": 98365 + }, + { + "epoch": 14.651474530831099, + "grad_norm": 0.013424578122794628, + "learning_rate": 1.0124781245126695e-05, + "loss": 0.0, + "num_input_tokens_seen": 57068728, + "step": 98370 + }, + { + "epoch": 14.652219243372059, + "grad_norm": 0.028104525059461594, + "learning_rate": 1.0122169748302265e-05, + "loss": 0.0, + "num_input_tokens_seen": 57071768, + "step": 98375 + }, + { + "epoch": 14.652963955913018, + "grad_norm": 5.19228087796364e-05, + "learning_rate": 1.011955850282365e-05, + "loss": 0.2594, + "num_input_tokens_seen": 57075288, + "step": 98380 + }, + { + "epoch": 14.653708668453977, + "grad_norm": 0.00024509523063898087, + "learning_rate": 1.0116947508734981e-05, + "loss": 0.0003, + "num_input_tokens_seen": 57078104, + "step": 98385 + }, + { + "epoch": 14.654453380994935, + "grad_norm": 0.004352022893726826, + "learning_rate": 1.0114336766080356e-05, + "loss": 0.1162, + "num_input_tokens_seen": 57081080, + "step": 98390 + }, + { + "epoch": 14.655198093535896, + "grad_norm": 0.00042365031549707055, + "learning_rate": 1.0111726274903873e-05, + "loss": 0.004, + "num_input_tokens_seen": 57084376, + "step": 98395 + }, + { + "epoch": 14.655942806076855, + "grad_norm": 0.0036186925135552883, + "learning_rate": 1.0109116035249652e-05, + "loss": 0.0409, + "num_input_tokens_seen": 57087320, + "step": 98400 + }, + { + "epoch": 14.656687518617813, + "grad_norm": 0.011514803394675255, + "learning_rate": 1.0106506047161782e-05, + "loss": 0.0, + "num_input_tokens_seen": 57090232, + "step": 98405 + }, + { + "epoch": 14.657432231158772, + "grad_norm": 0.0018559423042461276, + "learning_rate": 1.0103896310684356e-05, + "loss": 0.0, + "num_input_tokens_seen": 57092824, + "step": 98410 + }, + { + "epoch": 14.658176943699733, + "grad_norm": 0.00023490622697863728, + "learning_rate": 1.010128682586145e-05, + "loss": 0.0, + "num_input_tokens_seen": 57095512, + "step": 98415 + }, + { + "epoch": 14.658921656240691, + "grad_norm": 0.0002895555517170578, + "learning_rate": 1.009867759273717e-05, + "loss": 0.0, + "num_input_tokens_seen": 57098264, + "step": 98420 + }, + { + "epoch": 14.65966636878165, + "grad_norm": 0.00026211474323645234, + "learning_rate": 1.0096068611355588e-05, + "loss": 0.1594, + "num_input_tokens_seen": 57101176, + "step": 98425 + }, + { + "epoch": 14.660411081322609, + "grad_norm": 56.65018844604492, + "learning_rate": 1.0093459881760772e-05, + "loss": 0.1316, + "num_input_tokens_seen": 57104120, + "step": 98430 + }, + { + "epoch": 14.66115579386357, + "grad_norm": 13.519525527954102, + "learning_rate": 1.0090851403996809e-05, + "loss": 0.0269, + "num_input_tokens_seen": 57107000, + "step": 98435 + }, + { + "epoch": 14.661900506404528, + "grad_norm": 0.0001505518303019926, + "learning_rate": 1.0088243178107748e-05, + "loss": 0.0, + "num_input_tokens_seen": 57109816, + "step": 98440 + }, + { + "epoch": 14.662645218945487, + "grad_norm": 0.0233903955668211, + "learning_rate": 1.0085635204137672e-05, + "loss": 0.0, + "num_input_tokens_seen": 57112792, + "step": 98445 + }, + { + "epoch": 14.663389931486446, + "grad_norm": 8.644329071044922, + "learning_rate": 1.0083027482130625e-05, + "loss": 0.0127, + "num_input_tokens_seen": 57115832, + "step": 98450 + }, + { + "epoch": 14.664134644027406, + "grad_norm": 0.0004316543636377901, + "learning_rate": 1.0080420012130673e-05, + "loss": 0.0, + "num_input_tokens_seen": 57118552, + "step": 98455 + }, + { + "epoch": 14.664879356568365, + "grad_norm": 0.00029207661282271147, + "learning_rate": 1.0077812794181854e-05, + "loss": 0.0001, + "num_input_tokens_seen": 57121432, + "step": 98460 + }, + { + "epoch": 14.665624069109324, + "grad_norm": 0.0008163137244991958, + "learning_rate": 1.0075205828328232e-05, + "loss": 0.0, + "num_input_tokens_seen": 57124504, + "step": 98465 + }, + { + "epoch": 14.666368781650283, + "grad_norm": 0.003944847267121077, + "learning_rate": 1.0072599114613837e-05, + "loss": 0.0, + "num_input_tokens_seen": 57127736, + "step": 98470 + }, + { + "epoch": 14.667113494191241, + "grad_norm": 0.0013397593284025788, + "learning_rate": 1.0069992653082707e-05, + "loss": 0.0001, + "num_input_tokens_seen": 57130744, + "step": 98475 + }, + { + "epoch": 14.667858206732202, + "grad_norm": 0.002802556846290827, + "learning_rate": 1.0067386443778879e-05, + "loss": 0.0, + "num_input_tokens_seen": 57133656, + "step": 98480 + }, + { + "epoch": 14.66860291927316, + "grad_norm": 0.011574063450098038, + "learning_rate": 1.006478048674637e-05, + "loss": 0.0001, + "num_input_tokens_seen": 57136472, + "step": 98485 + }, + { + "epoch": 14.66934763181412, + "grad_norm": 0.006850897334516048, + "learning_rate": 1.0062174782029227e-05, + "loss": 0.0, + "num_input_tokens_seen": 57139448, + "step": 98490 + }, + { + "epoch": 14.670092344355078, + "grad_norm": 0.0003234786563552916, + "learning_rate": 1.0059569329671448e-05, + "loss": 0.0, + "num_input_tokens_seen": 57142424, + "step": 98495 + }, + { + "epoch": 14.670837056896039, + "grad_norm": 0.004099717363715172, + "learning_rate": 1.0056964129717067e-05, + "loss": 0.0, + "num_input_tokens_seen": 57145112, + "step": 98500 + }, + { + "epoch": 14.671581769436997, + "grad_norm": 0.01860899291932583, + "learning_rate": 1.0054359182210093e-05, + "loss": 0.0001, + "num_input_tokens_seen": 57147960, + "step": 98505 + }, + { + "epoch": 14.672326481977956, + "grad_norm": 0.00045544229215011, + "learning_rate": 1.005175448719452e-05, + "loss": 0.0, + "num_input_tokens_seen": 57150936, + "step": 98510 + }, + { + "epoch": 14.673071194518915, + "grad_norm": 0.0046632131561636925, + "learning_rate": 1.0049150044714373e-05, + "loss": 0.0, + "num_input_tokens_seen": 57153944, + "step": 98515 + }, + { + "epoch": 14.673815907059875, + "grad_norm": 0.010755394585430622, + "learning_rate": 1.004654585481363e-05, + "loss": 0.0001, + "num_input_tokens_seen": 57156792, + "step": 98520 + }, + { + "epoch": 14.674560619600834, + "grad_norm": 0.0006007669726386666, + "learning_rate": 1.0043941917536303e-05, + "loss": 0.0, + "num_input_tokens_seen": 57159960, + "step": 98525 + }, + { + "epoch": 14.675305332141793, + "grad_norm": 0.0012726732529699802, + "learning_rate": 1.0041338232926373e-05, + "loss": 0.0, + "num_input_tokens_seen": 57162776, + "step": 98530 + }, + { + "epoch": 14.676050044682752, + "grad_norm": 0.00020047990255989134, + "learning_rate": 1.0038734801027836e-05, + "loss": 0.0, + "num_input_tokens_seen": 57165592, + "step": 98535 + }, + { + "epoch": 14.676794757223712, + "grad_norm": 19.711782455444336, + "learning_rate": 1.003613162188467e-05, + "loss": 0.0119, + "num_input_tokens_seen": 57168696, + "step": 98540 + }, + { + "epoch": 14.677539469764671, + "grad_norm": 4.9621856305748224e-05, + "learning_rate": 1.003352869554085e-05, + "loss": 0.0006, + "num_input_tokens_seen": 57171768, + "step": 98545 + }, + { + "epoch": 14.67828418230563, + "grad_norm": 0.0003543386119417846, + "learning_rate": 1.0030926022040355e-05, + "loss": 0.0, + "num_input_tokens_seen": 57174392, + "step": 98550 + }, + { + "epoch": 14.679028894846589, + "grad_norm": 0.0003890478110406548, + "learning_rate": 1.002832360142714e-05, + "loss": 0.0002, + "num_input_tokens_seen": 57177240, + "step": 98555 + }, + { + "epoch": 14.679773607387549, + "grad_norm": 0.0007592646288685501, + "learning_rate": 1.0025721433745188e-05, + "loss": 0.0, + "num_input_tokens_seen": 57179960, + "step": 98560 + }, + { + "epoch": 14.680518319928508, + "grad_norm": 0.005707929376512766, + "learning_rate": 1.0023119519038445e-05, + "loss": 0.0, + "num_input_tokens_seen": 57182616, + "step": 98565 + }, + { + "epoch": 14.681263032469467, + "grad_norm": 0.001188717084005475, + "learning_rate": 1.0020517857350886e-05, + "loss": 0.0005, + "num_input_tokens_seen": 57185368, + "step": 98570 + }, + { + "epoch": 14.682007745010425, + "grad_norm": 0.000169017628650181, + "learning_rate": 1.0017916448726444e-05, + "loss": 0.0, + "num_input_tokens_seen": 57188632, + "step": 98575 + }, + { + "epoch": 14.682752457551386, + "grad_norm": 0.0001679424021858722, + "learning_rate": 1.0015315293209087e-05, + "loss": 0.0, + "num_input_tokens_seen": 57191416, + "step": 98580 + }, + { + "epoch": 14.683497170092345, + "grad_norm": 0.006186625920236111, + "learning_rate": 1.0012714390842748e-05, + "loss": 0.0, + "num_input_tokens_seen": 57194296, + "step": 98585 + }, + { + "epoch": 14.684241882633303, + "grad_norm": 0.008159888908267021, + "learning_rate": 1.0010113741671356e-05, + "loss": 0.0733, + "num_input_tokens_seen": 57197080, + "step": 98590 + }, + { + "epoch": 14.684986595174262, + "grad_norm": 22.16130256652832, + "learning_rate": 1.0007513345738867e-05, + "loss": 0.1102, + "num_input_tokens_seen": 57199672, + "step": 98595 + }, + { + "epoch": 14.685731307715223, + "grad_norm": 0.00035460316576063633, + "learning_rate": 1.0004913203089202e-05, + "loss": 0.0, + "num_input_tokens_seen": 57202552, + "step": 98600 + }, + { + "epoch": 14.686476020256181, + "grad_norm": 0.00017344213847536594, + "learning_rate": 1.000231331376629e-05, + "loss": 0.0, + "num_input_tokens_seen": 57205464, + "step": 98605 + }, + { + "epoch": 14.68722073279714, + "grad_norm": 0.0004195963847450912, + "learning_rate": 9.99971367781404e-06, + "loss": 0.1419, + "num_input_tokens_seen": 57208664, + "step": 98610 + }, + { + "epoch": 14.687965445338099, + "grad_norm": 1.3548727110901382e-05, + "learning_rate": 9.997114295276395e-06, + "loss": 0.0, + "num_input_tokens_seen": 57211608, + "step": 98615 + }, + { + "epoch": 14.688710157879058, + "grad_norm": 6.191474676597863e-05, + "learning_rate": 9.994515166197241e-06, + "loss": 0.0, + "num_input_tokens_seen": 57214456, + "step": 98620 + }, + { + "epoch": 14.689454870420018, + "grad_norm": 0.002015757840126753, + "learning_rate": 9.991916290620515e-06, + "loss": 0.0, + "num_input_tokens_seen": 57217336, + "step": 98625 + }, + { + "epoch": 14.690199582960977, + "grad_norm": 0.00035215707612223923, + "learning_rate": 9.98931766859011e-06, + "loss": 0.0006, + "num_input_tokens_seen": 57220056, + "step": 98630 + }, + { + "epoch": 14.690944295501936, + "grad_norm": 0.0017203794559463859, + "learning_rate": 9.986719300149915e-06, + "loss": 0.0, + "num_input_tokens_seen": 57222808, + "step": 98635 + }, + { + "epoch": 14.691689008042896, + "grad_norm": 0.000568665680475533, + "learning_rate": 9.98412118534385e-06, + "loss": 0.0, + "num_input_tokens_seen": 57225752, + "step": 98640 + }, + { + "epoch": 14.692433720583855, + "grad_norm": 0.0005243535269983113, + "learning_rate": 9.981523324215786e-06, + "loss": 0.0, + "num_input_tokens_seen": 57228632, + "step": 98645 + }, + { + "epoch": 14.693178433124814, + "grad_norm": 0.1736762672662735, + "learning_rate": 9.978925716809631e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57231384, + "step": 98650 + }, + { + "epoch": 14.693923145665773, + "grad_norm": 0.0003583990328479558, + "learning_rate": 9.976328363169252e-06, + "loss": 0.0, + "num_input_tokens_seen": 57234360, + "step": 98655 + }, + { + "epoch": 14.694667858206731, + "grad_norm": 0.002199744340032339, + "learning_rate": 9.973731263338542e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57237144, + "step": 98660 + }, + { + "epoch": 14.695412570747692, + "grad_norm": 0.0002952590584754944, + "learning_rate": 9.971134417361371e-06, + "loss": 0.0, + "num_input_tokens_seen": 57239992, + "step": 98665 + }, + { + "epoch": 14.69615728328865, + "grad_norm": 0.0002120370336342603, + "learning_rate": 9.96853782528161e-06, + "loss": 0.3397, + "num_input_tokens_seen": 57242552, + "step": 98670 + }, + { + "epoch": 14.69690199582961, + "grad_norm": 0.0018519229488447309, + "learning_rate": 9.965941487143123e-06, + "loss": 0.0, + "num_input_tokens_seen": 57245656, + "step": 98675 + }, + { + "epoch": 14.697646708370568, + "grad_norm": 0.000838136300444603, + "learning_rate": 9.963345402989768e-06, + "loss": 0.0, + "num_input_tokens_seen": 57248536, + "step": 98680 + }, + { + "epoch": 14.698391420911529, + "grad_norm": 0.138437882065773, + "learning_rate": 9.96074957286542e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57251256, + "step": 98685 + }, + { + "epoch": 14.699136133452487, + "grad_norm": 0.0057997689582407475, + "learning_rate": 9.958153996813912e-06, + "loss": 0.0, + "num_input_tokens_seen": 57254232, + "step": 98690 + }, + { + "epoch": 14.699880845993446, + "grad_norm": 0.0005639445153065026, + "learning_rate": 9.955558674879115e-06, + "loss": 0.0036, + "num_input_tokens_seen": 57257240, + "step": 98695 + }, + { + "epoch": 14.700625558534405, + "grad_norm": 4.9932441470446065e-06, + "learning_rate": 9.952963607104851e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57260056, + "step": 98700 + }, + { + "epoch": 14.701370271075366, + "grad_norm": 0.0018701816443353891, + "learning_rate": 9.950368793534986e-06, + "loss": 0.0152, + "num_input_tokens_seen": 57263256, + "step": 98705 + }, + { + "epoch": 14.702114983616324, + "grad_norm": 0.00028926655068062246, + "learning_rate": 9.947774234213342e-06, + "loss": 0.0, + "num_input_tokens_seen": 57266008, + "step": 98710 + }, + { + "epoch": 14.702859696157283, + "grad_norm": 0.006233549676835537, + "learning_rate": 9.945179929183749e-06, + "loss": 0.0, + "num_input_tokens_seen": 57269208, + "step": 98715 + }, + { + "epoch": 14.703604408698242, + "grad_norm": 0.015701381489634514, + "learning_rate": 9.942585878490046e-06, + "loss": 0.0, + "num_input_tokens_seen": 57272024, + "step": 98720 + }, + { + "epoch": 14.704349121239202, + "grad_norm": 0.001285187085159123, + "learning_rate": 9.939992082176041e-06, + "loss": 0.0, + "num_input_tokens_seen": 57274808, + "step": 98725 + }, + { + "epoch": 14.705093833780161, + "grad_norm": 0.0390603132545948, + "learning_rate": 9.937398540285575e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57277816, + "step": 98730 + }, + { + "epoch": 14.70583854632112, + "grad_norm": 0.0093954186886549, + "learning_rate": 9.93480525286245e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57280632, + "step": 98735 + }, + { + "epoch": 14.706583258862079, + "grad_norm": 0.00028681251569651067, + "learning_rate": 9.93221221995048e-06, + "loss": 0.0, + "num_input_tokens_seen": 57283544, + "step": 98740 + }, + { + "epoch": 14.70732797140304, + "grad_norm": 0.0006552878767251968, + "learning_rate": 9.929619441593469e-06, + "loss": 0.0, + "num_input_tokens_seen": 57286744, + "step": 98745 + }, + { + "epoch": 14.708072683943998, + "grad_norm": 0.000680445518810302, + "learning_rate": 9.927026917835211e-06, + "loss": 0.0074, + "num_input_tokens_seen": 57289848, + "step": 98750 + }, + { + "epoch": 14.708817396484957, + "grad_norm": 0.0004064162785653025, + "learning_rate": 9.924434648719525e-06, + "loss": 0.0, + "num_input_tokens_seen": 57292888, + "step": 98755 + }, + { + "epoch": 14.709562109025915, + "grad_norm": 0.009968101978302002, + "learning_rate": 9.921842634290182e-06, + "loss": 0.0002, + "num_input_tokens_seen": 57295704, + "step": 98760 + }, + { + "epoch": 14.710306821566876, + "grad_norm": 4.411554982652888e-05, + "learning_rate": 9.919250874590993e-06, + "loss": 0.1534, + "num_input_tokens_seen": 57298712, + "step": 98765 + }, + { + "epoch": 14.711051534107835, + "grad_norm": 0.002880201442167163, + "learning_rate": 9.916659369665726e-06, + "loss": 0.0, + "num_input_tokens_seen": 57301720, + "step": 98770 + }, + { + "epoch": 14.711796246648793, + "grad_norm": 0.0006164383958093822, + "learning_rate": 9.914068119558177e-06, + "loss": 0.0, + "num_input_tokens_seen": 57304760, + "step": 98775 + }, + { + "epoch": 14.712540959189752, + "grad_norm": 0.0002234793792013079, + "learning_rate": 9.911477124312104e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57307256, + "step": 98780 + }, + { + "epoch": 14.713285671730713, + "grad_norm": 0.00014975304657127708, + "learning_rate": 9.9088863839713e-06, + "loss": 0.0, + "num_input_tokens_seen": 57309848, + "step": 98785 + }, + { + "epoch": 14.714030384271672, + "grad_norm": 0.0006121994811110198, + "learning_rate": 9.90629589857952e-06, + "loss": 0.0, + "num_input_tokens_seen": 57313080, + "step": 98790 + }, + { + "epoch": 14.71477509681263, + "grad_norm": 0.001056319335475564, + "learning_rate": 9.903705668180524e-06, + "loss": 0.0, + "num_input_tokens_seen": 57315896, + "step": 98795 + }, + { + "epoch": 14.715519809353589, + "grad_norm": 0.0009308294975198805, + "learning_rate": 9.901115692818085e-06, + "loss": 0.0, + "num_input_tokens_seen": 57318872, + "step": 98800 + }, + { + "epoch": 14.716264521894548, + "grad_norm": 0.5693194270133972, + "learning_rate": 9.898525972535952e-06, + "loss": 0.0005, + "num_input_tokens_seen": 57321912, + "step": 98805 + }, + { + "epoch": 14.717009234435508, + "grad_norm": 0.0004747650818899274, + "learning_rate": 9.895936507377873e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57324824, + "step": 98810 + }, + { + "epoch": 14.717753946976467, + "grad_norm": 0.00017564794688951224, + "learning_rate": 9.89334729738759e-06, + "loss": 0.0, + "num_input_tokens_seen": 57327800, + "step": 98815 + }, + { + "epoch": 14.718498659517426, + "grad_norm": 0.00020341030904091895, + "learning_rate": 9.890758342608856e-06, + "loss": 0.0, + "num_input_tokens_seen": 57330456, + "step": 98820 + }, + { + "epoch": 14.719243372058386, + "grad_norm": 0.0014436430064961314, + "learning_rate": 9.888169643085404e-06, + "loss": 0.0, + "num_input_tokens_seen": 57333464, + "step": 98825 + }, + { + "epoch": 14.719988084599345, + "grad_norm": 0.004971683025360107, + "learning_rate": 9.885581198860958e-06, + "loss": 0.0, + "num_input_tokens_seen": 57336408, + "step": 98830 + }, + { + "epoch": 14.720732797140304, + "grad_norm": 0.0016001129988580942, + "learning_rate": 9.882993009979265e-06, + "loss": 0.0191, + "num_input_tokens_seen": 57339288, + "step": 98835 + }, + { + "epoch": 14.721477509681263, + "grad_norm": 0.0039412397891283035, + "learning_rate": 9.880405076484034e-06, + "loss": 0.0, + "num_input_tokens_seen": 57342200, + "step": 98840 + }, + { + "epoch": 14.722222222222221, + "grad_norm": 0.0005950271734036505, + "learning_rate": 9.877817398418998e-06, + "loss": 0.0, + "num_input_tokens_seen": 57345080, + "step": 98845 + }, + { + "epoch": 14.722966934763182, + "grad_norm": 8.868781151250005e-05, + "learning_rate": 9.87522997582786e-06, + "loss": 0.0, + "num_input_tokens_seen": 57348056, + "step": 98850 + }, + { + "epoch": 14.72371164730414, + "grad_norm": 0.00029893461032770574, + "learning_rate": 9.872642808754348e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57351032, + "step": 98855 + }, + { + "epoch": 14.7244563598451, + "grad_norm": 0.00011450184683781117, + "learning_rate": 9.870055897242152e-06, + "loss": 0.0, + "num_input_tokens_seen": 57354040, + "step": 98860 + }, + { + "epoch": 14.725201072386058, + "grad_norm": 0.000707872852217406, + "learning_rate": 9.867469241334994e-06, + "loss": 0.0002, + "num_input_tokens_seen": 57356952, + "step": 98865 + }, + { + "epoch": 14.725945784927019, + "grad_norm": 0.0009566643857397139, + "learning_rate": 9.864882841076564e-06, + "loss": 0.0, + "num_input_tokens_seen": 57359704, + "step": 98870 + }, + { + "epoch": 14.726690497467978, + "grad_norm": 0.0017100382829084992, + "learning_rate": 9.862296696510557e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57362552, + "step": 98875 + }, + { + "epoch": 14.727435210008936, + "grad_norm": 0.0008250519167631865, + "learning_rate": 9.859710807680658e-06, + "loss": 0.0, + "num_input_tokens_seen": 57365560, + "step": 98880 + }, + { + "epoch": 14.728179922549895, + "grad_norm": 0.0001455152378184721, + "learning_rate": 9.85712517463055e-06, + "loss": 0.0824, + "num_input_tokens_seen": 57368216, + "step": 98885 + }, + { + "epoch": 14.728924635090856, + "grad_norm": 0.000380921148462221, + "learning_rate": 9.85453979740393e-06, + "loss": 0.0, + "num_input_tokens_seen": 57370936, + "step": 98890 + }, + { + "epoch": 14.729669347631814, + "grad_norm": 0.0006769598112441599, + "learning_rate": 9.851954676044458e-06, + "loss": 0.0, + "num_input_tokens_seen": 57373624, + "step": 98895 + }, + { + "epoch": 14.730414060172773, + "grad_norm": 0.0001185321671073325, + "learning_rate": 9.849369810595827e-06, + "loss": 0.0, + "num_input_tokens_seen": 57376664, + "step": 98900 + }, + { + "epoch": 14.731158772713732, + "grad_norm": 4.536772394203581e-05, + "learning_rate": 9.846785201101691e-06, + "loss": 0.0, + "num_input_tokens_seen": 57379416, + "step": 98905 + }, + { + "epoch": 14.731903485254692, + "grad_norm": 29.955219268798828, + "learning_rate": 9.84420084760571e-06, + "loss": 0.0824, + "num_input_tokens_seen": 57382200, + "step": 98910 + }, + { + "epoch": 14.732648197795651, + "grad_norm": 78.49525451660156, + "learning_rate": 9.841616750151565e-06, + "loss": 0.0216, + "num_input_tokens_seen": 57385144, + "step": 98915 + }, + { + "epoch": 14.73339291033661, + "grad_norm": 12.78683853149414, + "learning_rate": 9.839032908782885e-06, + "loss": 0.0657, + "num_input_tokens_seen": 57388184, + "step": 98920 + }, + { + "epoch": 14.734137622877569, + "grad_norm": 0.0004974832409061491, + "learning_rate": 9.836449323543345e-06, + "loss": 0.0, + "num_input_tokens_seen": 57391000, + "step": 98925 + }, + { + "epoch": 14.73488233541853, + "grad_norm": 0.000799503643065691, + "learning_rate": 9.833865994476584e-06, + "loss": 0.0, + "num_input_tokens_seen": 57393944, + "step": 98930 + }, + { + "epoch": 14.735627047959488, + "grad_norm": 0.000594024546444416, + "learning_rate": 9.831282921626242e-06, + "loss": 0.0, + "num_input_tokens_seen": 57396792, + "step": 98935 + }, + { + "epoch": 14.736371760500447, + "grad_norm": 8.180204167729244e-05, + "learning_rate": 9.82870010503595e-06, + "loss": 0.0, + "num_input_tokens_seen": 57399896, + "step": 98940 + }, + { + "epoch": 14.737116473041405, + "grad_norm": 0.0004822858900297433, + "learning_rate": 9.826117544749357e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57403096, + "step": 98945 + }, + { + "epoch": 14.737861185582366, + "grad_norm": 0.00010686909081414342, + "learning_rate": 9.823535240810089e-06, + "loss": 0.0612, + "num_input_tokens_seen": 57406008, + "step": 98950 + }, + { + "epoch": 14.738605898123325, + "grad_norm": 0.0012558704474940896, + "learning_rate": 9.820953193261756e-06, + "loss": 0.0, + "num_input_tokens_seen": 57408888, + "step": 98955 + }, + { + "epoch": 14.739350610664284, + "grad_norm": 0.01095641776919365, + "learning_rate": 9.818371402148002e-06, + "loss": 0.0435, + "num_input_tokens_seen": 57411608, + "step": 98960 + }, + { + "epoch": 14.740095323205242, + "grad_norm": 0.00017675268463790417, + "learning_rate": 9.815789867512427e-06, + "loss": 0.1159, + "num_input_tokens_seen": 57414680, + "step": 98965 + }, + { + "epoch": 14.740840035746203, + "grad_norm": 0.010678121820092201, + "learning_rate": 9.813208589398654e-06, + "loss": 0.0016, + "num_input_tokens_seen": 57417688, + "step": 98970 + }, + { + "epoch": 14.741584748287162, + "grad_norm": 1.900621282402426e-05, + "learning_rate": 9.81062756785028e-06, + "loss": 0.0, + "num_input_tokens_seen": 57420408, + "step": 98975 + }, + { + "epoch": 14.74232946082812, + "grad_norm": 0.00014698825543746352, + "learning_rate": 9.808046802910926e-06, + "loss": 0.0, + "num_input_tokens_seen": 57423032, + "step": 98980 + }, + { + "epoch": 14.743074173369079, + "grad_norm": 0.00014992960495874286, + "learning_rate": 9.80546629462417e-06, + "loss": 0.0, + "num_input_tokens_seen": 57426136, + "step": 98985 + }, + { + "epoch": 14.743818885910038, + "grad_norm": 0.0007824372733011842, + "learning_rate": 9.802886043033626e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57429176, + "step": 98990 + }, + { + "epoch": 14.744563598450998, + "grad_norm": 0.00295824883505702, + "learning_rate": 9.800306048182878e-06, + "loss": 0.0, + "num_input_tokens_seen": 57432120, + "step": 98995 + }, + { + "epoch": 14.745308310991957, + "grad_norm": 0.0002813405590131879, + "learning_rate": 9.79772631011551e-06, + "loss": 0.0, + "num_input_tokens_seen": 57435128, + "step": 99000 + }, + { + "epoch": 14.746053023532916, + "grad_norm": 0.0021503157913684845, + "learning_rate": 9.795146828875107e-06, + "loss": 0.0, + "num_input_tokens_seen": 57437784, + "step": 99005 + }, + { + "epoch": 14.746797736073875, + "grad_norm": 0.0014162176521494985, + "learning_rate": 9.792567604505234e-06, + "loss": 0.0, + "num_input_tokens_seen": 57440856, + "step": 99010 + }, + { + "epoch": 14.747542448614835, + "grad_norm": 0.009480820968747139, + "learning_rate": 9.789988637049485e-06, + "loss": 0.0, + "num_input_tokens_seen": 57443864, + "step": 99015 + }, + { + "epoch": 14.748287161155794, + "grad_norm": 0.0006286689313128591, + "learning_rate": 9.787409926551411e-06, + "loss": 0.0, + "num_input_tokens_seen": 57446488, + "step": 99020 + }, + { + "epoch": 14.749031873696753, + "grad_norm": 0.004232615232467651, + "learning_rate": 9.784831473054592e-06, + "loss": 0.0, + "num_input_tokens_seen": 57449400, + "step": 99025 + }, + { + "epoch": 14.749776586237711, + "grad_norm": 0.0013886286178603768, + "learning_rate": 9.782253276602582e-06, + "loss": 0.0, + "num_input_tokens_seen": 57452280, + "step": 99030 + }, + { + "epoch": 14.750521298778672, + "grad_norm": 0.0005027790903113782, + "learning_rate": 9.779675337238928e-06, + "loss": 0.0, + "num_input_tokens_seen": 57455416, + "step": 99035 + }, + { + "epoch": 14.75126601131963, + "grad_norm": 0.0003289569285698235, + "learning_rate": 9.777097655007197e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57458424, + "step": 99040 + }, + { + "epoch": 14.75201072386059, + "grad_norm": 0.0003316309012006968, + "learning_rate": 9.774520229950923e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57461848, + "step": 99045 + }, + { + "epoch": 14.752755436401548, + "grad_norm": 0.00471328292042017, + "learning_rate": 9.771943062113664e-06, + "loss": 0.0, + "num_input_tokens_seen": 57464984, + "step": 99050 + }, + { + "epoch": 14.753500148942509, + "grad_norm": 0.0020611982326954603, + "learning_rate": 9.76936615153894e-06, + "loss": 0.0, + "num_input_tokens_seen": 57467896, + "step": 99055 + }, + { + "epoch": 14.754244861483468, + "grad_norm": 0.001794870593585074, + "learning_rate": 9.766789498270304e-06, + "loss": 0.0, + "num_input_tokens_seen": 57470456, + "step": 99060 + }, + { + "epoch": 14.754989574024426, + "grad_norm": 0.024998731911182404, + "learning_rate": 9.764213102351275e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57473208, + "step": 99065 + }, + { + "epoch": 14.755734286565385, + "grad_norm": 0.00023894893820397556, + "learning_rate": 9.761636963825382e-06, + "loss": 0.0, + "num_input_tokens_seen": 57476184, + "step": 99070 + }, + { + "epoch": 14.756478999106346, + "grad_norm": 0.0014873064355924726, + "learning_rate": 9.759061082736145e-06, + "loss": 0.0043, + "num_input_tokens_seen": 57478776, + "step": 99075 + }, + { + "epoch": 14.757223711647304, + "grad_norm": 0.002789634745568037, + "learning_rate": 9.756485459127073e-06, + "loss": 0.0, + "num_input_tokens_seen": 57481432, + "step": 99080 + }, + { + "epoch": 14.757968424188263, + "grad_norm": 0.01907527633011341, + "learning_rate": 9.753910093041696e-06, + "loss": 0.0, + "num_input_tokens_seen": 57484408, + "step": 99085 + }, + { + "epoch": 14.758713136729222, + "grad_norm": 0.1755334436893463, + "learning_rate": 9.751334984523502e-06, + "loss": 0.104, + "num_input_tokens_seen": 57487544, + "step": 99090 + }, + { + "epoch": 14.759457849270182, + "grad_norm": 3.3743042877176777e-05, + "learning_rate": 9.748760133616015e-06, + "loss": 0.0, + "num_input_tokens_seen": 57490232, + "step": 99095 + }, + { + "epoch": 14.760202561811141, + "grad_norm": 0.00021401044796220958, + "learning_rate": 9.746185540362714e-06, + "loss": 0.0, + "num_input_tokens_seen": 57492824, + "step": 99100 + }, + { + "epoch": 14.7609472743521, + "grad_norm": 0.0036965967155992985, + "learning_rate": 9.743611204807118e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57495608, + "step": 99105 + }, + { + "epoch": 14.761691986893059, + "grad_norm": 0.01303961593657732, + "learning_rate": 9.741037126992702e-06, + "loss": 0.0, + "num_input_tokens_seen": 57498648, + "step": 99110 + }, + { + "epoch": 14.76243669943402, + "grad_norm": 0.0037732443306595087, + "learning_rate": 9.738463306962947e-06, + "loss": 0.0, + "num_input_tokens_seen": 57501592, + "step": 99115 + }, + { + "epoch": 14.763181411974978, + "grad_norm": 0.0002660687896423042, + "learning_rate": 9.73588974476135e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57504600, + "step": 99120 + }, + { + "epoch": 14.763926124515937, + "grad_norm": 0.00312762800604105, + "learning_rate": 9.733316440431375e-06, + "loss": 0.0, + "num_input_tokens_seen": 57507512, + "step": 99125 + }, + { + "epoch": 14.764670837056896, + "grad_norm": 0.0011482916306704283, + "learning_rate": 9.730743394016512e-06, + "loss": 0.0, + "num_input_tokens_seen": 57510584, + "step": 99130 + }, + { + "epoch": 14.765415549597854, + "grad_norm": 8.042353874770924e-05, + "learning_rate": 9.72817060556022e-06, + "loss": 0.0005, + "num_input_tokens_seen": 57513592, + "step": 99135 + }, + { + "epoch": 14.766160262138815, + "grad_norm": 0.0006502403412014246, + "learning_rate": 9.725598075105963e-06, + "loss": 0.1533, + "num_input_tokens_seen": 57516504, + "step": 99140 + }, + { + "epoch": 14.766904974679774, + "grad_norm": 0.004841993562877178, + "learning_rate": 9.723025802697195e-06, + "loss": 0.0, + "num_input_tokens_seen": 57519256, + "step": 99145 + }, + { + "epoch": 14.767649687220732, + "grad_norm": 0.002307395450770855, + "learning_rate": 9.720453788377387e-06, + "loss": 0.048, + "num_input_tokens_seen": 57522616, + "step": 99150 + }, + { + "epoch": 14.768394399761693, + "grad_norm": 0.030044453218579292, + "learning_rate": 9.71788203218998e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57525496, + "step": 99155 + }, + { + "epoch": 14.769139112302652, + "grad_norm": 0.0014614781830459833, + "learning_rate": 9.71531053417842e-06, + "loss": 0.0, + "num_input_tokens_seen": 57528824, + "step": 99160 + }, + { + "epoch": 14.76988382484361, + "grad_norm": 0.0004457752511370927, + "learning_rate": 9.712739294386161e-06, + "loss": 0.0, + "num_input_tokens_seen": 57531864, + "step": 99165 + }, + { + "epoch": 14.77062853738457, + "grad_norm": 0.0013746173353865743, + "learning_rate": 9.710168312856626e-06, + "loss": 0.0, + "num_input_tokens_seen": 57534680, + "step": 99170 + }, + { + "epoch": 14.771373249925528, + "grad_norm": 0.002348515670746565, + "learning_rate": 9.707597589633267e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57537688, + "step": 99175 + }, + { + "epoch": 14.772117962466488, + "grad_norm": 0.0004301834269426763, + "learning_rate": 9.705027124759495e-06, + "loss": 0.0, + "num_input_tokens_seen": 57540504, + "step": 99180 + }, + { + "epoch": 14.772862675007447, + "grad_norm": 0.0003011939115822315, + "learning_rate": 9.702456918278752e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57543480, + "step": 99185 + }, + { + "epoch": 14.773607387548406, + "grad_norm": 0.9741016626358032, + "learning_rate": 9.69988697023445e-06, + "loss": 0.0048, + "num_input_tokens_seen": 57546328, + "step": 99190 + }, + { + "epoch": 14.774352100089365, + "grad_norm": 0.017790021374821663, + "learning_rate": 9.69731728067e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57549016, + "step": 99195 + }, + { + "epoch": 14.775096812630325, + "grad_norm": 0.0002125776809407398, + "learning_rate": 9.694747849628833e-06, + "loss": 0.0, + "num_input_tokens_seen": 57551864, + "step": 99200 + }, + { + "epoch": 14.775841525171284, + "grad_norm": 0.001551509601995349, + "learning_rate": 9.692178677154342e-06, + "loss": 0.0, + "num_input_tokens_seen": 57554744, + "step": 99205 + }, + { + "epoch": 14.776586237712243, + "grad_norm": 0.021439092233777046, + "learning_rate": 9.689609763289936e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57557752, + "step": 99210 + }, + { + "epoch": 14.777330950253202, + "grad_norm": 6.909408693900332e-05, + "learning_rate": 9.687041108079003e-06, + "loss": 0.0, + "num_input_tokens_seen": 57560472, + "step": 99215 + }, + { + "epoch": 14.778075662794162, + "grad_norm": 0.002055924851447344, + "learning_rate": 9.684472711564957e-06, + "loss": 0.0, + "num_input_tokens_seen": 57563608, + "step": 99220 + }, + { + "epoch": 14.77882037533512, + "grad_norm": 0.0010410413378849626, + "learning_rate": 9.681904573791168e-06, + "loss": 0.0, + "num_input_tokens_seen": 57566392, + "step": 99225 + }, + { + "epoch": 14.77956508787608, + "grad_norm": 0.004686088766902685, + "learning_rate": 9.679336694801041e-06, + "loss": 0.1534, + "num_input_tokens_seen": 57569400, + "step": 99230 + }, + { + "epoch": 14.780309800417038, + "grad_norm": 0.018034419044852257, + "learning_rate": 9.67676907463795e-06, + "loss": 0.0044, + "num_input_tokens_seen": 57572440, + "step": 99235 + }, + { + "epoch": 14.781054512957999, + "grad_norm": 0.0003701067471411079, + "learning_rate": 9.674201713345265e-06, + "loss": 0.0083, + "num_input_tokens_seen": 57575160, + "step": 99240 + }, + { + "epoch": 14.781799225498958, + "grad_norm": 2.7104944820166565e-05, + "learning_rate": 9.671634610966373e-06, + "loss": 0.0006, + "num_input_tokens_seen": 57577976, + "step": 99245 + }, + { + "epoch": 14.782543938039916, + "grad_norm": 0.0002085230516968295, + "learning_rate": 9.669067767544626e-06, + "loss": 0.0, + "num_input_tokens_seen": 57580664, + "step": 99250 + }, + { + "epoch": 14.783288650580875, + "grad_norm": 0.021344996988773346, + "learning_rate": 9.666501183123406e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57583640, + "step": 99255 + }, + { + "epoch": 14.784033363121836, + "grad_norm": 0.0014252291293814778, + "learning_rate": 9.663934857746065e-06, + "loss": 0.0, + "num_input_tokens_seen": 57586264, + "step": 99260 + }, + { + "epoch": 14.784778075662794, + "grad_norm": 7.544085383415222e-05, + "learning_rate": 9.661368791455957e-06, + "loss": 0.0, + "num_input_tokens_seen": 57588984, + "step": 99265 + }, + { + "epoch": 14.785522788203753, + "grad_norm": 0.004243881441652775, + "learning_rate": 9.658802984296426e-06, + "loss": 0.0, + "num_input_tokens_seen": 57591960, + "step": 99270 + }, + { + "epoch": 14.786267500744712, + "grad_norm": 5.1464030548231676e-05, + "learning_rate": 9.656237436310834e-06, + "loss": 0.0, + "num_input_tokens_seen": 57595192, + "step": 99275 + }, + { + "epoch": 14.787012213285673, + "grad_norm": 32.13618850708008, + "learning_rate": 9.653672147542515e-06, + "loss": 0.3174, + "num_input_tokens_seen": 57597944, + "step": 99280 + }, + { + "epoch": 14.787756925826631, + "grad_norm": 0.0003913222171831876, + "learning_rate": 9.651107118034799e-06, + "loss": 0.0, + "num_input_tokens_seen": 57601112, + "step": 99285 + }, + { + "epoch": 14.78850163836759, + "grad_norm": 0.0008105671731755137, + "learning_rate": 9.648542347831041e-06, + "loss": 0.0, + "num_input_tokens_seen": 57603960, + "step": 99290 + }, + { + "epoch": 14.789246350908549, + "grad_norm": 0.001613016240298748, + "learning_rate": 9.645977836974545e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57607000, + "step": 99295 + }, + { + "epoch": 14.78999106344951, + "grad_norm": 3.8373334407806396, + "learning_rate": 9.643413585508659e-06, + "loss": 0.018, + "num_input_tokens_seen": 57609720, + "step": 99300 + }, + { + "epoch": 14.790735775990468, + "grad_norm": 0.0005980082205496728, + "learning_rate": 9.640849593476684e-06, + "loss": 0.0, + "num_input_tokens_seen": 57612568, + "step": 99305 + }, + { + "epoch": 14.791480488531427, + "grad_norm": 120.15621948242188, + "learning_rate": 9.63828586092195e-06, + "loss": 0.1814, + "num_input_tokens_seen": 57615448, + "step": 99310 + }, + { + "epoch": 14.792225201072386, + "grad_norm": 0.00011321755300741643, + "learning_rate": 9.635722387887766e-06, + "loss": 0.1407, + "num_input_tokens_seen": 57618392, + "step": 99315 + }, + { + "epoch": 14.792969913613344, + "grad_norm": 0.0015716670313850045, + "learning_rate": 9.63315917441743e-06, + "loss": 0.0, + "num_input_tokens_seen": 57621016, + "step": 99320 + }, + { + "epoch": 14.793714626154305, + "grad_norm": 0.0020214179530739784, + "learning_rate": 9.630596220554259e-06, + "loss": 0.0, + "num_input_tokens_seen": 57624504, + "step": 99325 + }, + { + "epoch": 14.794459338695264, + "grad_norm": 0.0024398399982601404, + "learning_rate": 9.628033526341542e-06, + "loss": 0.0, + "num_input_tokens_seen": 57627416, + "step": 99330 + }, + { + "epoch": 14.795204051236222, + "grad_norm": 0.0005282533820718527, + "learning_rate": 9.625471091822576e-06, + "loss": 0.0, + "num_input_tokens_seen": 57630264, + "step": 99335 + }, + { + "epoch": 14.795948763777183, + "grad_norm": 0.00019823281036224216, + "learning_rate": 9.622908917040643e-06, + "loss": 0.0, + "num_input_tokens_seen": 57633144, + "step": 99340 + }, + { + "epoch": 14.796693476318142, + "grad_norm": 0.0007736619445495307, + "learning_rate": 9.620347002039042e-06, + "loss": 0.0, + "num_input_tokens_seen": 57635928, + "step": 99345 + }, + { + "epoch": 14.7974381888591, + "grad_norm": 9.696435881778598e-05, + "learning_rate": 9.61778534686105e-06, + "loss": 0.0, + "num_input_tokens_seen": 57638712, + "step": 99350 + }, + { + "epoch": 14.79818290140006, + "grad_norm": 0.00021594803547486663, + "learning_rate": 9.615223951549929e-06, + "loss": 0.0, + "num_input_tokens_seen": 57641400, + "step": 99355 + }, + { + "epoch": 14.798927613941018, + "grad_norm": 0.038424644619226456, + "learning_rate": 9.612662816148974e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57644216, + "step": 99360 + }, + { + "epoch": 14.799672326481979, + "grad_norm": 0.01859818398952484, + "learning_rate": 9.61010194070143e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57647224, + "step": 99365 + }, + { + "epoch": 14.800417039022937, + "grad_norm": 0.0006554220453836024, + "learning_rate": 9.607541325250582e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57649944, + "step": 99370 + }, + { + "epoch": 14.801161751563896, + "grad_norm": 0.0008276175940409303, + "learning_rate": 9.604980969839672e-06, + "loss": 0.0073, + "num_input_tokens_seen": 57652728, + "step": 99375 + }, + { + "epoch": 14.801906464104855, + "grad_norm": 0.00010588036820990965, + "learning_rate": 9.60242087451197e-06, + "loss": 0.0, + "num_input_tokens_seen": 57655576, + "step": 99380 + }, + { + "epoch": 14.802651176645815, + "grad_norm": 0.00012153484567534178, + "learning_rate": 9.599861039310709e-06, + "loss": 0.0944, + "num_input_tokens_seen": 57658488, + "step": 99385 + }, + { + "epoch": 14.803395889186774, + "grad_norm": 8.50811557029374e-05, + "learning_rate": 9.597301464279151e-06, + "loss": 0.0, + "num_input_tokens_seen": 57661272, + "step": 99390 + }, + { + "epoch": 14.804140601727733, + "grad_norm": 0.001879361574538052, + "learning_rate": 9.59474214946053e-06, + "loss": 0.0004, + "num_input_tokens_seen": 57664280, + "step": 99395 + }, + { + "epoch": 14.804885314268692, + "grad_norm": 0.0010901311179623008, + "learning_rate": 9.592183094898086e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57667032, + "step": 99400 + }, + { + "epoch": 14.805630026809652, + "grad_norm": 0.002734099980443716, + "learning_rate": 9.589624300635047e-06, + "loss": 0.0, + "num_input_tokens_seen": 57669816, + "step": 99405 + }, + { + "epoch": 14.80637473935061, + "grad_norm": 3.950815153075382e-05, + "learning_rate": 9.587065766714635e-06, + "loss": 0.0, + "num_input_tokens_seen": 57672696, + "step": 99410 + }, + { + "epoch": 14.80711945189157, + "grad_norm": 6.955243861739291e-06, + "learning_rate": 9.584507493180089e-06, + "loss": 0.0, + "num_input_tokens_seen": 57675672, + "step": 99415 + }, + { + "epoch": 14.807864164432528, + "grad_norm": 3.016190767288208, + "learning_rate": 9.581949480074615e-06, + "loss": 0.0291, + "num_input_tokens_seen": 57678808, + "step": 99420 + }, + { + "epoch": 14.808608876973489, + "grad_norm": 0.0012390997726470232, + "learning_rate": 9.579391727441442e-06, + "loss": 0.0, + "num_input_tokens_seen": 57681688, + "step": 99425 + }, + { + "epoch": 14.809353589514448, + "grad_norm": 1.2532442808151245, + "learning_rate": 9.576834235323773e-06, + "loss": 0.0003, + "num_input_tokens_seen": 57684664, + "step": 99430 + }, + { + "epoch": 14.810098302055406, + "grad_norm": 0.0006702003884129226, + "learning_rate": 9.574277003764807e-06, + "loss": 0.0, + "num_input_tokens_seen": 57687416, + "step": 99435 + }, + { + "epoch": 14.810843014596365, + "grad_norm": 0.0032472992315888405, + "learning_rate": 9.571720032807758e-06, + "loss": 0.0004, + "num_input_tokens_seen": 57690648, + "step": 99440 + }, + { + "epoch": 14.811587727137326, + "grad_norm": 0.00037231759051792324, + "learning_rate": 9.569163322495811e-06, + "loss": 0.0, + "num_input_tokens_seen": 57693720, + "step": 99445 + }, + { + "epoch": 14.812332439678285, + "grad_norm": 5.905674697714858e-05, + "learning_rate": 9.566606872872178e-06, + "loss": 0.0012, + "num_input_tokens_seen": 57696664, + "step": 99450 + }, + { + "epoch": 14.813077152219243, + "grad_norm": 5.557838812819682e-05, + "learning_rate": 9.564050683980025e-06, + "loss": 0.0285, + "num_input_tokens_seen": 57700056, + "step": 99455 + }, + { + "epoch": 14.813821864760202, + "grad_norm": 0.01721043512225151, + "learning_rate": 9.561494755862554e-06, + "loss": 0.0, + "num_input_tokens_seen": 57702872, + "step": 99460 + }, + { + "epoch": 14.814566577301163, + "grad_norm": 0.00038432006840594113, + "learning_rate": 9.55893908856294e-06, + "loss": 0.0734, + "num_input_tokens_seen": 57705752, + "step": 99465 + }, + { + "epoch": 14.815311289842121, + "grad_norm": 0.003324268851429224, + "learning_rate": 9.55638368212436e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57708376, + "step": 99470 + }, + { + "epoch": 14.81605600238308, + "grad_norm": 0.00039135554106906056, + "learning_rate": 9.553828536589976e-06, + "loss": 0.3781, + "num_input_tokens_seen": 57711448, + "step": 99475 + }, + { + "epoch": 14.816800714924039, + "grad_norm": 0.00019144850375596434, + "learning_rate": 9.551273652002955e-06, + "loss": 0.0, + "num_input_tokens_seen": 57714520, + "step": 99480 + }, + { + "epoch": 14.817545427465, + "grad_norm": 0.0008899008389562368, + "learning_rate": 9.548719028406472e-06, + "loss": 0.0, + "num_input_tokens_seen": 57717400, + "step": 99485 + }, + { + "epoch": 14.818290140005958, + "grad_norm": 0.000394335831515491, + "learning_rate": 9.546164665843669e-06, + "loss": 0.0, + "num_input_tokens_seen": 57720248, + "step": 99490 + }, + { + "epoch": 14.819034852546917, + "grad_norm": 0.0022507617250084877, + "learning_rate": 9.543610564357714e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57723480, + "step": 99495 + }, + { + "epoch": 14.819779565087876, + "grad_norm": 0.025871029123663902, + "learning_rate": 9.541056723991739e-06, + "loss": 0.0, + "num_input_tokens_seen": 57726264, + "step": 99500 + }, + { + "epoch": 14.820524277628834, + "grad_norm": 0.00027670833515003324, + "learning_rate": 9.538503144788914e-06, + "loss": 0.0, + "num_input_tokens_seen": 57729080, + "step": 99505 + }, + { + "epoch": 14.821268990169795, + "grad_norm": 0.003409743309020996, + "learning_rate": 9.535949826792358e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57731960, + "step": 99510 + }, + { + "epoch": 14.822013702710754, + "grad_norm": 0.001625358359888196, + "learning_rate": 9.533396770045208e-06, + "loss": 0.0, + "num_input_tokens_seen": 57734840, + "step": 99515 + }, + { + "epoch": 14.822758415251712, + "grad_norm": 0.006091665476560593, + "learning_rate": 9.530843974590606e-06, + "loss": 0.1252, + "num_input_tokens_seen": 57737560, + "step": 99520 + }, + { + "epoch": 14.823503127792671, + "grad_norm": 0.0023662818130105734, + "learning_rate": 9.528291440471665e-06, + "loss": 0.0, + "num_input_tokens_seen": 57740184, + "step": 99525 + }, + { + "epoch": 14.824247840333632, + "grad_norm": 0.00011187163909198716, + "learning_rate": 9.525739167731527e-06, + "loss": 0.0, + "num_input_tokens_seen": 57742936, + "step": 99530 + }, + { + "epoch": 14.82499255287459, + "grad_norm": 0.0005262221093289554, + "learning_rate": 9.523187156413294e-06, + "loss": 0.0, + "num_input_tokens_seen": 57745912, + "step": 99535 + }, + { + "epoch": 14.82573726541555, + "grad_norm": 3.618578921305016e-05, + "learning_rate": 9.520635406560086e-06, + "loss": 0.0, + "num_input_tokens_seen": 57748728, + "step": 99540 + }, + { + "epoch": 14.826481977956508, + "grad_norm": 0.0007152193575166166, + "learning_rate": 9.518083918215e-06, + "loss": 0.0, + "num_input_tokens_seen": 57751768, + "step": 99545 + }, + { + "epoch": 14.827226690497469, + "grad_norm": 0.00031328073237091303, + "learning_rate": 9.515532691421162e-06, + "loss": 0.0, + "num_input_tokens_seen": 57754552, + "step": 99550 + }, + { + "epoch": 14.827971403038427, + "grad_norm": 0.0002253648272017017, + "learning_rate": 9.512981726221661e-06, + "loss": 0.0, + "num_input_tokens_seen": 57757592, + "step": 99555 + }, + { + "epoch": 14.828716115579386, + "grad_norm": 0.001199397025629878, + "learning_rate": 9.510431022659586e-06, + "loss": 0.0, + "num_input_tokens_seen": 57760248, + "step": 99560 + }, + { + "epoch": 14.829460828120345, + "grad_norm": 4.7677931434009224e-05, + "learning_rate": 9.507880580778042e-06, + "loss": 0.0, + "num_input_tokens_seen": 57763128, + "step": 99565 + }, + { + "epoch": 14.830205540661305, + "grad_norm": 3.587069295463152e-05, + "learning_rate": 9.505330400620101e-06, + "loss": 0.0, + "num_input_tokens_seen": 57765784, + "step": 99570 + }, + { + "epoch": 14.830950253202264, + "grad_norm": 0.0012754357885569334, + "learning_rate": 9.502780482228866e-06, + "loss": 0.0002, + "num_input_tokens_seen": 57768696, + "step": 99575 + }, + { + "epoch": 14.831694965743223, + "grad_norm": 4.2984076571883634e-05, + "learning_rate": 9.500230825647394e-06, + "loss": 0.0074, + "num_input_tokens_seen": 57771832, + "step": 99580 + }, + { + "epoch": 14.832439678284182, + "grad_norm": 0.002757859881967306, + "learning_rate": 9.497681430918778e-06, + "loss": 0.0, + "num_input_tokens_seen": 57774520, + "step": 99585 + }, + { + "epoch": 14.833184390825142, + "grad_norm": 0.0005279623437672853, + "learning_rate": 9.495132298086079e-06, + "loss": 0.0, + "num_input_tokens_seen": 57777400, + "step": 99590 + }, + { + "epoch": 14.833929103366101, + "grad_norm": 0.0042128972709178925, + "learning_rate": 9.492583427192361e-06, + "loss": 0.2602, + "num_input_tokens_seen": 57780408, + "step": 99595 + }, + { + "epoch": 14.83467381590706, + "grad_norm": 0.0003678476787172258, + "learning_rate": 9.490034818280677e-06, + "loss": 0.0, + "num_input_tokens_seen": 57783512, + "step": 99600 + }, + { + "epoch": 14.835418528448018, + "grad_norm": 0.00013305971515364945, + "learning_rate": 9.487486471394096e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57786520, + "step": 99605 + }, + { + "epoch": 14.836163240988979, + "grad_norm": 0.00012972185504622757, + "learning_rate": 9.48493838657567e-06, + "loss": 0.0, + "num_input_tokens_seen": 57789368, + "step": 99610 + }, + { + "epoch": 14.836907953529938, + "grad_norm": 0.005032144952565432, + "learning_rate": 9.482390563868429e-06, + "loss": 0.0, + "num_input_tokens_seen": 57792216, + "step": 99615 + }, + { + "epoch": 14.837652666070897, + "grad_norm": 0.0010399480815976858, + "learning_rate": 9.479843003315439e-06, + "loss": 0.0, + "num_input_tokens_seen": 57794904, + "step": 99620 + }, + { + "epoch": 14.838397378611855, + "grad_norm": 0.21121080219745636, + "learning_rate": 9.477295704959718e-06, + "loss": 0.0008, + "num_input_tokens_seen": 57797528, + "step": 99625 + }, + { + "epoch": 14.839142091152816, + "grad_norm": 0.0008302798960357904, + "learning_rate": 9.474748668844316e-06, + "loss": 0.0, + "num_input_tokens_seen": 57800088, + "step": 99630 + }, + { + "epoch": 14.839886803693775, + "grad_norm": 2.1567298972513527e-05, + "learning_rate": 9.47220189501226e-06, + "loss": 0.001, + "num_input_tokens_seen": 57803192, + "step": 99635 + }, + { + "epoch": 14.840631516234733, + "grad_norm": 0.002919851802289486, + "learning_rate": 9.46965538350656e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57805816, + "step": 99640 + }, + { + "epoch": 14.841376228775692, + "grad_norm": 0.0004891195567324758, + "learning_rate": 9.467109134370255e-06, + "loss": 0.0002, + "num_input_tokens_seen": 57808920, + "step": 99645 + }, + { + "epoch": 14.842120941316653, + "grad_norm": 0.0006344998255372047, + "learning_rate": 9.46456314764635e-06, + "loss": 0.0, + "num_input_tokens_seen": 57811896, + "step": 99650 + }, + { + "epoch": 14.842865653857611, + "grad_norm": 0.007875470444560051, + "learning_rate": 9.462017423377867e-06, + "loss": 0.0, + "num_input_tokens_seen": 57814808, + "step": 99655 + }, + { + "epoch": 14.84361036639857, + "grad_norm": 0.0012360612163320184, + "learning_rate": 9.459471961607808e-06, + "loss": 0.0, + "num_input_tokens_seen": 57817624, + "step": 99660 + }, + { + "epoch": 14.844355078939529, + "grad_norm": 0.00023832412261981517, + "learning_rate": 9.456926762379175e-06, + "loss": 0.0, + "num_input_tokens_seen": 57820696, + "step": 99665 + }, + { + "epoch": 14.84509979148049, + "grad_norm": 1.5695268302806653e-05, + "learning_rate": 9.45438182573496e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57823544, + "step": 99670 + }, + { + "epoch": 14.845844504021448, + "grad_norm": 0.0007029015687294304, + "learning_rate": 9.451837151718171e-06, + "loss": 0.0, + "num_input_tokens_seen": 57826520, + "step": 99675 + }, + { + "epoch": 14.846589216562407, + "grad_norm": 0.0003541543264873326, + "learning_rate": 9.449292740371793e-06, + "loss": 0.0, + "num_input_tokens_seen": 57829784, + "step": 99680 + }, + { + "epoch": 14.847333929103366, + "grad_norm": 0.000408386840717867, + "learning_rate": 9.4467485917388e-06, + "loss": 0.0, + "num_input_tokens_seen": 57832376, + "step": 99685 + }, + { + "epoch": 14.848078641644324, + "grad_norm": 0.0004311621014494449, + "learning_rate": 9.444204705862189e-06, + "loss": 0.0, + "num_input_tokens_seen": 57835480, + "step": 99690 + }, + { + "epoch": 14.848823354185285, + "grad_norm": 0.0009885294130071998, + "learning_rate": 9.441661082784919e-06, + "loss": 0.0, + "num_input_tokens_seen": 57838232, + "step": 99695 + }, + { + "epoch": 14.849568066726244, + "grad_norm": 0.0007261888822540641, + "learning_rate": 9.439117722549984e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57840824, + "step": 99700 + }, + { + "epoch": 14.850312779267203, + "grad_norm": 0.0016329376958310604, + "learning_rate": 9.436574625200332e-06, + "loss": 0.0, + "num_input_tokens_seen": 57843544, + "step": 99705 + }, + { + "epoch": 14.851057491808161, + "grad_norm": 0.00124551507178694, + "learning_rate": 9.434031790778941e-06, + "loss": 0.0, + "num_input_tokens_seen": 57846520, + "step": 99710 + }, + { + "epoch": 14.851802204349122, + "grad_norm": 0.0011032592738047242, + "learning_rate": 9.431489219328759e-06, + "loss": 0.0, + "num_input_tokens_seen": 57849528, + "step": 99715 + }, + { + "epoch": 14.85254691689008, + "grad_norm": 7.69870457588695e-05, + "learning_rate": 9.42894691089274e-06, + "loss": 0.0014, + "num_input_tokens_seen": 57852312, + "step": 99720 + }, + { + "epoch": 14.85329162943104, + "grad_norm": 0.11879818886518478, + "learning_rate": 9.426404865513843e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57855544, + "step": 99725 + }, + { + "epoch": 14.854036341971998, + "grad_norm": 0.02088002860546112, + "learning_rate": 9.42386308323501e-06, + "loss": 0.0006, + "num_input_tokens_seen": 57858424, + "step": 99730 + }, + { + "epoch": 14.854781054512959, + "grad_norm": 0.0003715589118655771, + "learning_rate": 9.421321564099175e-06, + "loss": 0.0621, + "num_input_tokens_seen": 57861432, + "step": 99735 + }, + { + "epoch": 14.855525767053917, + "grad_norm": 0.000670996552798897, + "learning_rate": 9.418780308149276e-06, + "loss": 0.0016, + "num_input_tokens_seen": 57864408, + "step": 99740 + }, + { + "epoch": 14.856270479594876, + "grad_norm": 0.002612158190459013, + "learning_rate": 9.416239315428252e-06, + "loss": 0.0003, + "num_input_tokens_seen": 57867352, + "step": 99745 + }, + { + "epoch": 14.857015192135835, + "grad_norm": 0.000764430733397603, + "learning_rate": 9.413698585979016e-06, + "loss": 0.0002, + "num_input_tokens_seen": 57870488, + "step": 99750 + }, + { + "epoch": 14.857759904676795, + "grad_norm": 0.07319119572639465, + "learning_rate": 9.411158119844512e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57873464, + "step": 99755 + }, + { + "epoch": 14.858504617217754, + "grad_norm": 4.376292781671509e-05, + "learning_rate": 9.40861791706765e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57876344, + "step": 99760 + }, + { + "epoch": 14.859249329758713, + "grad_norm": 0.0017766135279089212, + "learning_rate": 9.40607797769133e-06, + "loss": 0.0, + "num_input_tokens_seen": 57878968, + "step": 99765 + }, + { + "epoch": 14.859994042299672, + "grad_norm": 0.09487221390008926, + "learning_rate": 9.403538301758486e-06, + "loss": 0.0, + "num_input_tokens_seen": 57882328, + "step": 99770 + }, + { + "epoch": 14.860738754840632, + "grad_norm": 0.00011164408351760358, + "learning_rate": 9.400998889311999e-06, + "loss": 0.0, + "num_input_tokens_seen": 57885144, + "step": 99775 + }, + { + "epoch": 14.861483467381591, + "grad_norm": 0.00038494475302286446, + "learning_rate": 9.398459740394792e-06, + "loss": 0.0, + "num_input_tokens_seen": 57887896, + "step": 99780 + }, + { + "epoch": 14.86222817992255, + "grad_norm": 0.00010553328320384026, + "learning_rate": 9.395920855049739e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57890680, + "step": 99785 + }, + { + "epoch": 14.862972892463509, + "grad_norm": 1.136595801654039e-05, + "learning_rate": 9.393382233319757e-06, + "loss": 0.0, + "num_input_tokens_seen": 57893464, + "step": 99790 + }, + { + "epoch": 14.863717605004469, + "grad_norm": 5.270318508148193, + "learning_rate": 9.390843875247717e-06, + "loss": 0.0004, + "num_input_tokens_seen": 57896408, + "step": 99795 + }, + { + "epoch": 14.864462317545428, + "grad_norm": 0.00020509841851890087, + "learning_rate": 9.388305780876508e-06, + "loss": 0.0073, + "num_input_tokens_seen": 57899192, + "step": 99800 + }, + { + "epoch": 14.865207030086387, + "grad_norm": 0.00011114904918940738, + "learning_rate": 9.385767950249003e-06, + "loss": 0.0, + "num_input_tokens_seen": 57902136, + "step": 99805 + }, + { + "epoch": 14.865951742627345, + "grad_norm": 0.0019242580747231841, + "learning_rate": 9.383230383408073e-06, + "loss": 0.2313, + "num_input_tokens_seen": 57904792, + "step": 99810 + }, + { + "epoch": 14.866696455168306, + "grad_norm": 61.40816879272461, + "learning_rate": 9.380693080396599e-06, + "loss": 0.119, + "num_input_tokens_seen": 57907992, + "step": 99815 + }, + { + "epoch": 14.867441167709265, + "grad_norm": 0.0002658355806488544, + "learning_rate": 9.378156041257436e-06, + "loss": 0.0, + "num_input_tokens_seen": 57910840, + "step": 99820 + }, + { + "epoch": 14.868185880250223, + "grad_norm": 0.0002547113981563598, + "learning_rate": 9.375619266033456e-06, + "loss": 0.0003, + "num_input_tokens_seen": 57913400, + "step": 99825 + }, + { + "epoch": 14.868930592791182, + "grad_norm": 0.0007902481011115015, + "learning_rate": 9.373082754767497e-06, + "loss": 0.0, + "num_input_tokens_seen": 57916312, + "step": 99830 + }, + { + "epoch": 14.86967530533214, + "grad_norm": 0.0014700343599542975, + "learning_rate": 9.370546507502433e-06, + "loss": 0.0, + "num_input_tokens_seen": 57919128, + "step": 99835 + }, + { + "epoch": 14.870420017873101, + "grad_norm": 0.0040117246098816395, + "learning_rate": 9.368010524281104e-06, + "loss": 0.0, + "num_input_tokens_seen": 57922104, + "step": 99840 + }, + { + "epoch": 14.87116473041406, + "grad_norm": 0.0168145839124918, + "learning_rate": 9.365474805146337e-06, + "loss": 0.0, + "num_input_tokens_seen": 57924920, + "step": 99845 + }, + { + "epoch": 14.871909442955019, + "grad_norm": 9.353617497254163e-05, + "learning_rate": 9.362939350140992e-06, + "loss": 0.0514, + "num_input_tokens_seen": 57927768, + "step": 99850 + }, + { + "epoch": 14.87265415549598, + "grad_norm": 2.8231546821189113e-05, + "learning_rate": 9.360404159307887e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57930488, + "step": 99855 + }, + { + "epoch": 14.873398868036938, + "grad_norm": 0.0008131580543704331, + "learning_rate": 9.357869232689867e-06, + "loss": 0.0, + "num_input_tokens_seen": 57933048, + "step": 99860 + }, + { + "epoch": 14.874143580577897, + "grad_norm": 6.349205250444356e-06, + "learning_rate": 9.355334570329746e-06, + "loss": 0.0011, + "num_input_tokens_seen": 57935896, + "step": 99865 + }, + { + "epoch": 14.874888293118856, + "grad_norm": 0.0010936533799394965, + "learning_rate": 9.352800172270352e-06, + "loss": 0.0, + "num_input_tokens_seen": 57938840, + "step": 99870 + }, + { + "epoch": 14.875633005659815, + "grad_norm": 0.00011885679850820452, + "learning_rate": 9.35026603855449e-06, + "loss": 0.0, + "num_input_tokens_seen": 57941560, + "step": 99875 + }, + { + "epoch": 14.876377718200775, + "grad_norm": 0.0005987751646898687, + "learning_rate": 9.347732169224972e-06, + "loss": 0.2625, + "num_input_tokens_seen": 57944344, + "step": 99880 + }, + { + "epoch": 14.877122430741734, + "grad_norm": 0.0005759124760515988, + "learning_rate": 9.345198564324616e-06, + "loss": 0.0, + "num_input_tokens_seen": 57948472, + "step": 99885 + }, + { + "epoch": 14.877867143282693, + "grad_norm": 9.468549978919327e-05, + "learning_rate": 9.342665223896216e-06, + "loss": 0.0532, + "num_input_tokens_seen": 57951352, + "step": 99890 + }, + { + "epoch": 14.878611855823651, + "grad_norm": 8.921784319682047e-05, + "learning_rate": 9.34013214798258e-06, + "loss": 0.0016, + "num_input_tokens_seen": 57954712, + "step": 99895 + }, + { + "epoch": 14.879356568364612, + "grad_norm": 5.272090857033618e-05, + "learning_rate": 9.337599336626488e-06, + "loss": 0.2656, + "num_input_tokens_seen": 57957592, + "step": 99900 + }, + { + "epoch": 14.88010128090557, + "grad_norm": 1.763284672051668e-05, + "learning_rate": 9.335066789870741e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57960632, + "step": 99905 + }, + { + "epoch": 14.88084599344653, + "grad_norm": 2.9303244446055032e-05, + "learning_rate": 9.332534507758114e-06, + "loss": 0.0, + "num_input_tokens_seen": 57963768, + "step": 99910 + }, + { + "epoch": 14.881590705987488, + "grad_norm": 0.0004844434733968228, + "learning_rate": 9.330002490331402e-06, + "loss": 0.0, + "num_input_tokens_seen": 57966648, + "step": 99915 + }, + { + "epoch": 14.882335418528449, + "grad_norm": 3.4952778816223145, + "learning_rate": 9.32747073763337e-06, + "loss": 0.0037, + "num_input_tokens_seen": 57969464, + "step": 99920 + }, + { + "epoch": 14.883080131069407, + "grad_norm": 0.004240571055561304, + "learning_rate": 9.324939249706793e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57972792, + "step": 99925 + }, + { + "epoch": 14.883824843610366, + "grad_norm": 0.00012341370165813714, + "learning_rate": 9.322408026594427e-06, + "loss": 0.0, + "num_input_tokens_seen": 57975896, + "step": 99930 + }, + { + "epoch": 14.884569556151325, + "grad_norm": 0.0018077932763844728, + "learning_rate": 9.319877068339051e-06, + "loss": 0.0, + "num_input_tokens_seen": 57979096, + "step": 99935 + }, + { + "epoch": 14.885314268692285, + "grad_norm": 0.11657734960317612, + "learning_rate": 9.317346374983416e-06, + "loss": 0.0002, + "num_input_tokens_seen": 57982008, + "step": 99940 + }, + { + "epoch": 14.886058981233244, + "grad_norm": 0.04350167512893677, + "learning_rate": 9.314815946570263e-06, + "loss": 0.0001, + "num_input_tokens_seen": 57984568, + "step": 99945 + }, + { + "epoch": 14.886803693774203, + "grad_norm": 0.00016625141142867506, + "learning_rate": 9.312285783142366e-06, + "loss": 0.0, + "num_input_tokens_seen": 57987512, + "step": 99950 + }, + { + "epoch": 14.887548406315162, + "grad_norm": 0.0003094373969361186, + "learning_rate": 9.309755884742455e-06, + "loss": 0.0852, + "num_input_tokens_seen": 57990264, + "step": 99955 + }, + { + "epoch": 14.888293118856122, + "grad_norm": 3.094921703450382e-05, + "learning_rate": 9.307226251413262e-06, + "loss": 0.0, + "num_input_tokens_seen": 57993272, + "step": 99960 + }, + { + "epoch": 14.889037831397081, + "grad_norm": 0.01205570250749588, + "learning_rate": 9.304696883197542e-06, + "loss": 0.0, + "num_input_tokens_seen": 57996216, + "step": 99965 + }, + { + "epoch": 14.88978254393804, + "grad_norm": 0.0001845170190790668, + "learning_rate": 9.302167780138005e-06, + "loss": 0.0002, + "num_input_tokens_seen": 57999160, + "step": 99970 + }, + { + "epoch": 14.890527256478999, + "grad_norm": 0.000995744252577424, + "learning_rate": 9.2996389422774e-06, + "loss": 0.0012, + "num_input_tokens_seen": 58002040, + "step": 99975 + }, + { + "epoch": 14.891271969019959, + "grad_norm": 0.0012110312236472964, + "learning_rate": 9.297110369658426e-06, + "loss": 0.0, + "num_input_tokens_seen": 58004824, + "step": 99980 + }, + { + "epoch": 14.892016681560918, + "grad_norm": 0.00256472360342741, + "learning_rate": 9.294582062323825e-06, + "loss": 0.0, + "num_input_tokens_seen": 58007672, + "step": 99985 + }, + { + "epoch": 14.892761394101877, + "grad_norm": 0.00011588199413381517, + "learning_rate": 9.292054020316297e-06, + "loss": 0.0561, + "num_input_tokens_seen": 58010808, + "step": 99990 + }, + { + "epoch": 14.893506106642835, + "grad_norm": 0.00029020803049206734, + "learning_rate": 9.28952624367855e-06, + "loss": 0.2063, + "num_input_tokens_seen": 58013912, + "step": 99995 + }, + { + "epoch": 14.894250819183796, + "grad_norm": 0.00045088690239936113, + "learning_rate": 9.286998732453292e-06, + "loss": 0.0, + "num_input_tokens_seen": 58016824, + "step": 100000 + }, + { + "epoch": 14.894995531724755, + "grad_norm": 0.0001288884086534381, + "learning_rate": 9.28447148668321e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58019832, + "step": 100005 + }, + { + "epoch": 14.895740244265713, + "grad_norm": 0.0038883632514625788, + "learning_rate": 9.28194450641102e-06, + "loss": 0.0, + "num_input_tokens_seen": 58022552, + "step": 100010 + }, + { + "epoch": 14.896484956806672, + "grad_norm": 7.284088496817276e-05, + "learning_rate": 9.27941779167939e-06, + "loss": 0.0, + "num_input_tokens_seen": 58025432, + "step": 100015 + }, + { + "epoch": 14.897229669347631, + "grad_norm": 0.0001479050552006811, + "learning_rate": 9.27689134253103e-06, + "loss": 0.0, + "num_input_tokens_seen": 58028376, + "step": 100020 + }, + { + "epoch": 14.897974381888591, + "grad_norm": 0.17356379330158234, + "learning_rate": 9.274365159008602e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58031320, + "step": 100025 + }, + { + "epoch": 14.89871909442955, + "grad_norm": 0.0001025009696604684, + "learning_rate": 9.2718392411548e-06, + "loss": 0.0, + "num_input_tokens_seen": 58034232, + "step": 100030 + }, + { + "epoch": 14.899463806970509, + "grad_norm": 9.646952821640298e-05, + "learning_rate": 9.26931358901229e-06, + "loss": 0.0284, + "num_input_tokens_seen": 58037240, + "step": 100035 + }, + { + "epoch": 14.900208519511468, + "grad_norm": 3.6634563002735376e-05, + "learning_rate": 9.26678820262373e-06, + "loss": 0.0, + "num_input_tokens_seen": 58040088, + "step": 100040 + }, + { + "epoch": 14.900953232052428, + "grad_norm": 68.19993591308594, + "learning_rate": 9.2642630820318e-06, + "loss": 0.2281, + "num_input_tokens_seen": 58042680, + "step": 100045 + }, + { + "epoch": 14.901697944593387, + "grad_norm": 0.0018133977428078651, + "learning_rate": 9.261738227279144e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58045400, + "step": 100050 + }, + { + "epoch": 14.902442657134346, + "grad_norm": 0.03197111561894417, + "learning_rate": 9.259213638408434e-06, + "loss": 0.0, + "num_input_tokens_seen": 58048216, + "step": 100055 + }, + { + "epoch": 14.903187369675305, + "grad_norm": 0.005212141200900078, + "learning_rate": 9.25668931546231e-06, + "loss": 0.0, + "num_input_tokens_seen": 58051064, + "step": 100060 + }, + { + "epoch": 14.903932082216265, + "grad_norm": 0.009329231455922127, + "learning_rate": 9.254165258483421e-06, + "loss": 0.1441, + "num_input_tokens_seen": 58053848, + "step": 100065 + }, + { + "epoch": 14.904676794757224, + "grad_norm": 0.00935449730604887, + "learning_rate": 9.251641467514399e-06, + "loss": 0.0, + "num_input_tokens_seen": 58057144, + "step": 100070 + }, + { + "epoch": 14.905421507298183, + "grad_norm": 0.0008219737210310996, + "learning_rate": 9.249117942597895e-06, + "loss": 0.0, + "num_input_tokens_seen": 58060344, + "step": 100075 + }, + { + "epoch": 14.906166219839141, + "grad_norm": 0.001941185211762786, + "learning_rate": 9.246594683776536e-06, + "loss": 0.0, + "num_input_tokens_seen": 58063448, + "step": 100080 + }, + { + "epoch": 14.906910932380102, + "grad_norm": 0.00029213528614491224, + "learning_rate": 9.244071691092937e-06, + "loss": 0.0, + "num_input_tokens_seen": 58066488, + "step": 100085 + }, + { + "epoch": 14.90765564492106, + "grad_norm": 3.377149550942704e-05, + "learning_rate": 9.241548964589747e-06, + "loss": 0.001, + "num_input_tokens_seen": 58069592, + "step": 100090 + }, + { + "epoch": 14.90840035746202, + "grad_norm": 0.00040206380072049797, + "learning_rate": 9.239026504309558e-06, + "loss": 0.0, + "num_input_tokens_seen": 58072504, + "step": 100095 + }, + { + "epoch": 14.909145070002978, + "grad_norm": 0.00025742012076079845, + "learning_rate": 9.236504310295007e-06, + "loss": 0.0, + "num_input_tokens_seen": 58075384, + "step": 100100 + }, + { + "epoch": 14.909889782543939, + "grad_norm": 0.0003060655726585537, + "learning_rate": 9.233982382588688e-06, + "loss": 0.0012, + "num_input_tokens_seen": 58078104, + "step": 100105 + }, + { + "epoch": 14.910634495084897, + "grad_norm": 0.00020785997912753373, + "learning_rate": 9.23146072123322e-06, + "loss": 0.0, + "num_input_tokens_seen": 58081048, + "step": 100110 + }, + { + "epoch": 14.911379207625856, + "grad_norm": 0.01892426423728466, + "learning_rate": 9.228939326271197e-06, + "loss": 0.0423, + "num_input_tokens_seen": 58084056, + "step": 100115 + }, + { + "epoch": 14.912123920166815, + "grad_norm": 0.0017747904639691114, + "learning_rate": 9.226418197745206e-06, + "loss": 0.0, + "num_input_tokens_seen": 58086840, + "step": 100120 + }, + { + "epoch": 14.912868632707776, + "grad_norm": 0.011085190810263157, + "learning_rate": 9.223897335697856e-06, + "loss": 0.0353, + "num_input_tokens_seen": 58089656, + "step": 100125 + }, + { + "epoch": 14.913613345248734, + "grad_norm": 0.00013400628813542426, + "learning_rate": 9.221376740171727e-06, + "loss": 0.0, + "num_input_tokens_seen": 58092440, + "step": 100130 + }, + { + "epoch": 14.914358057789693, + "grad_norm": 1.1263958185736556e-05, + "learning_rate": 9.2188564112094e-06, + "loss": 0.0, + "num_input_tokens_seen": 58095352, + "step": 100135 + }, + { + "epoch": 14.915102770330652, + "grad_norm": 2.2236155928112566e-05, + "learning_rate": 9.216336348853449e-06, + "loss": 0.0327, + "num_input_tokens_seen": 58098328, + "step": 100140 + }, + { + "epoch": 14.915847482871612, + "grad_norm": 0.00011122751311631873, + "learning_rate": 9.213816553146462e-06, + "loss": 0.0, + "num_input_tokens_seen": 58101208, + "step": 100145 + }, + { + "epoch": 14.916592195412571, + "grad_norm": 0.0031168200075626373, + "learning_rate": 9.211297024130989e-06, + "loss": 0.0003, + "num_input_tokens_seen": 58104120, + "step": 100150 + }, + { + "epoch": 14.91733690795353, + "grad_norm": 0.001926942728459835, + "learning_rate": 9.208777761849616e-06, + "loss": 0.0, + "num_input_tokens_seen": 58107448, + "step": 100155 + }, + { + "epoch": 14.918081620494489, + "grad_norm": 0.00023199321003630757, + "learning_rate": 9.20625876634489e-06, + "loss": 0.0, + "num_input_tokens_seen": 58110296, + "step": 100160 + }, + { + "epoch": 14.91882633303545, + "grad_norm": 0.00861083809286356, + "learning_rate": 9.203740037659367e-06, + "loss": 0.0, + "num_input_tokens_seen": 58112984, + "step": 100165 + }, + { + "epoch": 14.919571045576408, + "grad_norm": 0.0006991121335886419, + "learning_rate": 9.201221575835608e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58116120, + "step": 100170 + }, + { + "epoch": 14.920315758117367, + "grad_norm": 0.002830547047778964, + "learning_rate": 9.198703380916143e-06, + "loss": 0.0, + "num_input_tokens_seen": 58118872, + "step": 100175 + }, + { + "epoch": 14.921060470658325, + "grad_norm": 0.0008905582362785935, + "learning_rate": 9.196185452943534e-06, + "loss": 0.0226, + "num_input_tokens_seen": 58121720, + "step": 100180 + }, + { + "epoch": 14.921805183199286, + "grad_norm": 0.0009667233098298311, + "learning_rate": 9.193667791960303e-06, + "loss": 0.0, + "num_input_tokens_seen": 58124632, + "step": 100185 + }, + { + "epoch": 14.922549895740245, + "grad_norm": 0.007774511352181435, + "learning_rate": 9.191150398008996e-06, + "loss": 0.0, + "num_input_tokens_seen": 58127512, + "step": 100190 + }, + { + "epoch": 14.923294608281203, + "grad_norm": 0.002007113303989172, + "learning_rate": 9.188633271132135e-06, + "loss": 0.0017, + "num_input_tokens_seen": 58130488, + "step": 100195 + }, + { + "epoch": 14.924039320822162, + "grad_norm": 0.00024686031974852085, + "learning_rate": 9.186116411372248e-06, + "loss": 0.0, + "num_input_tokens_seen": 58133720, + "step": 100200 + }, + { + "epoch": 14.924784033363121, + "grad_norm": 0.00140757963526994, + "learning_rate": 9.183599818771849e-06, + "loss": 0.1284, + "num_input_tokens_seen": 58136408, + "step": 100205 + }, + { + "epoch": 14.925528745904082, + "grad_norm": 0.00021293688041623682, + "learning_rate": 9.181083493373449e-06, + "loss": 0.0, + "num_input_tokens_seen": 58139480, + "step": 100210 + }, + { + "epoch": 14.92627345844504, + "grad_norm": 0.0009326253784820437, + "learning_rate": 9.178567435219574e-06, + "loss": 0.0, + "num_input_tokens_seen": 58142424, + "step": 100215 + }, + { + "epoch": 14.927018170985999, + "grad_norm": 3.2781576010165736e-05, + "learning_rate": 9.176051644352713e-06, + "loss": 0.0, + "num_input_tokens_seen": 58145176, + "step": 100220 + }, + { + "epoch": 14.927762883526958, + "grad_norm": 0.00013280885468702763, + "learning_rate": 9.173536120815385e-06, + "loss": 0.0, + "num_input_tokens_seen": 58147800, + "step": 100225 + }, + { + "epoch": 14.928507596067918, + "grad_norm": 0.0022528020199388266, + "learning_rate": 9.171020864650071e-06, + "loss": 0.0, + "num_input_tokens_seen": 58150648, + "step": 100230 + }, + { + "epoch": 14.929252308608877, + "grad_norm": 9.747262811288238e-05, + "learning_rate": 9.16850587589928e-06, + "loss": 0.1657, + "num_input_tokens_seen": 58153432, + "step": 100235 + }, + { + "epoch": 14.929997021149836, + "grad_norm": 0.0006904290057718754, + "learning_rate": 9.16599115460549e-06, + "loss": 0.0, + "num_input_tokens_seen": 58156280, + "step": 100240 + }, + { + "epoch": 14.930741733690795, + "grad_norm": 0.0037009473890066147, + "learning_rate": 9.16347670081118e-06, + "loss": 0.0271, + "num_input_tokens_seen": 58158872, + "step": 100245 + }, + { + "epoch": 14.931486446231755, + "grad_norm": 0.23649892210960388, + "learning_rate": 9.160962514558843e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58161976, + "step": 100250 + }, + { + "epoch": 14.932231158772714, + "grad_norm": 0.0059602512046694756, + "learning_rate": 9.158448595890948e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58164664, + "step": 100255 + }, + { + "epoch": 14.932975871313673, + "grad_norm": 0.015598520636558533, + "learning_rate": 9.155934944849953e-06, + "loss": 0.0, + "num_input_tokens_seen": 58167672, + "step": 100260 + }, + { + "epoch": 14.933720583854631, + "grad_norm": 0.0002639967715367675, + "learning_rate": 9.153421561478346e-06, + "loss": 0.198, + "num_input_tokens_seen": 58170584, + "step": 100265 + }, + { + "epoch": 14.934465296395592, + "grad_norm": 0.00010564661351963878, + "learning_rate": 9.150908445818571e-06, + "loss": 0.0, + "num_input_tokens_seen": 58173528, + "step": 100270 + }, + { + "epoch": 14.93521000893655, + "grad_norm": 0.03990735113620758, + "learning_rate": 9.148395597913085e-06, + "loss": 0.0, + "num_input_tokens_seen": 58176728, + "step": 100275 + }, + { + "epoch": 14.93595472147751, + "grad_norm": 2.5619925509090535e-05, + "learning_rate": 9.14588301780435e-06, + "loss": 0.0, + "num_input_tokens_seen": 58179480, + "step": 100280 + }, + { + "epoch": 14.936699434018468, + "grad_norm": 0.0001650555059313774, + "learning_rate": 9.14337070553481e-06, + "loss": 0.1875, + "num_input_tokens_seen": 58182392, + "step": 100285 + }, + { + "epoch": 14.937444146559429, + "grad_norm": 1.7026597561198287e-05, + "learning_rate": 9.140858661146897e-06, + "loss": 0.0, + "num_input_tokens_seen": 58185272, + "step": 100290 + }, + { + "epoch": 14.938188859100388, + "grad_norm": 0.00022052742133382708, + "learning_rate": 9.138346884683066e-06, + "loss": 0.0, + "num_input_tokens_seen": 58188344, + "step": 100295 + }, + { + "epoch": 14.938933571641346, + "grad_norm": 0.006847086828202009, + "learning_rate": 9.135835376185737e-06, + "loss": 0.0, + "num_input_tokens_seen": 58191256, + "step": 100300 + }, + { + "epoch": 14.939678284182305, + "grad_norm": 0.0002282740897499025, + "learning_rate": 9.133324135697351e-06, + "loss": 0.0, + "num_input_tokens_seen": 58194456, + "step": 100305 + }, + { + "epoch": 14.940422996723266, + "grad_norm": 0.00023084842541720718, + "learning_rate": 9.130813163260321e-06, + "loss": 0.0, + "num_input_tokens_seen": 58197464, + "step": 100310 + }, + { + "epoch": 14.941167709264224, + "grad_norm": 1.300061649089912e-05, + "learning_rate": 9.128302458917081e-06, + "loss": 0.0, + "num_input_tokens_seen": 58200152, + "step": 100315 + }, + { + "epoch": 14.941912421805183, + "grad_norm": 0.003249454079195857, + "learning_rate": 9.125792022710042e-06, + "loss": 0.0, + "num_input_tokens_seen": 58202936, + "step": 100320 + }, + { + "epoch": 14.942657134346142, + "grad_norm": 0.0007349003572016954, + "learning_rate": 9.123281854681612e-06, + "loss": 0.0, + "num_input_tokens_seen": 58205880, + "step": 100325 + }, + { + "epoch": 14.943401846887102, + "grad_norm": 0.0008250275859609246, + "learning_rate": 9.120771954874199e-06, + "loss": 0.0, + "num_input_tokens_seen": 58208792, + "step": 100330 + }, + { + "epoch": 14.944146559428061, + "grad_norm": 0.0001870858686743304, + "learning_rate": 9.118262323330196e-06, + "loss": 0.0, + "num_input_tokens_seen": 58211576, + "step": 100335 + }, + { + "epoch": 14.94489127196902, + "grad_norm": 0.023188943043351173, + "learning_rate": 9.115752960092017e-06, + "loss": 0.0, + "num_input_tokens_seen": 58214552, + "step": 100340 + }, + { + "epoch": 14.945635984509979, + "grad_norm": 0.0008011471945792437, + "learning_rate": 9.11324386520204e-06, + "loss": 0.0003, + "num_input_tokens_seen": 58217272, + "step": 100345 + }, + { + "epoch": 14.946380697050937, + "grad_norm": 0.0006896066479384899, + "learning_rate": 9.11073503870267e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58220216, + "step": 100350 + }, + { + "epoch": 14.947125409591898, + "grad_norm": 0.0004530654987320304, + "learning_rate": 9.108226480636276e-06, + "loss": 0.0, + "num_input_tokens_seen": 58223160, + "step": 100355 + }, + { + "epoch": 14.947870122132857, + "grad_norm": 0.00017090233450289816, + "learning_rate": 9.105718191045248e-06, + "loss": 0.0, + "num_input_tokens_seen": 58225848, + "step": 100360 + }, + { + "epoch": 14.948614834673815, + "grad_norm": 0.00025176297640427947, + "learning_rate": 9.10321016997196e-06, + "loss": 0.0, + "num_input_tokens_seen": 58228664, + "step": 100365 + }, + { + "epoch": 14.949359547214776, + "grad_norm": 1.5559353414573707e-05, + "learning_rate": 9.10070241745877e-06, + "loss": 0.0, + "num_input_tokens_seen": 58231576, + "step": 100370 + }, + { + "epoch": 14.950104259755735, + "grad_norm": 0.025494832545518875, + "learning_rate": 9.098194933548063e-06, + "loss": 0.0, + "num_input_tokens_seen": 58234680, + "step": 100375 + }, + { + "epoch": 14.950848972296694, + "grad_norm": 4.530619116849266e-05, + "learning_rate": 9.09568771828218e-06, + "loss": 0.0, + "num_input_tokens_seen": 58237464, + "step": 100380 + }, + { + "epoch": 14.951593684837652, + "grad_norm": 0.0013649618485942483, + "learning_rate": 9.0931807717035e-06, + "loss": 0.0, + "num_input_tokens_seen": 58240472, + "step": 100385 + }, + { + "epoch": 14.952338397378611, + "grad_norm": 0.0606965608894825, + "learning_rate": 9.090674093854362e-06, + "loss": 0.0452, + "num_input_tokens_seen": 58243160, + "step": 100390 + }, + { + "epoch": 14.953083109919572, + "grad_norm": 0.001346447505056858, + "learning_rate": 9.088167684777115e-06, + "loss": 0.0796, + "num_input_tokens_seen": 58245880, + "step": 100395 + }, + { + "epoch": 14.95382782246053, + "grad_norm": 0.00012426840839907527, + "learning_rate": 9.085661544514104e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58248888, + "step": 100400 + }, + { + "epoch": 14.954572535001489, + "grad_norm": 0.00010656202357495204, + "learning_rate": 9.083155673107657e-06, + "loss": 0.0, + "num_input_tokens_seen": 58251672, + "step": 100405 + }, + { + "epoch": 14.955317247542448, + "grad_norm": 0.00012396305100992322, + "learning_rate": 9.080650070600128e-06, + "loss": 0.0, + "num_input_tokens_seen": 58254936, + "step": 100410 + }, + { + "epoch": 14.956061960083408, + "grad_norm": 0.001283811405301094, + "learning_rate": 9.078144737033827e-06, + "loss": 0.0, + "num_input_tokens_seen": 58257624, + "step": 100415 + }, + { + "epoch": 14.956806672624367, + "grad_norm": 7.177744555519894e-05, + "learning_rate": 9.075639672451097e-06, + "loss": 0.0, + "num_input_tokens_seen": 58260568, + "step": 100420 + }, + { + "epoch": 14.957551385165326, + "grad_norm": 0.0019267069874331355, + "learning_rate": 9.073134876894241e-06, + "loss": 0.0, + "num_input_tokens_seen": 58263800, + "step": 100425 + }, + { + "epoch": 14.958296097706285, + "grad_norm": 0.00039857981028035283, + "learning_rate": 9.070630350405593e-06, + "loss": 0.0, + "num_input_tokens_seen": 58266584, + "step": 100430 + }, + { + "epoch": 14.959040810247245, + "grad_norm": 5.4315762099577114e-05, + "learning_rate": 9.068126093027447e-06, + "loss": 0.0, + "num_input_tokens_seen": 58269624, + "step": 100435 + }, + { + "epoch": 14.959785522788204, + "grad_norm": 0.002421935787424445, + "learning_rate": 9.065622104802126e-06, + "loss": 0.0, + "num_input_tokens_seen": 58272600, + "step": 100440 + }, + { + "epoch": 14.960530235329163, + "grad_norm": 0.007897501811385155, + "learning_rate": 9.063118385771924e-06, + "loss": 0.0, + "num_input_tokens_seen": 58275352, + "step": 100445 + }, + { + "epoch": 14.961274947870121, + "grad_norm": 0.0007085256511345506, + "learning_rate": 9.060614935979131e-06, + "loss": 0.0, + "num_input_tokens_seen": 58278136, + "step": 100450 + }, + { + "epoch": 14.962019660411082, + "grad_norm": 0.0006544901989400387, + "learning_rate": 9.058111755466059e-06, + "loss": 0.0, + "num_input_tokens_seen": 58281080, + "step": 100455 + }, + { + "epoch": 14.96276437295204, + "grad_norm": 0.010297637432813644, + "learning_rate": 9.055608844274985e-06, + "loss": 0.0, + "num_input_tokens_seen": 58284632, + "step": 100460 + }, + { + "epoch": 14.963509085493, + "grad_norm": 0.000750409031752497, + "learning_rate": 9.053106202448194e-06, + "loss": 0.2094, + "num_input_tokens_seen": 58287544, + "step": 100465 + }, + { + "epoch": 14.964253798033958, + "grad_norm": 0.004176296293735504, + "learning_rate": 9.050603830027959e-06, + "loss": 0.0, + "num_input_tokens_seen": 58290232, + "step": 100470 + }, + { + "epoch": 14.964998510574919, + "grad_norm": 0.0002052185300271958, + "learning_rate": 9.048101727056568e-06, + "loss": 0.0, + "num_input_tokens_seen": 58292984, + "step": 100475 + }, + { + "epoch": 14.965743223115878, + "grad_norm": 0.0004069541464559734, + "learning_rate": 9.045599893576287e-06, + "loss": 0.0, + "num_input_tokens_seen": 58295896, + "step": 100480 + }, + { + "epoch": 14.966487935656836, + "grad_norm": 0.0017335491720587015, + "learning_rate": 9.043098329629374e-06, + "loss": 0.0, + "num_input_tokens_seen": 58298680, + "step": 100485 + }, + { + "epoch": 14.967232648197795, + "grad_norm": 0.06505700200796127, + "learning_rate": 9.040597035258103e-06, + "loss": 0.0, + "num_input_tokens_seen": 58301592, + "step": 100490 + }, + { + "epoch": 14.967977360738756, + "grad_norm": 0.00053866405505687, + "learning_rate": 9.038096010504714e-06, + "loss": 0.0, + "num_input_tokens_seen": 58304696, + "step": 100495 + }, + { + "epoch": 14.968722073279714, + "grad_norm": 0.00011302617349429056, + "learning_rate": 9.035595255411482e-06, + "loss": 0.0006, + "num_input_tokens_seen": 58307736, + "step": 100500 + }, + { + "epoch": 14.969466785820673, + "grad_norm": 0.0008711257250979543, + "learning_rate": 9.033094770020634e-06, + "loss": 0.0, + "num_input_tokens_seen": 58310712, + "step": 100505 + }, + { + "epoch": 14.970211498361632, + "grad_norm": 0.00893809087574482, + "learning_rate": 9.03059455437443e-06, + "loss": 0.0, + "num_input_tokens_seen": 58313784, + "step": 100510 + }, + { + "epoch": 14.970956210902592, + "grad_norm": 0.0001618155074538663, + "learning_rate": 9.028094608515093e-06, + "loss": 0.0002, + "num_input_tokens_seen": 58316696, + "step": 100515 + }, + { + "epoch": 14.971700923443551, + "grad_norm": 0.000586890906561166, + "learning_rate": 9.02559493248487e-06, + "loss": 0.0, + "num_input_tokens_seen": 58319448, + "step": 100520 + }, + { + "epoch": 14.97244563598451, + "grad_norm": 0.0006426781765185297, + "learning_rate": 9.023095526325987e-06, + "loss": 0.0, + "num_input_tokens_seen": 58322392, + "step": 100525 + }, + { + "epoch": 14.973190348525469, + "grad_norm": 9.322798723587766e-05, + "learning_rate": 9.020596390080665e-06, + "loss": 0.0, + "num_input_tokens_seen": 58325400, + "step": 100530 + }, + { + "epoch": 14.973935061066427, + "grad_norm": 0.00048208588850684464, + "learning_rate": 9.018097523791127e-06, + "loss": 0.0, + "num_input_tokens_seen": 58328184, + "step": 100535 + }, + { + "epoch": 14.974679773607388, + "grad_norm": 0.00025226169964298606, + "learning_rate": 9.01559892749958e-06, + "loss": 0.0479, + "num_input_tokens_seen": 58331032, + "step": 100540 + }, + { + "epoch": 14.975424486148347, + "grad_norm": 0.028397230431437492, + "learning_rate": 9.013100601248254e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58333592, + "step": 100545 + }, + { + "epoch": 14.976169198689306, + "grad_norm": 0.00011229402298340574, + "learning_rate": 9.010602545079332e-06, + "loss": 0.0, + "num_input_tokens_seen": 58336184, + "step": 100550 + }, + { + "epoch": 14.976913911230266, + "grad_norm": 0.00031476945150643587, + "learning_rate": 9.00810475903504e-06, + "loss": 0.0, + "num_input_tokens_seen": 58339480, + "step": 100555 + }, + { + "epoch": 14.977658623771225, + "grad_norm": 0.0055160303600132465, + "learning_rate": 9.005607243157565e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58342232, + "step": 100560 + }, + { + "epoch": 14.978403336312184, + "grad_norm": 0.00019148118735756725, + "learning_rate": 9.003109997489092e-06, + "loss": 0.0, + "num_input_tokens_seen": 58344888, + "step": 100565 + }, + { + "epoch": 14.979148048853142, + "grad_norm": 0.0001351194514427334, + "learning_rate": 9.000613022071824e-06, + "loss": 0.0, + "num_input_tokens_seen": 58347896, + "step": 100570 + }, + { + "epoch": 14.979892761394101, + "grad_norm": 0.0002594592806417495, + "learning_rate": 8.99811631694793e-06, + "loss": 0.0, + "num_input_tokens_seen": 58350616, + "step": 100575 + }, + { + "epoch": 14.980637473935062, + "grad_norm": 105.90843963623047, + "learning_rate": 8.995619882159606e-06, + "loss": 0.0761, + "num_input_tokens_seen": 58353464, + "step": 100580 + }, + { + "epoch": 14.98138218647602, + "grad_norm": 0.00014163609012030065, + "learning_rate": 8.993123717749016e-06, + "loss": 0.0, + "num_input_tokens_seen": 58356600, + "step": 100585 + }, + { + "epoch": 14.98212689901698, + "grad_norm": 0.00044003876973874867, + "learning_rate": 8.990627823758327e-06, + "loss": 0.0, + "num_input_tokens_seen": 58359448, + "step": 100590 + }, + { + "epoch": 14.982871611557938, + "grad_norm": 8.287949458463117e-05, + "learning_rate": 8.988132200229716e-06, + "loss": 0.0, + "num_input_tokens_seen": 58362328, + "step": 100595 + }, + { + "epoch": 14.983616324098898, + "grad_norm": 0.0005291813286021352, + "learning_rate": 8.985636847205336e-06, + "loss": 0.0, + "num_input_tokens_seen": 58365400, + "step": 100600 + }, + { + "epoch": 14.984361036639857, + "grad_norm": 0.5383652448654175, + "learning_rate": 8.983141764727348e-06, + "loss": 0.0002, + "num_input_tokens_seen": 58368376, + "step": 100605 + }, + { + "epoch": 14.985105749180816, + "grad_norm": 0.000369863846572116, + "learning_rate": 8.980646952837894e-06, + "loss": 0.042, + "num_input_tokens_seen": 58371160, + "step": 100610 + }, + { + "epoch": 14.985850461721775, + "grad_norm": 0.0013662767596542835, + "learning_rate": 8.978152411579133e-06, + "loss": 0.1707, + "num_input_tokens_seen": 58374008, + "step": 100615 + }, + { + "epoch": 14.986595174262735, + "grad_norm": 0.01222183182835579, + "learning_rate": 8.975658140993196e-06, + "loss": 0.0, + "num_input_tokens_seen": 58376952, + "step": 100620 + }, + { + "epoch": 14.987339886803694, + "grad_norm": 9.069361840374768e-05, + "learning_rate": 8.973164141122237e-06, + "loss": 0.0663, + "num_input_tokens_seen": 58379736, + "step": 100625 + }, + { + "epoch": 14.988084599344653, + "grad_norm": 0.0010779575677588582, + "learning_rate": 8.970670412008372e-06, + "loss": 0.1597, + "num_input_tokens_seen": 58382840, + "step": 100630 + }, + { + "epoch": 14.988829311885612, + "grad_norm": 0.001869696076028049, + "learning_rate": 8.96817695369375e-06, + "loss": 0.0, + "num_input_tokens_seen": 58385688, + "step": 100635 + }, + { + "epoch": 14.989574024426572, + "grad_norm": 0.0072113084606826305, + "learning_rate": 8.965683766220481e-06, + "loss": 0.0, + "num_input_tokens_seen": 58388600, + "step": 100640 + }, + { + "epoch": 14.99031873696753, + "grad_norm": 0.023045266047120094, + "learning_rate": 8.963190849630682e-06, + "loss": 0.0, + "num_input_tokens_seen": 58391384, + "step": 100645 + }, + { + "epoch": 14.99106344950849, + "grad_norm": 1.8722788809100166e-05, + "learning_rate": 8.96069820396648e-06, + "loss": 0.0004, + "num_input_tokens_seen": 58394264, + "step": 100650 + }, + { + "epoch": 14.991808162049448, + "grad_norm": 0.0007280904683284461, + "learning_rate": 8.958205829269984e-06, + "loss": 0.0, + "num_input_tokens_seen": 58397240, + "step": 100655 + }, + { + "epoch": 14.992552874590409, + "grad_norm": 0.0007898163166828454, + "learning_rate": 8.955713725583295e-06, + "loss": 0.0, + "num_input_tokens_seen": 58400024, + "step": 100660 + }, + { + "epoch": 14.993297587131368, + "grad_norm": 0.0018876013346016407, + "learning_rate": 8.953221892948508e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58403160, + "step": 100665 + }, + { + "epoch": 14.994042299672326, + "grad_norm": 8.544101001461968e-05, + "learning_rate": 8.950730331407733e-06, + "loss": 0.0, + "num_input_tokens_seen": 58406104, + "step": 100670 + }, + { + "epoch": 14.994787012213285, + "grad_norm": 0.00018570410611573607, + "learning_rate": 8.94823904100305e-06, + "loss": 0.0, + "num_input_tokens_seen": 58408920, + "step": 100675 + }, + { + "epoch": 14.995531724754246, + "grad_norm": 0.043777089565992355, + "learning_rate": 8.945748021776564e-06, + "loss": 0.2594, + "num_input_tokens_seen": 58411896, + "step": 100680 + }, + { + "epoch": 14.996276437295204, + "grad_norm": 0.0017226814525201917, + "learning_rate": 8.943257273770351e-06, + "loss": 0.0, + "num_input_tokens_seen": 58414744, + "step": 100685 + }, + { + "epoch": 14.997021149836163, + "grad_norm": 0.009147406555712223, + "learning_rate": 8.940766797026476e-06, + "loss": 0.0, + "num_input_tokens_seen": 58417656, + "step": 100690 + }, + { + "epoch": 14.997765862377122, + "grad_norm": 0.00032264881883747876, + "learning_rate": 8.938276591587031e-06, + "loss": 0.0, + "num_input_tokens_seen": 58420632, + "step": 100695 + }, + { + "epoch": 14.998510574918082, + "grad_norm": 0.00960069615393877, + "learning_rate": 8.935786657494072e-06, + "loss": 0.2277, + "num_input_tokens_seen": 58423672, + "step": 100700 + }, + { + "epoch": 14.999255287459041, + "grad_norm": 0.00015415047528222203, + "learning_rate": 8.933296994789678e-06, + "loss": 0.0, + "num_input_tokens_seen": 58426904, + "step": 100705 + }, + { + "epoch": 15.0, + "grad_norm": 8.633655852463562e-06, + "learning_rate": 8.930807603515895e-06, + "loss": 0.0, + "num_input_tokens_seen": 58429624, + "step": 100710 + }, + { + "epoch": 15.0, + "eval_loss": 2.406440258026123, + "eval_runtime": 51.2438, + "eval_samples_per_second": 58.231, + "eval_steps_per_second": 14.558, + "num_input_tokens_seen": 58429624, + "step": 100710 + }, + { + "epoch": 15.000744712540959, + "grad_norm": 0.09814824163913727, + "learning_rate": 8.928318483714793e-06, + "loss": 0.0002, + "num_input_tokens_seen": 58433016, + "step": 100715 + }, + { + "epoch": 15.001489425081918, + "grad_norm": 0.03037194162607193, + "learning_rate": 8.925829635428414e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58435928, + "step": 100720 + }, + { + "epoch": 15.002234137622878, + "grad_norm": 3.516468495945446e-05, + "learning_rate": 8.92334105869881e-06, + "loss": 0.0, + "num_input_tokens_seen": 58438776, + "step": 100725 + }, + { + "epoch": 15.002978850163837, + "grad_norm": 0.003937537781894207, + "learning_rate": 8.920852753568015e-06, + "loss": 0.019, + "num_input_tokens_seen": 58441848, + "step": 100730 + }, + { + "epoch": 15.003723562704796, + "grad_norm": 0.006318967789411545, + "learning_rate": 8.918364720078063e-06, + "loss": 0.0, + "num_input_tokens_seen": 58445112, + "step": 100735 + }, + { + "epoch": 15.004468275245754, + "grad_norm": 0.0043523251079022884, + "learning_rate": 8.915876958271006e-06, + "loss": 0.0, + "num_input_tokens_seen": 58447832, + "step": 100740 + }, + { + "epoch": 15.005212987786715, + "grad_norm": 1.018108606338501, + "learning_rate": 8.913389468188849e-06, + "loss": 0.0056, + "num_input_tokens_seen": 58450776, + "step": 100745 + }, + { + "epoch": 15.005957700327674, + "grad_norm": 0.0006721038953401148, + "learning_rate": 8.910902249873637e-06, + "loss": 0.0, + "num_input_tokens_seen": 58453752, + "step": 100750 + }, + { + "epoch": 15.006702412868632, + "grad_norm": 0.00030951015651226044, + "learning_rate": 8.908415303367371e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58456696, + "step": 100755 + }, + { + "epoch": 15.007447125409591, + "grad_norm": 0.014398387633264065, + "learning_rate": 8.905928628712083e-06, + "loss": 0.0, + "num_input_tokens_seen": 58459416, + "step": 100760 + }, + { + "epoch": 15.008191837950552, + "grad_norm": 0.00010763003956526518, + "learning_rate": 8.90344222594977e-06, + "loss": 0.0, + "num_input_tokens_seen": 58462264, + "step": 100765 + }, + { + "epoch": 15.00893655049151, + "grad_norm": 0.0017026725690811872, + "learning_rate": 8.900956095122435e-06, + "loss": 0.0002, + "num_input_tokens_seen": 58465176, + "step": 100770 + }, + { + "epoch": 15.00968126303247, + "grad_norm": 2.0287530787754804e-05, + "learning_rate": 8.898470236272091e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58468088, + "step": 100775 + }, + { + "epoch": 15.010425975573428, + "grad_norm": 0.0007250700145959854, + "learning_rate": 8.895984649440722e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58471064, + "step": 100780 + }, + { + "epoch": 15.011170688114388, + "grad_norm": 2.5795900000957772e-05, + "learning_rate": 8.89349933467033e-06, + "loss": 0.0, + "num_input_tokens_seen": 58474104, + "step": 100785 + }, + { + "epoch": 15.011915400655347, + "grad_norm": 0.00041395481093786657, + "learning_rate": 8.8910142920029e-06, + "loss": 0.0026, + "num_input_tokens_seen": 58476952, + "step": 100790 + }, + { + "epoch": 15.012660113196306, + "grad_norm": 0.003877457231283188, + "learning_rate": 8.88852952148041e-06, + "loss": 0.0, + "num_input_tokens_seen": 58479928, + "step": 100795 + }, + { + "epoch": 15.013404825737265, + "grad_norm": 0.11385221034288406, + "learning_rate": 8.886045023144829e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58482968, + "step": 100800 + }, + { + "epoch": 15.014149538278225, + "grad_norm": 3.886696504196152e-05, + "learning_rate": 8.883560797038152e-06, + "loss": 0.0, + "num_input_tokens_seen": 58485816, + "step": 100805 + }, + { + "epoch": 15.014894250819184, + "grad_norm": 0.05043522268533707, + "learning_rate": 8.881076843202332e-06, + "loss": 0.0, + "num_input_tokens_seen": 58488888, + "step": 100810 + }, + { + "epoch": 15.015638963360143, + "grad_norm": 0.00047533452743664384, + "learning_rate": 8.878593161679327e-06, + "loss": 0.0, + "num_input_tokens_seen": 58492056, + "step": 100815 + }, + { + "epoch": 15.016383675901102, + "grad_norm": 0.004448818974196911, + "learning_rate": 8.876109752511117e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58495128, + "step": 100820 + }, + { + "epoch": 15.017128388442062, + "grad_norm": 0.0010542416712269187, + "learning_rate": 8.873626615739632e-06, + "loss": 0.0, + "num_input_tokens_seen": 58497592, + "step": 100825 + }, + { + "epoch": 15.01787310098302, + "grad_norm": 0.00047828719834797084, + "learning_rate": 8.871143751406849e-06, + "loss": 0.0, + "num_input_tokens_seen": 58500408, + "step": 100830 + }, + { + "epoch": 15.01861781352398, + "grad_norm": 0.00126832805108279, + "learning_rate": 8.868661159554689e-06, + "loss": 0.0, + "num_input_tokens_seen": 58503768, + "step": 100835 + }, + { + "epoch": 15.019362526064938, + "grad_norm": 0.002742709359154105, + "learning_rate": 8.866178840225111e-06, + "loss": 0.0, + "num_input_tokens_seen": 58506552, + "step": 100840 + }, + { + "epoch": 15.020107238605899, + "grad_norm": 0.00038505264092236757, + "learning_rate": 8.863696793460047e-06, + "loss": 0.0, + "num_input_tokens_seen": 58509432, + "step": 100845 + }, + { + "epoch": 15.020851951146858, + "grad_norm": 6.155788287287578e-05, + "learning_rate": 8.861215019301414e-06, + "loss": 0.0, + "num_input_tokens_seen": 58512440, + "step": 100850 + }, + { + "epoch": 15.021596663687816, + "grad_norm": 0.00030555433477275074, + "learning_rate": 8.85873351779116e-06, + "loss": 0.0, + "num_input_tokens_seen": 58515064, + "step": 100855 + }, + { + "epoch": 15.022341376228775, + "grad_norm": 7.578512304462492e-05, + "learning_rate": 8.856252288971198e-06, + "loss": 0.0, + "num_input_tokens_seen": 58518040, + "step": 100860 + }, + { + "epoch": 15.023086088769736, + "grad_norm": 0.0004962018574588001, + "learning_rate": 8.853771332883446e-06, + "loss": 0.0, + "num_input_tokens_seen": 58520856, + "step": 100865 + }, + { + "epoch": 15.023830801310694, + "grad_norm": 0.0001109606891986914, + "learning_rate": 8.851290649569808e-06, + "loss": 0.0, + "num_input_tokens_seen": 58523896, + "step": 100870 + }, + { + "epoch": 15.024575513851653, + "grad_norm": 0.00011660002928692847, + "learning_rate": 8.848810239072208e-06, + "loss": 0.0, + "num_input_tokens_seen": 58526968, + "step": 100875 + }, + { + "epoch": 15.025320226392612, + "grad_norm": 0.00041246655746363103, + "learning_rate": 8.84633010143254e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58529912, + "step": 100880 + }, + { + "epoch": 15.02606493893357, + "grad_norm": 0.0020484731066972017, + "learning_rate": 8.84385023669271e-06, + "loss": 0.0, + "num_input_tokens_seen": 58532856, + "step": 100885 + }, + { + "epoch": 15.026809651474531, + "grad_norm": 0.02064484730362892, + "learning_rate": 8.841370644894614e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58536184, + "step": 100890 + }, + { + "epoch": 15.02755436401549, + "grad_norm": 0.003868293482810259, + "learning_rate": 8.838891326080129e-06, + "loss": 0.0, + "num_input_tokens_seen": 58539256, + "step": 100895 + }, + { + "epoch": 15.028299076556449, + "grad_norm": 0.001962959999218583, + "learning_rate": 8.83641228029116e-06, + "loss": 0.0, + "num_input_tokens_seen": 58542200, + "step": 100900 + }, + { + "epoch": 15.029043789097408, + "grad_norm": 9.533129923511297e-05, + "learning_rate": 8.833933507569564e-06, + "loss": 0.0, + "num_input_tokens_seen": 58545272, + "step": 100905 + }, + { + "epoch": 15.029788501638368, + "grad_norm": 0.0015188490506261587, + "learning_rate": 8.831455007957243e-06, + "loss": 0.0, + "num_input_tokens_seen": 58548120, + "step": 100910 + }, + { + "epoch": 15.030533214179327, + "grad_norm": 0.006975826341658831, + "learning_rate": 8.828976781496057e-06, + "loss": 0.0, + "num_input_tokens_seen": 58551256, + "step": 100915 + }, + { + "epoch": 15.031277926720286, + "grad_norm": 0.00040748200262896717, + "learning_rate": 8.826498828227861e-06, + "loss": 0.0, + "num_input_tokens_seen": 58554104, + "step": 100920 + }, + { + "epoch": 15.032022639261244, + "grad_norm": 0.0012352833291515708, + "learning_rate": 8.824021148194541e-06, + "loss": 0.0, + "num_input_tokens_seen": 58557176, + "step": 100925 + }, + { + "epoch": 15.032767351802205, + "grad_norm": 0.0014104091096669436, + "learning_rate": 8.82154374143794e-06, + "loss": 0.0032, + "num_input_tokens_seen": 58560088, + "step": 100930 + }, + { + "epoch": 15.033512064343164, + "grad_norm": 0.000592997414059937, + "learning_rate": 8.819066607999918e-06, + "loss": 0.0, + "num_input_tokens_seen": 58562680, + "step": 100935 + }, + { + "epoch": 15.034256776884122, + "grad_norm": 0.001972147496417165, + "learning_rate": 8.816589747922311e-06, + "loss": 0.0, + "num_input_tokens_seen": 58565496, + "step": 100940 + }, + { + "epoch": 15.035001489425081, + "grad_norm": 0.0048355222679674625, + "learning_rate": 8.814113161246979e-06, + "loss": 0.0, + "num_input_tokens_seen": 58568472, + "step": 100945 + }, + { + "epoch": 15.035746201966042, + "grad_norm": 0.0005991957732476294, + "learning_rate": 8.811636848015747e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58571544, + "step": 100950 + }, + { + "epoch": 15.036490914507, + "grad_norm": 0.00032959875534288585, + "learning_rate": 8.809160808270464e-06, + "loss": 0.0, + "num_input_tokens_seen": 58574488, + "step": 100955 + }, + { + "epoch": 15.03723562704796, + "grad_norm": 0.000915016105864197, + "learning_rate": 8.806685042052949e-06, + "loss": 0.0, + "num_input_tokens_seen": 58577272, + "step": 100960 + }, + { + "epoch": 15.037980339588918, + "grad_norm": 0.00020111235789954662, + "learning_rate": 8.804209549405037e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58580312, + "step": 100965 + }, + { + "epoch": 15.038725052129879, + "grad_norm": 1.4621296941186301e-05, + "learning_rate": 8.801734330368544e-06, + "loss": 0.0, + "num_input_tokens_seen": 58583256, + "step": 100970 + }, + { + "epoch": 15.039469764670837, + "grad_norm": 0.011645573191344738, + "learning_rate": 8.79925938498528e-06, + "loss": 0.0, + "num_input_tokens_seen": 58586200, + "step": 100975 + }, + { + "epoch": 15.040214477211796, + "grad_norm": 0.00039158613071776927, + "learning_rate": 8.796784713297072e-06, + "loss": 0.0, + "num_input_tokens_seen": 58589176, + "step": 100980 + }, + { + "epoch": 15.040959189752755, + "grad_norm": 0.00013228518946561962, + "learning_rate": 8.794310315345713e-06, + "loss": 0.0, + "num_input_tokens_seen": 58592184, + "step": 100985 + }, + { + "epoch": 15.041703902293715, + "grad_norm": 0.00044150007306598127, + "learning_rate": 8.791836191173017e-06, + "loss": 0.0, + "num_input_tokens_seen": 58595448, + "step": 100990 + }, + { + "epoch": 15.042448614834674, + "grad_norm": 0.011065232567489147, + "learning_rate": 8.78936234082076e-06, + "loss": 0.0, + "num_input_tokens_seen": 58598392, + "step": 100995 + }, + { + "epoch": 15.043193327375633, + "grad_norm": 7.280574936885387e-05, + "learning_rate": 8.786888764330767e-06, + "loss": 0.0, + "num_input_tokens_seen": 58601240, + "step": 101000 + }, + { + "epoch": 15.043938039916592, + "grad_norm": 0.0008799454662948847, + "learning_rate": 8.784415461744805e-06, + "loss": 0.0, + "num_input_tokens_seen": 58604152, + "step": 101005 + }, + { + "epoch": 15.044682752457552, + "grad_norm": 0.0001138677544076927, + "learning_rate": 8.781942433104654e-06, + "loss": 0.0, + "num_input_tokens_seen": 58607032, + "step": 101010 + }, + { + "epoch": 15.045427464998511, + "grad_norm": 0.0003326456353534013, + "learning_rate": 8.779469678452113e-06, + "loss": 0.0, + "num_input_tokens_seen": 58609848, + "step": 101015 + }, + { + "epoch": 15.04617217753947, + "grad_norm": 0.0012994700809940696, + "learning_rate": 8.776997197828937e-06, + "loss": 0.0, + "num_input_tokens_seen": 58612568, + "step": 101020 + }, + { + "epoch": 15.046916890080428, + "grad_norm": 0.0008605131879448891, + "learning_rate": 8.774524991276911e-06, + "loss": 0.0, + "num_input_tokens_seen": 58615320, + "step": 101025 + }, + { + "epoch": 15.047661602621389, + "grad_norm": 6.971041148062795e-05, + "learning_rate": 8.77205305883779e-06, + "loss": 0.1564, + "num_input_tokens_seen": 58618296, + "step": 101030 + }, + { + "epoch": 15.048406315162348, + "grad_norm": 0.0006806362071074545, + "learning_rate": 8.769581400553346e-06, + "loss": 0.0, + "num_input_tokens_seen": 58621240, + "step": 101035 + }, + { + "epoch": 15.049151027703306, + "grad_norm": 0.00037004079786129296, + "learning_rate": 8.767110016465318e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58623928, + "step": 101040 + }, + { + "epoch": 15.049895740244265, + "grad_norm": 2.1151277906028554e-05, + "learning_rate": 8.76463890661548e-06, + "loss": 0.0, + "num_input_tokens_seen": 58626904, + "step": 101045 + }, + { + "epoch": 15.050640452785226, + "grad_norm": 3.560313780326396e-05, + "learning_rate": 8.762168071045566e-06, + "loss": 0.0, + "num_input_tokens_seen": 58629624, + "step": 101050 + }, + { + "epoch": 15.051385165326185, + "grad_norm": 0.0019009218085557222, + "learning_rate": 8.759697509797315e-06, + "loss": 0.0, + "num_input_tokens_seen": 58632408, + "step": 101055 + }, + { + "epoch": 15.052129877867143, + "grad_norm": 0.0006653577438555658, + "learning_rate": 8.757227222912473e-06, + "loss": 0.0, + "num_input_tokens_seen": 58635608, + "step": 101060 + }, + { + "epoch": 15.052874590408102, + "grad_norm": 0.0007328352076001465, + "learning_rate": 8.754757210432758e-06, + "loss": 0.0, + "num_input_tokens_seen": 58638616, + "step": 101065 + }, + { + "epoch": 15.05361930294906, + "grad_norm": 0.0037223089020699263, + "learning_rate": 8.752287472399918e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58641432, + "step": 101070 + }, + { + "epoch": 15.054364015490021, + "grad_norm": 0.0009093329426832497, + "learning_rate": 8.74981800885566e-06, + "loss": 0.0, + "num_input_tokens_seen": 58644536, + "step": 101075 + }, + { + "epoch": 15.05510872803098, + "grad_norm": 2.6235422410536557e-05, + "learning_rate": 8.747348819841719e-06, + "loss": 0.0, + "num_input_tokens_seen": 58647896, + "step": 101080 + }, + { + "epoch": 15.055853440571939, + "grad_norm": 4.442947465577163e-05, + "learning_rate": 8.7448799053998e-06, + "loss": 0.0, + "num_input_tokens_seen": 58650808, + "step": 101085 + }, + { + "epoch": 15.056598153112898, + "grad_norm": 0.013218647800385952, + "learning_rate": 8.742411265571607e-06, + "loss": 0.0, + "num_input_tokens_seen": 58653432, + "step": 101090 + }, + { + "epoch": 15.057342865653858, + "grad_norm": 0.010285397991538048, + "learning_rate": 8.73994290039886e-06, + "loss": 0.0065, + "num_input_tokens_seen": 58656216, + "step": 101095 + }, + { + "epoch": 15.058087578194817, + "grad_norm": 0.0008151728543452919, + "learning_rate": 8.737474809923244e-06, + "loss": 0.0097, + "num_input_tokens_seen": 58659160, + "step": 101100 + }, + { + "epoch": 15.058832290735776, + "grad_norm": 0.010439387522637844, + "learning_rate": 8.73500699418647e-06, + "loss": 0.0148, + "num_input_tokens_seen": 58662136, + "step": 101105 + }, + { + "epoch": 15.059577003276734, + "grad_norm": 0.0019580251537263393, + "learning_rate": 8.732539453230215e-06, + "loss": 0.0, + "num_input_tokens_seen": 58664984, + "step": 101110 + }, + { + "epoch": 15.060321715817695, + "grad_norm": 0.00014475089847110212, + "learning_rate": 8.730072187096178e-06, + "loss": 0.001, + "num_input_tokens_seen": 58667800, + "step": 101115 + }, + { + "epoch": 15.061066428358654, + "grad_norm": 5.617334863927681e-06, + "learning_rate": 8.727605195826038e-06, + "loss": 0.0, + "num_input_tokens_seen": 58670648, + "step": 101120 + }, + { + "epoch": 15.061811140899612, + "grad_norm": 0.027472294867038727, + "learning_rate": 8.72513847946147e-06, + "loss": 0.0914, + "num_input_tokens_seen": 58673368, + "step": 101125 + }, + { + "epoch": 15.062555853440571, + "grad_norm": 3.868788553518243e-05, + "learning_rate": 8.722672038044145e-06, + "loss": 0.0285, + "num_input_tokens_seen": 58676472, + "step": 101130 + }, + { + "epoch": 15.063300565981532, + "grad_norm": 0.0004393844283185899, + "learning_rate": 8.720205871615722e-06, + "loss": 0.0, + "num_input_tokens_seen": 58679288, + "step": 101135 + }, + { + "epoch": 15.06404527852249, + "grad_norm": 9.529280941933393e-05, + "learning_rate": 8.717739980217887e-06, + "loss": 0.0, + "num_input_tokens_seen": 58682168, + "step": 101140 + }, + { + "epoch": 15.06478999106345, + "grad_norm": 1.195476852444699e-05, + "learning_rate": 8.715274363892276e-06, + "loss": 0.0, + "num_input_tokens_seen": 58685368, + "step": 101145 + }, + { + "epoch": 15.065534703604408, + "grad_norm": 0.00018842217104975134, + "learning_rate": 8.712809022680563e-06, + "loss": 0.0, + "num_input_tokens_seen": 58688280, + "step": 101150 + }, + { + "epoch": 15.066279416145369, + "grad_norm": 4.1433519072597846e-05, + "learning_rate": 8.710343956624379e-06, + "loss": 0.0, + "num_input_tokens_seen": 58691256, + "step": 101155 + }, + { + "epoch": 15.067024128686327, + "grad_norm": 2.0680265151895583e-05, + "learning_rate": 8.707879165765384e-06, + "loss": 0.0, + "num_input_tokens_seen": 58694520, + "step": 101160 + }, + { + "epoch": 15.067768841227286, + "grad_norm": 174.76242065429688, + "learning_rate": 8.705414650145215e-06, + "loss": 0.1625, + "num_input_tokens_seen": 58697336, + "step": 101165 + }, + { + "epoch": 15.068513553768245, + "grad_norm": 0.00021367287263274193, + "learning_rate": 8.702950409805493e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58700280, + "step": 101170 + }, + { + "epoch": 15.069258266309205, + "grad_norm": 0.00849529355764389, + "learning_rate": 8.700486444787872e-06, + "loss": 0.0, + "num_input_tokens_seen": 58703064, + "step": 101175 + }, + { + "epoch": 15.070002978850164, + "grad_norm": 0.00011540015111677349, + "learning_rate": 8.698022755133957e-06, + "loss": 0.0, + "num_input_tokens_seen": 58705880, + "step": 101180 + }, + { + "epoch": 15.070747691391123, + "grad_norm": 0.0003677303029689938, + "learning_rate": 8.695559340885387e-06, + "loss": 0.0, + "num_input_tokens_seen": 58709208, + "step": 101185 + }, + { + "epoch": 15.071492403932082, + "grad_norm": 0.0010694770608097315, + "learning_rate": 8.693096202083773e-06, + "loss": 0.0, + "num_input_tokens_seen": 58711896, + "step": 101190 + }, + { + "epoch": 15.072237116473042, + "grad_norm": 0.00025759756681509316, + "learning_rate": 8.69063333877072e-06, + "loss": 0.0, + "num_input_tokens_seen": 58715352, + "step": 101195 + }, + { + "epoch": 15.072981829014001, + "grad_norm": 0.0006143738864921033, + "learning_rate": 8.688170750987836e-06, + "loss": 0.0, + "num_input_tokens_seen": 58717784, + "step": 101200 + }, + { + "epoch": 15.07372654155496, + "grad_norm": 2.1639649276039563e-05, + "learning_rate": 8.685708438776739e-06, + "loss": 0.0, + "num_input_tokens_seen": 58720536, + "step": 101205 + }, + { + "epoch": 15.074471254095918, + "grad_norm": 9.171097190119326e-06, + "learning_rate": 8.683246402179013e-06, + "loss": 0.0, + "num_input_tokens_seen": 58723544, + "step": 101210 + }, + { + "epoch": 15.075215966636879, + "grad_norm": 3.944199124816805e-05, + "learning_rate": 8.680784641236248e-06, + "loss": 0.0, + "num_input_tokens_seen": 58726648, + "step": 101215 + }, + { + "epoch": 15.075960679177838, + "grad_norm": 2.5700875994516537e-05, + "learning_rate": 8.678323155990047e-06, + "loss": 0.0, + "num_input_tokens_seen": 58729848, + "step": 101220 + }, + { + "epoch": 15.076705391718797, + "grad_norm": 4.927232657792047e-05, + "learning_rate": 8.67586194648198e-06, + "loss": 0.0, + "num_input_tokens_seen": 58732696, + "step": 101225 + }, + { + "epoch": 15.077450104259755, + "grad_norm": 0.00016191699251066893, + "learning_rate": 8.673401012753646e-06, + "loss": 0.0002, + "num_input_tokens_seen": 58735640, + "step": 101230 + }, + { + "epoch": 15.078194816800714, + "grad_norm": 0.0001264304737560451, + "learning_rate": 8.670940354846596e-06, + "loss": 0.0002, + "num_input_tokens_seen": 58738456, + "step": 101235 + }, + { + "epoch": 15.078939529341675, + "grad_norm": 0.04077815264463425, + "learning_rate": 8.668479972802423e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58741496, + "step": 101240 + }, + { + "epoch": 15.079684241882633, + "grad_norm": 1.5162537238211371e-05, + "learning_rate": 8.666019866662683e-06, + "loss": 0.0, + "num_input_tokens_seen": 58744376, + "step": 101245 + }, + { + "epoch": 15.080428954423592, + "grad_norm": 0.00011359850759617984, + "learning_rate": 8.663560036468926e-06, + "loss": 0.0, + "num_input_tokens_seen": 58747224, + "step": 101250 + }, + { + "epoch": 15.08117366696455, + "grad_norm": 0.001096790423616767, + "learning_rate": 8.661100482262729e-06, + "loss": 0.0, + "num_input_tokens_seen": 58750136, + "step": 101255 + }, + { + "epoch": 15.081918379505511, + "grad_norm": 6.322528497548774e-05, + "learning_rate": 8.658641204085632e-06, + "loss": 0.0, + "num_input_tokens_seen": 58752824, + "step": 101260 + }, + { + "epoch": 15.08266309204647, + "grad_norm": 0.00018782161350827664, + "learning_rate": 8.656182201979181e-06, + "loss": 0.0, + "num_input_tokens_seen": 58755768, + "step": 101265 + }, + { + "epoch": 15.083407804587429, + "grad_norm": 8.882280235411599e-05, + "learning_rate": 8.653723475984916e-06, + "loss": 0.0, + "num_input_tokens_seen": 58758392, + "step": 101270 + }, + { + "epoch": 15.084152517128388, + "grad_norm": 213.9291229248047, + "learning_rate": 8.651265026144387e-06, + "loss": 0.0401, + "num_input_tokens_seen": 58760984, + "step": 101275 + }, + { + "epoch": 15.084897229669348, + "grad_norm": 0.0015403145225718617, + "learning_rate": 8.648806852499109e-06, + "loss": 0.0, + "num_input_tokens_seen": 58763832, + "step": 101280 + }, + { + "epoch": 15.085641942210307, + "grad_norm": 4.11558139603585e-05, + "learning_rate": 8.64634895509063e-06, + "loss": 0.0, + "num_input_tokens_seen": 58766648, + "step": 101285 + }, + { + "epoch": 15.086386654751266, + "grad_norm": 0.0004803554911632091, + "learning_rate": 8.643891333960464e-06, + "loss": 0.0, + "num_input_tokens_seen": 58769848, + "step": 101290 + }, + { + "epoch": 15.087131367292224, + "grad_norm": 0.0007927575497888029, + "learning_rate": 8.641433989150123e-06, + "loss": 0.0062, + "num_input_tokens_seen": 58772824, + "step": 101295 + }, + { + "epoch": 15.087876079833185, + "grad_norm": 0.0001855672016972676, + "learning_rate": 8.638976920701137e-06, + "loss": 0.0, + "num_input_tokens_seen": 58775384, + "step": 101300 + }, + { + "epoch": 15.088620792374144, + "grad_norm": 0.0032588178291916847, + "learning_rate": 8.636520128654995e-06, + "loss": 0.074, + "num_input_tokens_seen": 58778296, + "step": 101305 + }, + { + "epoch": 15.089365504915103, + "grad_norm": 0.0006115453434176743, + "learning_rate": 8.634063613053228e-06, + "loss": 0.0, + "num_input_tokens_seen": 58781208, + "step": 101310 + }, + { + "epoch": 15.090110217456061, + "grad_norm": 0.003279574913904071, + "learning_rate": 8.631607373937319e-06, + "loss": 0.0, + "num_input_tokens_seen": 58783800, + "step": 101315 + }, + { + "epoch": 15.090854929997022, + "grad_norm": 0.0005485009169206023, + "learning_rate": 8.62915141134877e-06, + "loss": 0.0433, + "num_input_tokens_seen": 58787128, + "step": 101320 + }, + { + "epoch": 15.09159964253798, + "grad_norm": 0.0011943596182391047, + "learning_rate": 8.626695725329059e-06, + "loss": 0.0, + "num_input_tokens_seen": 58790264, + "step": 101325 + }, + { + "epoch": 15.09234435507894, + "grad_norm": 0.00014482771803159267, + "learning_rate": 8.624240315919693e-06, + "loss": 0.0003, + "num_input_tokens_seen": 58793464, + "step": 101330 + }, + { + "epoch": 15.093089067619898, + "grad_norm": 0.0005705205840058625, + "learning_rate": 8.62178518316214e-06, + "loss": 0.0, + "num_input_tokens_seen": 58796632, + "step": 101335 + }, + { + "epoch": 15.093833780160859, + "grad_norm": 0.0029183446895331144, + "learning_rate": 8.619330327097874e-06, + "loss": 0.0, + "num_input_tokens_seen": 58799608, + "step": 101340 + }, + { + "epoch": 15.094578492701817, + "grad_norm": 0.004178543109446764, + "learning_rate": 8.616875747768382e-06, + "loss": 0.0, + "num_input_tokens_seen": 58802584, + "step": 101345 + }, + { + "epoch": 15.095323205242776, + "grad_norm": 0.00010162080434383824, + "learning_rate": 8.614421445215116e-06, + "loss": 0.0, + "num_input_tokens_seen": 58805336, + "step": 101350 + }, + { + "epoch": 15.096067917783735, + "grad_norm": 0.00028034375282004476, + "learning_rate": 8.611967419479553e-06, + "loss": 0.0, + "num_input_tokens_seen": 58808408, + "step": 101355 + }, + { + "epoch": 15.096812630324695, + "grad_norm": 0.0004359686281532049, + "learning_rate": 8.609513670603137e-06, + "loss": 0.0, + "num_input_tokens_seen": 58811096, + "step": 101360 + }, + { + "epoch": 15.097557342865654, + "grad_norm": 0.0005441037937998772, + "learning_rate": 8.607060198627337e-06, + "loss": 0.0674, + "num_input_tokens_seen": 58813784, + "step": 101365 + }, + { + "epoch": 15.098302055406613, + "grad_norm": 0.0055612302385270596, + "learning_rate": 8.604607003593593e-06, + "loss": 0.0, + "num_input_tokens_seen": 58816664, + "step": 101370 + }, + { + "epoch": 15.099046767947572, + "grad_norm": 0.0033586646895855665, + "learning_rate": 8.602154085543341e-06, + "loss": 0.0, + "num_input_tokens_seen": 58819448, + "step": 101375 + }, + { + "epoch": 15.099791480488532, + "grad_norm": 0.0009953540284186602, + "learning_rate": 8.59970144451804e-06, + "loss": 0.0, + "num_input_tokens_seen": 58822200, + "step": 101380 + }, + { + "epoch": 15.100536193029491, + "grad_norm": 0.0022711926139891148, + "learning_rate": 8.597249080559114e-06, + "loss": 0.0, + "num_input_tokens_seen": 58824824, + "step": 101385 + }, + { + "epoch": 15.10128090557045, + "grad_norm": 0.00013740238500759006, + "learning_rate": 8.594796993707993e-06, + "loss": 0.0, + "num_input_tokens_seen": 58827480, + "step": 101390 + }, + { + "epoch": 15.102025618111409, + "grad_norm": 5.3134423069423065e-05, + "learning_rate": 8.592345184006096e-06, + "loss": 0.0, + "num_input_tokens_seen": 58830200, + "step": 101395 + }, + { + "epoch": 15.102770330652369, + "grad_norm": 3.2246880437014624e-05, + "learning_rate": 8.58989365149486e-06, + "loss": 0.0, + "num_input_tokens_seen": 58833016, + "step": 101400 + }, + { + "epoch": 15.103515043193328, + "grad_norm": 0.0014540485572069883, + "learning_rate": 8.58744239621568e-06, + "loss": 0.0007, + "num_input_tokens_seen": 58835544, + "step": 101405 + }, + { + "epoch": 15.104259755734287, + "grad_norm": 0.00017357224714942276, + "learning_rate": 8.584991418209992e-06, + "loss": 0.0, + "num_input_tokens_seen": 58838328, + "step": 101410 + }, + { + "epoch": 15.105004468275245, + "grad_norm": 0.00024689591373316944, + "learning_rate": 8.582540717519191e-06, + "loss": 0.035, + "num_input_tokens_seen": 58841336, + "step": 101415 + }, + { + "epoch": 15.105749180816204, + "grad_norm": 0.006992571987211704, + "learning_rate": 8.580090294184667e-06, + "loss": 0.0, + "num_input_tokens_seen": 58844216, + "step": 101420 + }, + { + "epoch": 15.106493893357165, + "grad_norm": 0.00800401158630848, + "learning_rate": 8.57764014824784e-06, + "loss": 0.0, + "num_input_tokens_seen": 58847160, + "step": 101425 + }, + { + "epoch": 15.107238605898123, + "grad_norm": 7.0389173743024e-06, + "learning_rate": 8.575190279750085e-06, + "loss": 0.0, + "num_input_tokens_seen": 58850104, + "step": 101430 + }, + { + "epoch": 15.107983318439082, + "grad_norm": 0.025815630331635475, + "learning_rate": 8.5727406887328e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58852920, + "step": 101435 + }, + { + "epoch": 15.108728030980041, + "grad_norm": 27.856849670410156, + "learning_rate": 8.570291375237361e-06, + "loss": 0.1159, + "num_input_tokens_seen": 58856120, + "step": 101440 + }, + { + "epoch": 15.109472743521001, + "grad_norm": 12.891950607299805, + "learning_rate": 8.567842339305157e-06, + "loss": 0.0108, + "num_input_tokens_seen": 58858840, + "step": 101445 + }, + { + "epoch": 15.11021745606196, + "grad_norm": 0.0004081304941792041, + "learning_rate": 8.565393580977558e-06, + "loss": 0.0, + "num_input_tokens_seen": 58861688, + "step": 101450 + }, + { + "epoch": 15.110962168602919, + "grad_norm": 0.0007934956811368465, + "learning_rate": 8.562945100295927e-06, + "loss": 0.2313, + "num_input_tokens_seen": 58864696, + "step": 101455 + }, + { + "epoch": 15.111706881143878, + "grad_norm": 0.0017581225838512182, + "learning_rate": 8.560496897301637e-06, + "loss": 0.0, + "num_input_tokens_seen": 58867576, + "step": 101460 + }, + { + "epoch": 15.112451593684838, + "grad_norm": 0.0001955149054992944, + "learning_rate": 8.558048972036031e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58870488, + "step": 101465 + }, + { + "epoch": 15.113196306225797, + "grad_norm": 8.071996853686869e-05, + "learning_rate": 8.555601324540488e-06, + "loss": 0.0, + "num_input_tokens_seen": 58873496, + "step": 101470 + }, + { + "epoch": 15.113941018766756, + "grad_norm": 0.6538786292076111, + "learning_rate": 8.553153954856338e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58876600, + "step": 101475 + }, + { + "epoch": 15.114685731307715, + "grad_norm": 0.01304902695119381, + "learning_rate": 8.550706863024945e-06, + "loss": 0.0, + "num_input_tokens_seen": 58879320, + "step": 101480 + }, + { + "epoch": 15.115430443848675, + "grad_norm": 0.00033562968019396067, + "learning_rate": 8.548260049087634e-06, + "loss": 0.0, + "num_input_tokens_seen": 58882424, + "step": 101485 + }, + { + "epoch": 15.116175156389634, + "grad_norm": 0.0003268639848101884, + "learning_rate": 8.545813513085757e-06, + "loss": 0.001, + "num_input_tokens_seen": 58885400, + "step": 101490 + }, + { + "epoch": 15.116919868930593, + "grad_norm": 0.0002223948249593377, + "learning_rate": 8.543367255060636e-06, + "loss": 0.0, + "num_input_tokens_seen": 58888536, + "step": 101495 + }, + { + "epoch": 15.117664581471551, + "grad_norm": 0.000659664161503315, + "learning_rate": 8.54092127505359e-06, + "loss": 0.0, + "num_input_tokens_seen": 58891256, + "step": 101500 + }, + { + "epoch": 15.118409294012512, + "grad_norm": 0.00013656304508913308, + "learning_rate": 8.538475573105961e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58894232, + "step": 101505 + }, + { + "epoch": 15.11915400655347, + "grad_norm": 0.00013920056517235935, + "learning_rate": 8.536030149259046e-06, + "loss": 0.0, + "num_input_tokens_seen": 58897208, + "step": 101510 + }, + { + "epoch": 15.11989871909443, + "grad_norm": 0.0005240186583250761, + "learning_rate": 8.533585003554179e-06, + "loss": 0.0, + "num_input_tokens_seen": 58899864, + "step": 101515 + }, + { + "epoch": 15.120643431635388, + "grad_norm": 0.0013081383658573031, + "learning_rate": 8.53114013603266e-06, + "loss": 0.0008, + "num_input_tokens_seen": 58902936, + "step": 101520 + }, + { + "epoch": 15.121388144176349, + "grad_norm": 0.0016326501499861479, + "learning_rate": 8.528695546735784e-06, + "loss": 0.0, + "num_input_tokens_seen": 58905880, + "step": 101525 + }, + { + "epoch": 15.122132856717307, + "grad_norm": 0.0041178069077432156, + "learning_rate": 8.52625123570486e-06, + "loss": 0.0, + "num_input_tokens_seen": 58908792, + "step": 101530 + }, + { + "epoch": 15.122877569258266, + "grad_norm": 0.0002903977583628148, + "learning_rate": 8.523807202981168e-06, + "loss": 0.0007, + "num_input_tokens_seen": 58911576, + "step": 101535 + }, + { + "epoch": 15.123622281799225, + "grad_norm": 0.000952454109210521, + "learning_rate": 8.521363448606018e-06, + "loss": 0.0, + "num_input_tokens_seen": 58914520, + "step": 101540 + }, + { + "epoch": 15.124366994340185, + "grad_norm": 0.0027572817634791136, + "learning_rate": 8.518919972620675e-06, + "loss": 0.0, + "num_input_tokens_seen": 58917336, + "step": 101545 + }, + { + "epoch": 15.125111706881144, + "grad_norm": 0.00020894540648441762, + "learning_rate": 8.516476775066438e-06, + "loss": 0.0, + "num_input_tokens_seen": 58920024, + "step": 101550 + }, + { + "epoch": 15.125856419422103, + "grad_norm": 7.667398313060403e-05, + "learning_rate": 8.514033855984563e-06, + "loss": 0.0, + "num_input_tokens_seen": 58922872, + "step": 101555 + }, + { + "epoch": 15.126601131963062, + "grad_norm": 0.001924871583469212, + "learning_rate": 8.51159121541634e-06, + "loss": 0.0, + "num_input_tokens_seen": 58926072, + "step": 101560 + }, + { + "epoch": 15.127345844504022, + "grad_norm": 0.0035813492722809315, + "learning_rate": 8.509148853403015e-06, + "loss": 0.0003, + "num_input_tokens_seen": 58929144, + "step": 101565 + }, + { + "epoch": 15.128090557044981, + "grad_norm": 3.867673058266519e-06, + "learning_rate": 8.50670676998587e-06, + "loss": 0.0287, + "num_input_tokens_seen": 58931928, + "step": 101570 + }, + { + "epoch": 15.12883526958594, + "grad_norm": 0.007117266301065683, + "learning_rate": 8.504264965206148e-06, + "loss": 0.0478, + "num_input_tokens_seen": 58934712, + "step": 101575 + }, + { + "epoch": 15.129579982126899, + "grad_norm": 0.0022986335679888725, + "learning_rate": 8.5018234391051e-06, + "loss": 0.0, + "num_input_tokens_seen": 58937560, + "step": 101580 + }, + { + "epoch": 15.130324694667857, + "grad_norm": 0.005426167976111174, + "learning_rate": 8.499382191723981e-06, + "loss": 0.0, + "num_input_tokens_seen": 58940440, + "step": 101585 + }, + { + "epoch": 15.131069407208818, + "grad_norm": 0.0003504368360154331, + "learning_rate": 8.496941223104032e-06, + "loss": 0.0, + "num_input_tokens_seen": 58943448, + "step": 101590 + }, + { + "epoch": 15.131814119749777, + "grad_norm": 0.00017949701577890664, + "learning_rate": 8.494500533286487e-06, + "loss": 0.0, + "num_input_tokens_seen": 58946296, + "step": 101595 + }, + { + "epoch": 15.132558832290735, + "grad_norm": 0.00012210333079565316, + "learning_rate": 8.492060122312572e-06, + "loss": 0.0888, + "num_input_tokens_seen": 58949240, + "step": 101600 + }, + { + "epoch": 15.133303544831694, + "grad_norm": 0.00015899445861577988, + "learning_rate": 8.489619990223533e-06, + "loss": 0.0, + "num_input_tokens_seen": 58952152, + "step": 101605 + }, + { + "epoch": 15.134048257372655, + "grad_norm": 0.0002426728024147451, + "learning_rate": 8.487180137060582e-06, + "loss": 0.0, + "num_input_tokens_seen": 58955064, + "step": 101610 + }, + { + "epoch": 15.134792969913613, + "grad_norm": 0.00021323221153579652, + "learning_rate": 8.484740562864931e-06, + "loss": 0.0002, + "num_input_tokens_seen": 58958200, + "step": 101615 + }, + { + "epoch": 15.135537682454572, + "grad_norm": 0.0003341479750815779, + "learning_rate": 8.482301267677813e-06, + "loss": 0.0, + "num_input_tokens_seen": 58960888, + "step": 101620 + }, + { + "epoch": 15.136282394995531, + "grad_norm": 0.0004075470787938684, + "learning_rate": 8.47986225154042e-06, + "loss": 0.0, + "num_input_tokens_seen": 58963832, + "step": 101625 + }, + { + "epoch": 15.137027107536491, + "grad_norm": 0.00011297524179099128, + "learning_rate": 8.477423514493967e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58966456, + "step": 101630 + }, + { + "epoch": 15.13777182007745, + "grad_norm": 3.217930134269409e-05, + "learning_rate": 8.474985056579648e-06, + "loss": 0.0002, + "num_input_tokens_seen": 58969144, + "step": 101635 + }, + { + "epoch": 15.138516532618409, + "grad_norm": 0.00010601647227304056, + "learning_rate": 8.47254687783867e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58971864, + "step": 101640 + }, + { + "epoch": 15.139261245159368, + "grad_norm": 0.00016747535846661776, + "learning_rate": 8.470108978312211e-06, + "loss": 0.0, + "num_input_tokens_seen": 58974584, + "step": 101645 + }, + { + "epoch": 15.140005957700328, + "grad_norm": 4.088555942871608e-05, + "learning_rate": 8.46767135804146e-06, + "loss": 0.0, + "num_input_tokens_seen": 58977528, + "step": 101650 + }, + { + "epoch": 15.140750670241287, + "grad_norm": 4.4944761611986905e-05, + "learning_rate": 8.465234017067595e-06, + "loss": 0.0, + "num_input_tokens_seen": 58980216, + "step": 101655 + }, + { + "epoch": 15.141495382782246, + "grad_norm": 0.0001661857677390799, + "learning_rate": 8.462796955431801e-06, + "loss": 0.0, + "num_input_tokens_seen": 58983032, + "step": 101660 + }, + { + "epoch": 15.142240095323205, + "grad_norm": 0.0048677329905331135, + "learning_rate": 8.460360173175244e-06, + "loss": 0.0, + "num_input_tokens_seen": 58985752, + "step": 101665 + }, + { + "epoch": 15.142984807864165, + "grad_norm": 0.000647647597361356, + "learning_rate": 8.457923670339085e-06, + "loss": 0.0, + "num_input_tokens_seen": 58988440, + "step": 101670 + }, + { + "epoch": 15.143729520405124, + "grad_norm": 5.948594116489403e-05, + "learning_rate": 8.455487446964502e-06, + "loss": 0.0001, + "num_input_tokens_seen": 58991672, + "step": 101675 + }, + { + "epoch": 15.144474232946083, + "grad_norm": 0.0009438488050363958, + "learning_rate": 8.453051503092632e-06, + "loss": 0.001, + "num_input_tokens_seen": 58994360, + "step": 101680 + }, + { + "epoch": 15.145218945487041, + "grad_norm": 0.0019366074120625854, + "learning_rate": 8.450615838764653e-06, + "loss": 0.0, + "num_input_tokens_seen": 58997560, + "step": 101685 + }, + { + "epoch": 15.145963658028002, + "grad_norm": 0.1185988038778305, + "learning_rate": 8.448180454021695e-06, + "loss": 0.0, + "num_input_tokens_seen": 59000792, + "step": 101690 + }, + { + "epoch": 15.14670837056896, + "grad_norm": 0.00111281662248075, + "learning_rate": 8.445745348904898e-06, + "loss": 0.0004, + "num_input_tokens_seen": 59003768, + "step": 101695 + }, + { + "epoch": 15.14745308310992, + "grad_norm": 0.0006469152867794037, + "learning_rate": 8.443310523455416e-06, + "loss": 0.0, + "num_input_tokens_seen": 59007032, + "step": 101700 + }, + { + "epoch": 15.148197795650878, + "grad_norm": 0.0002251119294669479, + "learning_rate": 8.440875977714368e-06, + "loss": 0.0, + "num_input_tokens_seen": 59010040, + "step": 101705 + }, + { + "epoch": 15.148942508191839, + "grad_norm": 3.982405178248882e-05, + "learning_rate": 8.4384417117229e-06, + "loss": 0.0119, + "num_input_tokens_seen": 59013240, + "step": 101710 + }, + { + "epoch": 15.149687220732797, + "grad_norm": 0.0005613869871012866, + "learning_rate": 8.436007725522127e-06, + "loss": 0.0, + "num_input_tokens_seen": 59016408, + "step": 101715 + }, + { + "epoch": 15.150431933273756, + "grad_norm": 1.4140040548227262e-05, + "learning_rate": 8.433574019153167e-06, + "loss": 0.0, + "num_input_tokens_seen": 59019448, + "step": 101720 + }, + { + "epoch": 15.151176645814715, + "grad_norm": 0.0765194445848465, + "learning_rate": 8.43114059265713e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59022584, + "step": 101725 + }, + { + "epoch": 15.151921358355676, + "grad_norm": 0.00023596444225404412, + "learning_rate": 8.428707446075138e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59025304, + "step": 101730 + }, + { + "epoch": 15.152666070896634, + "grad_norm": 0.00020554717048071325, + "learning_rate": 8.426274579448293e-06, + "loss": 0.0, + "num_input_tokens_seen": 59027896, + "step": 101735 + }, + { + "epoch": 15.153410783437593, + "grad_norm": 3.5197706893086433e-05, + "learning_rate": 8.423841992817688e-06, + "loss": 0.0, + "num_input_tokens_seen": 59030584, + "step": 101740 + }, + { + "epoch": 15.154155495978552, + "grad_norm": 0.006120521109551191, + "learning_rate": 8.42140968622443e-06, + "loss": 0.0, + "num_input_tokens_seen": 59033496, + "step": 101745 + }, + { + "epoch": 15.15490020851951, + "grad_norm": 6.571048288606107e-05, + "learning_rate": 8.4189776597096e-06, + "loss": 0.0, + "num_input_tokens_seen": 59036344, + "step": 101750 + }, + { + "epoch": 15.155644921060471, + "grad_norm": 0.0012699172366410494, + "learning_rate": 8.416545913314296e-06, + "loss": 0.0, + "num_input_tokens_seen": 59039704, + "step": 101755 + }, + { + "epoch": 15.15638963360143, + "grad_norm": 0.0001029754348564893, + "learning_rate": 8.414114447079588e-06, + "loss": 0.0, + "num_input_tokens_seen": 59042840, + "step": 101760 + }, + { + "epoch": 15.157134346142389, + "grad_norm": 0.00023015614715404809, + "learning_rate": 8.411683261046569e-06, + "loss": 0.0, + "num_input_tokens_seen": 59045912, + "step": 101765 + }, + { + "epoch": 15.157879058683347, + "grad_norm": 1.6034244254115038e-05, + "learning_rate": 8.409252355256297e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59048696, + "step": 101770 + }, + { + "epoch": 15.158623771224308, + "grad_norm": 0.0021526480559259653, + "learning_rate": 8.40682172974984e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59051416, + "step": 101775 + }, + { + "epoch": 15.159368483765267, + "grad_norm": 0.00022694085782859474, + "learning_rate": 8.404391384568271e-06, + "loss": 0.0, + "num_input_tokens_seen": 59054392, + "step": 101780 + }, + { + "epoch": 15.160113196306225, + "grad_norm": 0.00015393701323773712, + "learning_rate": 8.401961319752646e-06, + "loss": 0.0, + "num_input_tokens_seen": 59057400, + "step": 101785 + }, + { + "epoch": 15.160857908847184, + "grad_norm": 0.00048406238784082234, + "learning_rate": 8.399531535344013e-06, + "loss": 0.0, + "num_input_tokens_seen": 59060600, + "step": 101790 + }, + { + "epoch": 15.161602621388145, + "grad_norm": 8.377800259040669e-05, + "learning_rate": 8.397102031383414e-06, + "loss": 0.0002, + "num_input_tokens_seen": 59063480, + "step": 101795 + }, + { + "epoch": 15.162347333929103, + "grad_norm": 0.0012215414317324758, + "learning_rate": 8.39467280791191e-06, + "loss": 0.0, + "num_input_tokens_seen": 59066264, + "step": 101800 + }, + { + "epoch": 15.163092046470062, + "grad_norm": 0.00012083936599083245, + "learning_rate": 8.392243864970525e-06, + "loss": 0.0, + "num_input_tokens_seen": 59069112, + "step": 101805 + }, + { + "epoch": 15.163836759011021, + "grad_norm": 6.753451452823356e-05, + "learning_rate": 8.389815202600306e-06, + "loss": 0.0, + "num_input_tokens_seen": 59071928, + "step": 101810 + }, + { + "epoch": 15.164581471551982, + "grad_norm": 0.0008481336408294737, + "learning_rate": 8.38738682084228e-06, + "loss": 0.0, + "num_input_tokens_seen": 59074648, + "step": 101815 + }, + { + "epoch": 15.16532618409294, + "grad_norm": 5.441026132757543e-06, + "learning_rate": 8.38495871973746e-06, + "loss": 0.0, + "num_input_tokens_seen": 59077304, + "step": 101820 + }, + { + "epoch": 15.166070896633899, + "grad_norm": 0.01741122081875801, + "learning_rate": 8.382530899326885e-06, + "loss": 0.0, + "num_input_tokens_seen": 59080024, + "step": 101825 + }, + { + "epoch": 15.166815609174858, + "grad_norm": 0.00029705261113122106, + "learning_rate": 8.380103359651553e-06, + "loss": 0.0, + "num_input_tokens_seen": 59082936, + "step": 101830 + }, + { + "epoch": 15.167560321715818, + "grad_norm": 0.0005631649983115494, + "learning_rate": 8.377676100752491e-06, + "loss": 0.0329, + "num_input_tokens_seen": 59085752, + "step": 101835 + }, + { + "epoch": 15.168305034256777, + "grad_norm": 0.0003903353645000607, + "learning_rate": 8.375249122670686e-06, + "loss": 0.0, + "num_input_tokens_seen": 59088760, + "step": 101840 + }, + { + "epoch": 15.169049746797736, + "grad_norm": 0.0005260799080133438, + "learning_rate": 8.372822425447164e-06, + "loss": 0.0, + "num_input_tokens_seen": 59091448, + "step": 101845 + }, + { + "epoch": 15.169794459338695, + "grad_norm": 2.264195791212842e-05, + "learning_rate": 8.370396009122902e-06, + "loss": 0.0, + "num_input_tokens_seen": 59094328, + "step": 101850 + }, + { + "epoch": 15.170539171879655, + "grad_norm": 4.505141987465322e-06, + "learning_rate": 8.3679698737389e-06, + "loss": 0.0, + "num_input_tokens_seen": 59097336, + "step": 101855 + }, + { + "epoch": 15.171283884420614, + "grad_norm": 0.0022610407322645187, + "learning_rate": 8.365544019336146e-06, + "loss": 0.0, + "num_input_tokens_seen": 59100312, + "step": 101860 + }, + { + "epoch": 15.172028596961573, + "grad_norm": 0.4407474994659424, + "learning_rate": 8.363118445955609e-06, + "loss": 0.0, + "num_input_tokens_seen": 59103192, + "step": 101865 + }, + { + "epoch": 15.172773309502531, + "grad_norm": 7.032616122160107e-05, + "learning_rate": 8.360693153638285e-06, + "loss": 0.0, + "num_input_tokens_seen": 59106264, + "step": 101870 + }, + { + "epoch": 15.173518022043492, + "grad_norm": 0.0003233922179788351, + "learning_rate": 8.35826814242513e-06, + "loss": 0.0, + "num_input_tokens_seen": 59109016, + "step": 101875 + }, + { + "epoch": 15.17426273458445, + "grad_norm": 0.003468971699476242, + "learning_rate": 8.355843412357131e-06, + "loss": 0.0, + "num_input_tokens_seen": 59112024, + "step": 101880 + }, + { + "epoch": 15.17500744712541, + "grad_norm": 2.4638378818053752e-05, + "learning_rate": 8.353418963475232e-06, + "loss": 0.0, + "num_input_tokens_seen": 59114936, + "step": 101885 + }, + { + "epoch": 15.175752159666368, + "grad_norm": 15.024121284484863, + "learning_rate": 8.350994795820407e-06, + "loss": 0.0014, + "num_input_tokens_seen": 59118040, + "step": 101890 + }, + { + "epoch": 15.176496872207329, + "grad_norm": 8.056149818003178e-05, + "learning_rate": 8.348570909433607e-06, + "loss": 0.0004, + "num_input_tokens_seen": 59121112, + "step": 101895 + }, + { + "epoch": 15.177241584748288, + "grad_norm": 0.0001266981562366709, + "learning_rate": 8.346147304355767e-06, + "loss": 0.0006, + "num_input_tokens_seen": 59123960, + "step": 101900 + }, + { + "epoch": 15.177986297289246, + "grad_norm": 9.412536019226536e-05, + "learning_rate": 8.343723980627848e-06, + "loss": 0.0, + "num_input_tokens_seen": 59127160, + "step": 101905 + }, + { + "epoch": 15.178731009830205, + "grad_norm": 0.00022473145509138703, + "learning_rate": 8.34130093829078e-06, + "loss": 0.0, + "num_input_tokens_seen": 59130200, + "step": 101910 + }, + { + "epoch": 15.179475722371166, + "grad_norm": 1.8994300262420438e-05, + "learning_rate": 8.338878177385508e-06, + "loss": 0.0, + "num_input_tokens_seen": 59133592, + "step": 101915 + }, + { + "epoch": 15.180220434912124, + "grad_norm": 8.224960327148438, + "learning_rate": 8.336455697952956e-06, + "loss": 0.0006, + "num_input_tokens_seen": 59136536, + "step": 101920 + }, + { + "epoch": 15.180965147453083, + "grad_norm": 5.80528867430985e-05, + "learning_rate": 8.33403350003405e-06, + "loss": 0.0, + "num_input_tokens_seen": 59139256, + "step": 101925 + }, + { + "epoch": 15.181709859994042, + "grad_norm": 0.0003054523258469999, + "learning_rate": 8.3316115836697e-06, + "loss": 0.0, + "num_input_tokens_seen": 59142328, + "step": 101930 + }, + { + "epoch": 15.182454572535, + "grad_norm": 0.0001170173563878052, + "learning_rate": 8.32918994890084e-06, + "loss": 0.0, + "num_input_tokens_seen": 59145656, + "step": 101935 + }, + { + "epoch": 15.183199285075961, + "grad_norm": 0.012146794237196445, + "learning_rate": 8.32676859576837e-06, + "loss": 0.0, + "num_input_tokens_seen": 59148600, + "step": 101940 + }, + { + "epoch": 15.18394399761692, + "grad_norm": 0.006938063073903322, + "learning_rate": 8.324347524313192e-06, + "loss": 0.0, + "num_input_tokens_seen": 59151352, + "step": 101945 + }, + { + "epoch": 15.184688710157879, + "grad_norm": 0.00024270554422400892, + "learning_rate": 8.321926734576223e-06, + "loss": 0.0, + "num_input_tokens_seen": 59154072, + "step": 101950 + }, + { + "epoch": 15.185433422698837, + "grad_norm": 0.0027407535817474127, + "learning_rate": 8.319506226598342e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59156920, + "step": 101955 + }, + { + "epoch": 15.186178135239798, + "grad_norm": 0.00035419451887719333, + "learning_rate": 8.317086000420459e-06, + "loss": 0.0, + "num_input_tokens_seen": 59159672, + "step": 101960 + }, + { + "epoch": 15.186922847780757, + "grad_norm": 7.353139517363161e-05, + "learning_rate": 8.314666056083444e-06, + "loss": 0.0, + "num_input_tokens_seen": 59162328, + "step": 101965 + }, + { + "epoch": 15.187667560321715, + "grad_norm": 0.0003366960445418954, + "learning_rate": 8.312246393628195e-06, + "loss": 0.0, + "num_input_tokens_seen": 59165400, + "step": 101970 + }, + { + "epoch": 15.188412272862674, + "grad_norm": 0.0001299605646636337, + "learning_rate": 8.309827013095584e-06, + "loss": 0.0, + "num_input_tokens_seen": 59168376, + "step": 101975 + }, + { + "epoch": 15.189156985403635, + "grad_norm": 0.0009065996273420751, + "learning_rate": 8.30740791452648e-06, + "loss": 0.0, + "num_input_tokens_seen": 59171352, + "step": 101980 + }, + { + "epoch": 15.189901697944594, + "grad_norm": 0.00012769908062182367, + "learning_rate": 8.304989097961748e-06, + "loss": 0.0, + "num_input_tokens_seen": 59174328, + "step": 101985 + }, + { + "epoch": 15.190646410485552, + "grad_norm": 2.9750377507298253e-05, + "learning_rate": 8.302570563442263e-06, + "loss": 0.0002, + "num_input_tokens_seen": 59177144, + "step": 101990 + }, + { + "epoch": 15.191391123026511, + "grad_norm": 0.00034420783049426973, + "learning_rate": 8.300152311008883e-06, + "loss": 0.0, + "num_input_tokens_seen": 59180024, + "step": 101995 + }, + { + "epoch": 15.192135835567472, + "grad_norm": 2.926613342424389e-05, + "learning_rate": 8.297734340702443e-06, + "loss": 0.0249, + "num_input_tokens_seen": 59183160, + "step": 102000 + }, + { + "epoch": 15.19288054810843, + "grad_norm": 9.029553621076047e-05, + "learning_rate": 8.295316652563817e-06, + "loss": 0.0, + "num_input_tokens_seen": 59185784, + "step": 102005 + }, + { + "epoch": 15.19362526064939, + "grad_norm": 0.007345705293118954, + "learning_rate": 8.292899246633828e-06, + "loss": 0.0, + "num_input_tokens_seen": 59188568, + "step": 102010 + }, + { + "epoch": 15.194369973190348, + "grad_norm": 0.2644347548484802, + "learning_rate": 8.290482122953336e-06, + "loss": 0.0003, + "num_input_tokens_seen": 59191480, + "step": 102015 + }, + { + "epoch": 15.195114685731308, + "grad_norm": 0.031055200845003128, + "learning_rate": 8.288065281563164e-06, + "loss": 0.0, + "num_input_tokens_seen": 59194328, + "step": 102020 + }, + { + "epoch": 15.195859398272267, + "grad_norm": 0.0006584238726645708, + "learning_rate": 8.285648722504136e-06, + "loss": 0.0, + "num_input_tokens_seen": 59197560, + "step": 102025 + }, + { + "epoch": 15.196604110813226, + "grad_norm": 8.139376586768776e-05, + "learning_rate": 8.283232445817094e-06, + "loss": 0.0, + "num_input_tokens_seen": 59200440, + "step": 102030 + }, + { + "epoch": 15.197348823354185, + "grad_norm": 0.00013769729412160814, + "learning_rate": 8.280816451542841e-06, + "loss": 0.0, + "num_input_tokens_seen": 59203352, + "step": 102035 + }, + { + "epoch": 15.198093535895145, + "grad_norm": 0.00018249369168188423, + "learning_rate": 8.278400739722211e-06, + "loss": 0.0, + "num_input_tokens_seen": 59206232, + "step": 102040 + }, + { + "epoch": 15.198838248436104, + "grad_norm": 0.0008044000715017319, + "learning_rate": 8.275985310396003e-06, + "loss": 0.0, + "num_input_tokens_seen": 59209240, + "step": 102045 + }, + { + "epoch": 15.199582960977063, + "grad_norm": 0.00011765845556510612, + "learning_rate": 8.273570163605026e-06, + "loss": 0.0, + "num_input_tokens_seen": 59211864, + "step": 102050 + }, + { + "epoch": 15.200327673518021, + "grad_norm": 2.6412812076159753e-05, + "learning_rate": 8.271155299390082e-06, + "loss": 0.0, + "num_input_tokens_seen": 59214680, + "step": 102055 + }, + { + "epoch": 15.201072386058982, + "grad_norm": 9.308705193689093e-05, + "learning_rate": 8.26874071779196e-06, + "loss": 0.0, + "num_input_tokens_seen": 59217528, + "step": 102060 + }, + { + "epoch": 15.20181709859994, + "grad_norm": 0.00034034682903438807, + "learning_rate": 8.266326418851467e-06, + "loss": 0.0, + "num_input_tokens_seen": 59220248, + "step": 102065 + }, + { + "epoch": 15.2025618111409, + "grad_norm": 0.0001148492592619732, + "learning_rate": 8.26391240260937e-06, + "loss": 0.0, + "num_input_tokens_seen": 59223384, + "step": 102070 + }, + { + "epoch": 15.203306523681858, + "grad_norm": 8.194200199795887e-05, + "learning_rate": 8.261498669106473e-06, + "loss": 0.0, + "num_input_tokens_seen": 59226584, + "step": 102075 + }, + { + "epoch": 15.204051236222819, + "grad_norm": 0.00021985647617839277, + "learning_rate": 8.259085218383536e-06, + "loss": 0.0, + "num_input_tokens_seen": 59229336, + "step": 102080 + }, + { + "epoch": 15.204795948763778, + "grad_norm": 8.063120185397565e-05, + "learning_rate": 8.256672050481348e-06, + "loss": 0.0, + "num_input_tokens_seen": 59232312, + "step": 102085 + }, + { + "epoch": 15.205540661304736, + "grad_norm": 8.369402348762378e-05, + "learning_rate": 8.254259165440662e-06, + "loss": 0.0, + "num_input_tokens_seen": 59235000, + "step": 102090 + }, + { + "epoch": 15.206285373845695, + "grad_norm": 2.60120887105586e-05, + "learning_rate": 8.251846563302253e-06, + "loss": 0.0, + "num_input_tokens_seen": 59238040, + "step": 102095 + }, + { + "epoch": 15.207030086386654, + "grad_norm": 8.90332376002334e-05, + "learning_rate": 8.249434244106875e-06, + "loss": 0.0, + "num_input_tokens_seen": 59240824, + "step": 102100 + }, + { + "epoch": 15.207774798927614, + "grad_norm": 1.651314778428059e-05, + "learning_rate": 8.247022207895271e-06, + "loss": 0.0, + "num_input_tokens_seen": 59244056, + "step": 102105 + }, + { + "epoch": 15.208519511468573, + "grad_norm": 7.194121280917898e-05, + "learning_rate": 8.244610454708213e-06, + "loss": 0.0, + "num_input_tokens_seen": 59246744, + "step": 102110 + }, + { + "epoch": 15.209264224009532, + "grad_norm": 0.0011242169421166182, + "learning_rate": 8.242198984586427e-06, + "loss": 0.0, + "num_input_tokens_seen": 59249496, + "step": 102115 + }, + { + "epoch": 15.21000893655049, + "grad_norm": 5.8459132560528815e-05, + "learning_rate": 8.239787797570661e-06, + "loss": 0.0, + "num_input_tokens_seen": 59252216, + "step": 102120 + }, + { + "epoch": 15.210753649091451, + "grad_norm": 0.0006568376556970179, + "learning_rate": 8.237376893701635e-06, + "loss": 0.0, + "num_input_tokens_seen": 59254776, + "step": 102125 + }, + { + "epoch": 15.21149836163241, + "grad_norm": 0.3338589668273926, + "learning_rate": 8.2349662730201e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59257624, + "step": 102130 + }, + { + "epoch": 15.212243074173369, + "grad_norm": 0.0009426135220564902, + "learning_rate": 8.232555935566769e-06, + "loss": 0.0, + "num_input_tokens_seen": 59260568, + "step": 102135 + }, + { + "epoch": 15.212987786714327, + "grad_norm": 3.1323750590672716e-05, + "learning_rate": 8.230145881382357e-06, + "loss": 0.0, + "num_input_tokens_seen": 59263416, + "step": 102140 + }, + { + "epoch": 15.213732499255288, + "grad_norm": 3.08858543576207e-05, + "learning_rate": 8.227736110507592e-06, + "loss": 0.0, + "num_input_tokens_seen": 59266360, + "step": 102145 + }, + { + "epoch": 15.214477211796247, + "grad_norm": 0.0003879025171045214, + "learning_rate": 8.225326622983173e-06, + "loss": 0.0097, + "num_input_tokens_seen": 59268952, + "step": 102150 + }, + { + "epoch": 15.215221924337206, + "grad_norm": 0.006415300536900759, + "learning_rate": 8.222917418849819e-06, + "loss": 0.0, + "num_input_tokens_seen": 59271672, + "step": 102155 + }, + { + "epoch": 15.215966636878164, + "grad_norm": 2.282677769471775e-06, + "learning_rate": 8.220508498148213e-06, + "loss": 0.0047, + "num_input_tokens_seen": 59274424, + "step": 102160 + }, + { + "epoch": 15.216711349419125, + "grad_norm": 0.016117198392748833, + "learning_rate": 8.218099860919074e-06, + "loss": 0.0, + "num_input_tokens_seen": 59277144, + "step": 102165 + }, + { + "epoch": 15.217456061960084, + "grad_norm": 0.002970707369968295, + "learning_rate": 8.215691507203072e-06, + "loss": 0.0, + "num_input_tokens_seen": 59279896, + "step": 102170 + }, + { + "epoch": 15.218200774501042, + "grad_norm": 8.600104592915159e-06, + "learning_rate": 8.213283437040911e-06, + "loss": 0.0, + "num_input_tokens_seen": 59283032, + "step": 102175 + }, + { + "epoch": 15.218945487042001, + "grad_norm": 0.0034932815469801426, + "learning_rate": 8.210875650473266e-06, + "loss": 0.0, + "num_input_tokens_seen": 59285912, + "step": 102180 + }, + { + "epoch": 15.219690199582962, + "grad_norm": 0.0005758183542639017, + "learning_rate": 8.208468147540812e-06, + "loss": 0.0, + "num_input_tokens_seen": 59288856, + "step": 102185 + }, + { + "epoch": 15.22043491212392, + "grad_norm": 7.705898315180093e-05, + "learning_rate": 8.206060928284223e-06, + "loss": 0.0, + "num_input_tokens_seen": 59291672, + "step": 102190 + }, + { + "epoch": 15.22117962466488, + "grad_norm": 4.214800719637424e-05, + "learning_rate": 8.20365399274416e-06, + "loss": 0.0, + "num_input_tokens_seen": 59294616, + "step": 102195 + }, + { + "epoch": 15.221924337205838, + "grad_norm": 0.00037079022149555385, + "learning_rate": 8.201247340961296e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59297304, + "step": 102200 + }, + { + "epoch": 15.222669049746798, + "grad_norm": 0.00026524902204982936, + "learning_rate": 8.19884097297628e-06, + "loss": 0.0, + "num_input_tokens_seen": 59300216, + "step": 102205 + }, + { + "epoch": 15.223413762287757, + "grad_norm": 0.00044646026799455285, + "learning_rate": 8.196434888829774e-06, + "loss": 0.0, + "num_input_tokens_seen": 59302968, + "step": 102210 + }, + { + "epoch": 15.224158474828716, + "grad_norm": 4.9300683713227045e-06, + "learning_rate": 8.194029088562425e-06, + "loss": 0.0, + "num_input_tokens_seen": 59305656, + "step": 102215 + }, + { + "epoch": 15.224903187369675, + "grad_norm": 0.00010177035437664017, + "learning_rate": 8.191623572214865e-06, + "loss": 0.0, + "num_input_tokens_seen": 59308344, + "step": 102220 + }, + { + "epoch": 15.225647899910635, + "grad_norm": 7.646255653526168e-06, + "learning_rate": 8.18921833982775e-06, + "loss": 0.0, + "num_input_tokens_seen": 59311320, + "step": 102225 + }, + { + "epoch": 15.226392612451594, + "grad_norm": 0.0019075290765613317, + "learning_rate": 8.186813391441697e-06, + "loss": 0.0, + "num_input_tokens_seen": 59314360, + "step": 102230 + }, + { + "epoch": 15.227137324992553, + "grad_norm": 1.2008434534072876, + "learning_rate": 8.184408727097354e-06, + "loss": 0.144, + "num_input_tokens_seen": 59317272, + "step": 102235 + }, + { + "epoch": 15.227882037533512, + "grad_norm": 0.00024101590679492801, + "learning_rate": 8.182004346835323e-06, + "loss": 0.0379, + "num_input_tokens_seen": 59320088, + "step": 102240 + }, + { + "epoch": 15.228626750074472, + "grad_norm": 0.00012593196879606694, + "learning_rate": 8.179600250696245e-06, + "loss": 0.0, + "num_input_tokens_seen": 59323448, + "step": 102245 + }, + { + "epoch": 15.22937146261543, + "grad_norm": 0.000750349136069417, + "learning_rate": 8.177196438720724e-06, + "loss": 0.0422, + "num_input_tokens_seen": 59326296, + "step": 102250 + }, + { + "epoch": 15.23011617515639, + "grad_norm": 0.0008954238728620112, + "learning_rate": 8.174792910949376e-06, + "loss": 0.0, + "num_input_tokens_seen": 59329176, + "step": 102255 + }, + { + "epoch": 15.230860887697348, + "grad_norm": 4.6499833842972293e-05, + "learning_rate": 8.172389667422797e-06, + "loss": 0.0089, + "num_input_tokens_seen": 59332152, + "step": 102260 + }, + { + "epoch": 15.231605600238307, + "grad_norm": 0.00015678857744205743, + "learning_rate": 8.169986708181584e-06, + "loss": 0.0, + "num_input_tokens_seen": 59334968, + "step": 102265 + }, + { + "epoch": 15.232350312779268, + "grad_norm": 0.0002505338052287698, + "learning_rate": 8.167584033266349e-06, + "loss": 0.0, + "num_input_tokens_seen": 59337816, + "step": 102270 + }, + { + "epoch": 15.233095025320226, + "grad_norm": 0.00037167101982049644, + "learning_rate": 8.165181642717668e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59340728, + "step": 102275 + }, + { + "epoch": 15.233839737861185, + "grad_norm": 7.210944022517651e-05, + "learning_rate": 8.162779536576138e-06, + "loss": 0.0, + "num_input_tokens_seen": 59343352, + "step": 102280 + }, + { + "epoch": 15.234584450402144, + "grad_norm": 1.6933316146605648e-05, + "learning_rate": 8.160377714882327e-06, + "loss": 0.0, + "num_input_tokens_seen": 59346456, + "step": 102285 + }, + { + "epoch": 15.235329162943104, + "grad_norm": 0.002266308758407831, + "learning_rate": 8.15797617767683e-06, + "loss": 0.0, + "num_input_tokens_seen": 59349112, + "step": 102290 + }, + { + "epoch": 15.236073875484063, + "grad_norm": 8.730238914722577e-05, + "learning_rate": 8.155574925000207e-06, + "loss": 0.0183, + "num_input_tokens_seen": 59351960, + "step": 102295 + }, + { + "epoch": 15.236818588025022, + "grad_norm": 0.0005834074690937996, + "learning_rate": 8.153173956893018e-06, + "loss": 0.0, + "num_input_tokens_seen": 59354712, + "step": 102300 + }, + { + "epoch": 15.23756330056598, + "grad_norm": 0.0005948480684310198, + "learning_rate": 8.15077327339584e-06, + "loss": 0.0, + "num_input_tokens_seen": 59357496, + "step": 102305 + }, + { + "epoch": 15.238308013106941, + "grad_norm": 2.4964683689177036e-05, + "learning_rate": 8.148372874549224e-06, + "loss": 0.0, + "num_input_tokens_seen": 59360248, + "step": 102310 + }, + { + "epoch": 15.2390527256479, + "grad_norm": 0.007159990258514881, + "learning_rate": 8.145972760393711e-06, + "loss": 0.0, + "num_input_tokens_seen": 59363384, + "step": 102315 + }, + { + "epoch": 15.239797438188859, + "grad_norm": 8.355638419743627e-05, + "learning_rate": 8.143572930969866e-06, + "loss": 0.0, + "num_input_tokens_seen": 59366424, + "step": 102320 + }, + { + "epoch": 15.240542150729818, + "grad_norm": 0.00020941192633472383, + "learning_rate": 8.141173386318226e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59369528, + "step": 102325 + }, + { + "epoch": 15.241286863270778, + "grad_norm": 7.243768050102517e-05, + "learning_rate": 8.13877412647932e-06, + "loss": 0.0, + "num_input_tokens_seen": 59373304, + "step": 102330 + }, + { + "epoch": 15.242031575811737, + "grad_norm": 0.0009399782866239548, + "learning_rate": 8.136375151493695e-06, + "loss": 0.0, + "num_input_tokens_seen": 59376216, + "step": 102335 + }, + { + "epoch": 15.242776288352696, + "grad_norm": 0.00010329796350561082, + "learning_rate": 8.13397646140187e-06, + "loss": 0.3063, + "num_input_tokens_seen": 59379160, + "step": 102340 + }, + { + "epoch": 15.243521000893654, + "grad_norm": 0.5472840666770935, + "learning_rate": 8.131578056244365e-06, + "loss": 0.019, + "num_input_tokens_seen": 59382456, + "step": 102345 + }, + { + "epoch": 15.244265713434615, + "grad_norm": 0.0019813093822449446, + "learning_rate": 8.129179936061715e-06, + "loss": 0.0, + "num_input_tokens_seen": 59385400, + "step": 102350 + }, + { + "epoch": 15.245010425975574, + "grad_norm": 0.00012477388372644782, + "learning_rate": 8.126782100894411e-06, + "loss": 0.0, + "num_input_tokens_seen": 59388472, + "step": 102355 + }, + { + "epoch": 15.245755138516532, + "grad_norm": 0.00012414662342052907, + "learning_rate": 8.124384550782985e-06, + "loss": 0.0, + "num_input_tokens_seen": 59391448, + "step": 102360 + }, + { + "epoch": 15.246499851057491, + "grad_norm": 0.007038903422653675, + "learning_rate": 8.12198728576792e-06, + "loss": 0.0, + "num_input_tokens_seen": 59394200, + "step": 102365 + }, + { + "epoch": 15.247244563598452, + "grad_norm": 6.750425382051617e-05, + "learning_rate": 8.119590305889737e-06, + "loss": 0.0, + "num_input_tokens_seen": 59396856, + "step": 102370 + }, + { + "epoch": 15.24798927613941, + "grad_norm": 0.0002931460039690137, + "learning_rate": 8.117193611188917e-06, + "loss": 0.0, + "num_input_tokens_seen": 59399608, + "step": 102375 + }, + { + "epoch": 15.24873398868037, + "grad_norm": 2.4563143597333692e-05, + "learning_rate": 8.114797201705954e-06, + "loss": 0.0443, + "num_input_tokens_seen": 59402584, + "step": 102380 + }, + { + "epoch": 15.249478701221328, + "grad_norm": 2.1036619727965444e-05, + "learning_rate": 8.112401077481329e-06, + "loss": 0.0, + "num_input_tokens_seen": 59406136, + "step": 102385 + }, + { + "epoch": 15.250223413762289, + "grad_norm": 1.2096554200979881e-05, + "learning_rate": 8.110005238555517e-06, + "loss": 0.0, + "num_input_tokens_seen": 59408856, + "step": 102390 + }, + { + "epoch": 15.250968126303247, + "grad_norm": 6.871514051454142e-05, + "learning_rate": 8.107609684969008e-06, + "loss": 0.0, + "num_input_tokens_seen": 59411608, + "step": 102395 + }, + { + "epoch": 15.251712838844206, + "grad_norm": 3.6489241210801993e-06, + "learning_rate": 8.105214416762255e-06, + "loss": 0.0, + "num_input_tokens_seen": 59414360, + "step": 102400 + }, + { + "epoch": 15.252457551385165, + "grad_norm": 8.365921530639753e-05, + "learning_rate": 8.102819433975745e-06, + "loss": 0.0, + "num_input_tokens_seen": 59417112, + "step": 102405 + }, + { + "epoch": 15.253202263926125, + "grad_norm": 0.000504080846440047, + "learning_rate": 8.100424736649918e-06, + "loss": 0.0002, + "num_input_tokens_seen": 59419832, + "step": 102410 + }, + { + "epoch": 15.253946976467084, + "grad_norm": 9.27001383388415e-05, + "learning_rate": 8.098030324825246e-06, + "loss": 0.0, + "num_input_tokens_seen": 59423000, + "step": 102415 + }, + { + "epoch": 15.254691689008043, + "grad_norm": 0.00020559427503030747, + "learning_rate": 8.095636198542173e-06, + "loss": 0.0, + "num_input_tokens_seen": 59425880, + "step": 102420 + }, + { + "epoch": 15.255436401549002, + "grad_norm": 3.393345594406128, + "learning_rate": 8.093242357841136e-06, + "loss": 0.0365, + "num_input_tokens_seen": 59428856, + "step": 102425 + }, + { + "epoch": 15.256181114089962, + "grad_norm": 3.7619919776916504, + "learning_rate": 8.090848802762596e-06, + "loss": 0.0003, + "num_input_tokens_seen": 59431896, + "step": 102430 + }, + { + "epoch": 15.256925826630921, + "grad_norm": 0.00010931748693110421, + "learning_rate": 8.088455533346973e-06, + "loss": 0.0707, + "num_input_tokens_seen": 59434744, + "step": 102435 + }, + { + "epoch": 15.25767053917188, + "grad_norm": 1.6125539332279004e-05, + "learning_rate": 8.086062549634712e-06, + "loss": 0.0, + "num_input_tokens_seen": 59437432, + "step": 102440 + }, + { + "epoch": 15.258415251712838, + "grad_norm": 8.857581269694492e-05, + "learning_rate": 8.083669851666235e-06, + "loss": 0.0005, + "num_input_tokens_seen": 59440632, + "step": 102445 + }, + { + "epoch": 15.259159964253797, + "grad_norm": 0.004400933161377907, + "learning_rate": 8.081277439481961e-06, + "loss": 0.0, + "num_input_tokens_seen": 59443224, + "step": 102450 + }, + { + "epoch": 15.259904676794758, + "grad_norm": 0.0012264559045433998, + "learning_rate": 8.078885313122311e-06, + "loss": 0.0, + "num_input_tokens_seen": 59446456, + "step": 102455 + }, + { + "epoch": 15.260649389335716, + "grad_norm": 7.399060268653557e-05, + "learning_rate": 8.076493472627687e-06, + "loss": 0.0, + "num_input_tokens_seen": 59449688, + "step": 102460 + }, + { + "epoch": 15.261394101876675, + "grad_norm": 0.00012402748689055443, + "learning_rate": 8.074101918038512e-06, + "loss": 0.0, + "num_input_tokens_seen": 59452632, + "step": 102465 + }, + { + "epoch": 15.262138814417634, + "grad_norm": 9.176421372103505e-06, + "learning_rate": 8.071710649395178e-06, + "loss": 0.0, + "num_input_tokens_seen": 59455704, + "step": 102470 + }, + { + "epoch": 15.262883526958595, + "grad_norm": 0.00015678885392844677, + "learning_rate": 8.069319666738093e-06, + "loss": 0.0, + "num_input_tokens_seen": 59458520, + "step": 102475 + }, + { + "epoch": 15.263628239499553, + "grad_norm": 152.79600524902344, + "learning_rate": 8.066928970107638e-06, + "loss": 0.2438, + "num_input_tokens_seen": 59461496, + "step": 102480 + }, + { + "epoch": 15.264372952040512, + "grad_norm": 3.0326415071613155e-05, + "learning_rate": 8.064538559544213e-06, + "loss": 0.0002, + "num_input_tokens_seen": 59464280, + "step": 102485 + }, + { + "epoch": 15.26511766458147, + "grad_norm": 3.060956805711612e-05, + "learning_rate": 8.06214843508819e-06, + "loss": 0.0, + "num_input_tokens_seen": 59466968, + "step": 102490 + }, + { + "epoch": 15.265862377122431, + "grad_norm": 9.557133125781547e-06, + "learning_rate": 8.059758596779965e-06, + "loss": 0.0, + "num_input_tokens_seen": 59469976, + "step": 102495 + }, + { + "epoch": 15.26660708966339, + "grad_norm": 4.010621705674566e-05, + "learning_rate": 8.057369044659899e-06, + "loss": 0.0, + "num_input_tokens_seen": 59472792, + "step": 102500 + }, + { + "epoch": 15.267351802204349, + "grad_norm": 0.005194803234189749, + "learning_rate": 8.054979778768354e-06, + "loss": 0.0, + "num_input_tokens_seen": 59475480, + "step": 102505 + }, + { + "epoch": 15.268096514745308, + "grad_norm": 0.0006162759382277727, + "learning_rate": 8.052590799145715e-06, + "loss": 0.0, + "num_input_tokens_seen": 59478584, + "step": 102510 + }, + { + "epoch": 15.268841227286268, + "grad_norm": 4.593923222273588e-05, + "learning_rate": 8.050202105832327e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59481720, + "step": 102515 + }, + { + "epoch": 15.269585939827227, + "grad_norm": 0.002752883592620492, + "learning_rate": 8.047813698868548e-06, + "loss": 0.0, + "num_input_tokens_seen": 59484472, + "step": 102520 + }, + { + "epoch": 15.270330652368186, + "grad_norm": 0.0031483378261327744, + "learning_rate": 8.045425578294719e-06, + "loss": 0.0196, + "num_input_tokens_seen": 59487192, + "step": 102525 + }, + { + "epoch": 15.271075364909144, + "grad_norm": 0.0007920555071905255, + "learning_rate": 8.043037744151203e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59490232, + "step": 102530 + }, + { + "epoch": 15.271820077450105, + "grad_norm": 0.004223999101668596, + "learning_rate": 8.040650196478319e-06, + "loss": 0.0009, + "num_input_tokens_seen": 59493304, + "step": 102535 + }, + { + "epoch": 15.272564789991064, + "grad_norm": 5.224457709118724e-05, + "learning_rate": 8.038262935316423e-06, + "loss": 0.0, + "num_input_tokens_seen": 59496312, + "step": 102540 + }, + { + "epoch": 15.273309502532022, + "grad_norm": 0.00020064644922968, + "learning_rate": 8.035875960705835e-06, + "loss": 0.0, + "num_input_tokens_seen": 59499416, + "step": 102545 + }, + { + "epoch": 15.274054215072981, + "grad_norm": 0.030121563002467155, + "learning_rate": 8.033489272686872e-06, + "loss": 0.0, + "num_input_tokens_seen": 59502488, + "step": 102550 + }, + { + "epoch": 15.274798927613942, + "grad_norm": 3.419441782170907e-05, + "learning_rate": 8.031102871299876e-06, + "loss": 0.0, + "num_input_tokens_seen": 59505240, + "step": 102555 + }, + { + "epoch": 15.2755436401549, + "grad_norm": 0.0002796209591906518, + "learning_rate": 8.02871675658514e-06, + "loss": 0.0, + "num_input_tokens_seen": 59507960, + "step": 102560 + }, + { + "epoch": 15.27628835269586, + "grad_norm": 3.4501113987062126e-05, + "learning_rate": 8.026330928582993e-06, + "loss": 0.0, + "num_input_tokens_seen": 59510904, + "step": 102565 + }, + { + "epoch": 15.277033065236818, + "grad_norm": 0.00029584066942334175, + "learning_rate": 8.023945387333722e-06, + "loss": 0.0, + "num_input_tokens_seen": 59513688, + "step": 102570 + }, + { + "epoch": 15.277777777777779, + "grad_norm": 0.0146170100197196, + "learning_rate": 8.021560132877653e-06, + "loss": 0.0, + "num_input_tokens_seen": 59516600, + "step": 102575 + }, + { + "epoch": 15.278522490318737, + "grad_norm": 2.871871629395173e-06, + "learning_rate": 8.019175165255069e-06, + "loss": 0.0, + "num_input_tokens_seen": 59519224, + "step": 102580 + }, + { + "epoch": 15.279267202859696, + "grad_norm": 0.0002157228154828772, + "learning_rate": 8.016790484506261e-06, + "loss": 0.0, + "num_input_tokens_seen": 59522488, + "step": 102585 + }, + { + "epoch": 15.280011915400655, + "grad_norm": 8.24171456770273e-06, + "learning_rate": 8.014406090671516e-06, + "loss": 0.0, + "num_input_tokens_seen": 59525432, + "step": 102590 + }, + { + "epoch": 15.280756627941615, + "grad_norm": 0.004368430003523827, + "learning_rate": 8.012021983791112e-06, + "loss": 0.0, + "num_input_tokens_seen": 59528184, + "step": 102595 + }, + { + "epoch": 15.281501340482574, + "grad_norm": 0.0013624089770019054, + "learning_rate": 8.009638163905337e-06, + "loss": 0.0, + "num_input_tokens_seen": 59530904, + "step": 102600 + }, + { + "epoch": 15.282246053023533, + "grad_norm": 1.6118839994305745e-05, + "learning_rate": 8.007254631054448e-06, + "loss": 0.0004, + "num_input_tokens_seen": 59533784, + "step": 102605 + }, + { + "epoch": 15.282990765564492, + "grad_norm": 3.156370439683087e-05, + "learning_rate": 8.00487138527873e-06, + "loss": 0.0, + "num_input_tokens_seen": 59536728, + "step": 102610 + }, + { + "epoch": 15.283735478105452, + "grad_norm": 1.4489786735794041e-05, + "learning_rate": 8.002488426618429e-06, + "loss": 0.0, + "num_input_tokens_seen": 59539672, + "step": 102615 + }, + { + "epoch": 15.284480190646411, + "grad_norm": 8.02266804384999e-05, + "learning_rate": 8.000105755113818e-06, + "loss": 0.0, + "num_input_tokens_seen": 59542744, + "step": 102620 + }, + { + "epoch": 15.28522490318737, + "grad_norm": 1.718017847451847e-05, + "learning_rate": 7.997723370805143e-06, + "loss": 0.0674, + "num_input_tokens_seen": 59545528, + "step": 102625 + }, + { + "epoch": 15.285969615728328, + "grad_norm": 4.512838859227486e-05, + "learning_rate": 7.995341273732642e-06, + "loss": 0.0, + "num_input_tokens_seen": 59548536, + "step": 102630 + }, + { + "epoch": 15.286714328269287, + "grad_norm": 6.594353908440098e-05, + "learning_rate": 7.992959463936578e-06, + "loss": 0.0, + "num_input_tokens_seen": 59551480, + "step": 102635 + }, + { + "epoch": 15.287459040810248, + "grad_norm": 0.002151613589376211, + "learning_rate": 7.990577941457175e-06, + "loss": 0.0, + "num_input_tokens_seen": 59554136, + "step": 102640 + }, + { + "epoch": 15.288203753351207, + "grad_norm": 0.0001599649403942749, + "learning_rate": 7.988196706334666e-06, + "loss": 0.0, + "num_input_tokens_seen": 59556920, + "step": 102645 + }, + { + "epoch": 15.288948465892165, + "grad_norm": 0.00010130625014426187, + "learning_rate": 7.985815758609289e-06, + "loss": 0.0, + "num_input_tokens_seen": 59559768, + "step": 102650 + }, + { + "epoch": 15.289693178433124, + "grad_norm": 9.508580660622101e-06, + "learning_rate": 7.983435098321263e-06, + "loss": 0.0, + "num_input_tokens_seen": 59562872, + "step": 102655 + }, + { + "epoch": 15.290437890974085, + "grad_norm": 1.0707337423809804e-05, + "learning_rate": 7.981054725510805e-06, + "loss": 0.0, + "num_input_tokens_seen": 59565912, + "step": 102660 + }, + { + "epoch": 15.291182603515043, + "grad_norm": 0.0006463086465373635, + "learning_rate": 7.978674640218126e-06, + "loss": 0.0, + "num_input_tokens_seen": 59568600, + "step": 102665 + }, + { + "epoch": 15.291927316056002, + "grad_norm": 0.0001769578520907089, + "learning_rate": 7.976294842483446e-06, + "loss": 0.0, + "num_input_tokens_seen": 59571576, + "step": 102670 + }, + { + "epoch": 15.29267202859696, + "grad_norm": 7.002246093179565e-06, + "learning_rate": 7.97391533234695e-06, + "loss": 0.0, + "num_input_tokens_seen": 59574744, + "step": 102675 + }, + { + "epoch": 15.293416741137921, + "grad_norm": 1.8646087482920848e-05, + "learning_rate": 7.971536109848862e-06, + "loss": 0.0, + "num_input_tokens_seen": 59577912, + "step": 102680 + }, + { + "epoch": 15.29416145367888, + "grad_norm": 4.0702692785998806e-05, + "learning_rate": 7.969157175029354e-06, + "loss": 0.0, + "num_input_tokens_seen": 59580696, + "step": 102685 + }, + { + "epoch": 15.294906166219839, + "grad_norm": 0.0006218301132321358, + "learning_rate": 7.966778527928637e-06, + "loss": 0.0, + "num_input_tokens_seen": 59583256, + "step": 102690 + }, + { + "epoch": 15.295650878760798, + "grad_norm": 0.00012117323058191687, + "learning_rate": 7.964400168586875e-06, + "loss": 0.0, + "num_input_tokens_seen": 59585976, + "step": 102695 + }, + { + "epoch": 15.296395591301758, + "grad_norm": 0.008100091479718685, + "learning_rate": 7.962022097044266e-06, + "loss": 0.0, + "num_input_tokens_seen": 59588984, + "step": 102700 + }, + { + "epoch": 15.297140303842717, + "grad_norm": 0.0002801472437568009, + "learning_rate": 7.959644313340978e-06, + "loss": 0.0, + "num_input_tokens_seen": 59592088, + "step": 102705 + }, + { + "epoch": 15.297885016383676, + "grad_norm": 1.8640786947798915e-05, + "learning_rate": 7.95726681751718e-06, + "loss": 0.0, + "num_input_tokens_seen": 59594840, + "step": 102710 + }, + { + "epoch": 15.298629728924634, + "grad_norm": 35.33000564575195, + "learning_rate": 7.95488960961304e-06, + "loss": 0.1221, + "num_input_tokens_seen": 59597464, + "step": 102715 + }, + { + "epoch": 15.299374441465595, + "grad_norm": 0.0005603451281785965, + "learning_rate": 7.952512689668703e-06, + "loss": 0.0, + "num_input_tokens_seen": 59599992, + "step": 102720 + }, + { + "epoch": 15.300119154006554, + "grad_norm": 0.0001399514003423974, + "learning_rate": 7.95013605772435e-06, + "loss": 0.0, + "num_input_tokens_seen": 59602936, + "step": 102725 + }, + { + "epoch": 15.300863866547513, + "grad_norm": 0.00010719725833041593, + "learning_rate": 7.947759713820111e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59605656, + "step": 102730 + }, + { + "epoch": 15.301608579088471, + "grad_norm": 7.847879896871746e-05, + "learning_rate": 7.945383657996148e-06, + "loss": 0.0, + "num_input_tokens_seen": 59608472, + "step": 102735 + }, + { + "epoch": 15.302353291629432, + "grad_norm": 0.0003803445433732122, + "learning_rate": 7.943007890292593e-06, + "loss": 0.0, + "num_input_tokens_seen": 59610968, + "step": 102740 + }, + { + "epoch": 15.30309800417039, + "grad_norm": 6.062006650608964e-05, + "learning_rate": 7.940632410749577e-06, + "loss": 0.0, + "num_input_tokens_seen": 59613912, + "step": 102745 + }, + { + "epoch": 15.30384271671135, + "grad_norm": 1.095302195608383e-05, + "learning_rate": 7.938257219407246e-06, + "loss": 0.0, + "num_input_tokens_seen": 59616696, + "step": 102750 + }, + { + "epoch": 15.304587429252308, + "grad_norm": 5.409668574429816e-06, + "learning_rate": 7.93588231630571e-06, + "loss": 0.0, + "num_input_tokens_seen": 59619704, + "step": 102755 + }, + { + "epoch": 15.305332141793269, + "grad_norm": 6.637290789512917e-05, + "learning_rate": 7.933507701485108e-06, + "loss": 0.0, + "num_input_tokens_seen": 59622232, + "step": 102760 + }, + { + "epoch": 15.306076854334227, + "grad_norm": 0.0007903802907094359, + "learning_rate": 7.93113337498554e-06, + "loss": 0.0003, + "num_input_tokens_seen": 59625304, + "step": 102765 + }, + { + "epoch": 15.306821566875186, + "grad_norm": 0.00024411366030108184, + "learning_rate": 7.928759336847133e-06, + "loss": 0.0, + "num_input_tokens_seen": 59628408, + "step": 102770 + }, + { + "epoch": 15.307566279416145, + "grad_norm": 1.5905821783235297e-05, + "learning_rate": 7.926385587109986e-06, + "loss": 0.0, + "num_input_tokens_seen": 59630936, + "step": 102775 + }, + { + "epoch": 15.308310991957104, + "grad_norm": 5.004660852137022e-05, + "learning_rate": 7.924012125814203e-06, + "loss": 0.0674, + "num_input_tokens_seen": 59633752, + "step": 102780 + }, + { + "epoch": 15.309055704498064, + "grad_norm": 0.003546307794749737, + "learning_rate": 7.92163895299988e-06, + "loss": 0.0, + "num_input_tokens_seen": 59636600, + "step": 102785 + }, + { + "epoch": 15.309800417039023, + "grad_norm": 9.931721433531493e-05, + "learning_rate": 7.919266068707099e-06, + "loss": 0.0, + "num_input_tokens_seen": 59639448, + "step": 102790 + }, + { + "epoch": 15.310545129579982, + "grad_norm": 5.8151936531066895, + "learning_rate": 7.916893472975967e-06, + "loss": 0.0953, + "num_input_tokens_seen": 59642264, + "step": 102795 + }, + { + "epoch": 15.31128984212094, + "grad_norm": 7.769842624664307, + "learning_rate": 7.91452116584655e-06, + "loss": 0.1407, + "num_input_tokens_seen": 59645272, + "step": 102800 + }, + { + "epoch": 15.312034554661901, + "grad_norm": 0.00389885320328176, + "learning_rate": 7.912149147358938e-06, + "loss": 0.0, + "num_input_tokens_seen": 59648184, + "step": 102805 + }, + { + "epoch": 15.31277926720286, + "grad_norm": 0.00048216115101240575, + "learning_rate": 7.909777417553193e-06, + "loss": 0.0, + "num_input_tokens_seen": 59651352, + "step": 102810 + }, + { + "epoch": 15.313523979743819, + "grad_norm": 1.7142912838608027e-05, + "learning_rate": 7.907405976469397e-06, + "loss": 0.0, + "num_input_tokens_seen": 59654520, + "step": 102815 + }, + { + "epoch": 15.314268692284777, + "grad_norm": 4.0494828681403305e-06, + "learning_rate": 7.905034824147605e-06, + "loss": 0.0, + "num_input_tokens_seen": 59657560, + "step": 102820 + }, + { + "epoch": 15.315013404825738, + "grad_norm": 1.1231020835111849e-05, + "learning_rate": 7.902663960627869e-06, + "loss": 0.0, + "num_input_tokens_seen": 59660472, + "step": 102825 + }, + { + "epoch": 15.315758117366697, + "grad_norm": 0.00022281368728727102, + "learning_rate": 7.900293385950253e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59663288, + "step": 102830 + }, + { + "epoch": 15.316502829907655, + "grad_norm": 6.63535320200026e-05, + "learning_rate": 7.897923100154794e-06, + "loss": 0.0, + "num_input_tokens_seen": 59666296, + "step": 102835 + }, + { + "epoch": 15.317247542448614, + "grad_norm": 0.0004969800356775522, + "learning_rate": 7.895553103281552e-06, + "loss": 0.0, + "num_input_tokens_seen": 59669080, + "step": 102840 + }, + { + "epoch": 15.317992254989575, + "grad_norm": 0.0003040881420020014, + "learning_rate": 7.893183395370554e-06, + "loss": 0.0, + "num_input_tokens_seen": 59671704, + "step": 102845 + }, + { + "epoch": 15.318736967530533, + "grad_norm": 0.0017426850972697139, + "learning_rate": 7.890813976461836e-06, + "loss": 0.0, + "num_input_tokens_seen": 59674328, + "step": 102850 + }, + { + "epoch": 15.319481680071492, + "grad_norm": 5.526076620299136e-06, + "learning_rate": 7.888444846595422e-06, + "loss": 0.0, + "num_input_tokens_seen": 59677176, + "step": 102855 + }, + { + "epoch": 15.320226392612451, + "grad_norm": 0.00026454523322172463, + "learning_rate": 7.886076005811346e-06, + "loss": 0.0, + "num_input_tokens_seen": 59679960, + "step": 102860 + }, + { + "epoch": 15.320971105153411, + "grad_norm": 0.0007891711429692805, + "learning_rate": 7.883707454149621e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59682584, + "step": 102865 + }, + { + "epoch": 15.32171581769437, + "grad_norm": 0.002433842048048973, + "learning_rate": 7.881339191650256e-06, + "loss": 0.0, + "num_input_tokens_seen": 59685464, + "step": 102870 + }, + { + "epoch": 15.322460530235329, + "grad_norm": 0.001856506452895701, + "learning_rate": 7.878971218353275e-06, + "loss": 0.0, + "num_input_tokens_seen": 59688472, + "step": 102875 + }, + { + "epoch": 15.323205242776288, + "grad_norm": 0.8021926879882812, + "learning_rate": 7.876603534298666e-06, + "loss": 0.0002, + "num_input_tokens_seen": 59691160, + "step": 102880 + }, + { + "epoch": 15.323949955317248, + "grad_norm": 2.783912181854248, + "learning_rate": 7.874236139526445e-06, + "loss": 0.0002, + "num_input_tokens_seen": 59693880, + "step": 102885 + }, + { + "epoch": 15.324694667858207, + "grad_norm": 0.005957445129752159, + "learning_rate": 7.87186903407659e-06, + "loss": 0.0, + "num_input_tokens_seen": 59696920, + "step": 102890 + }, + { + "epoch": 15.325439380399166, + "grad_norm": 5.529998816200532e-06, + "learning_rate": 7.869502217989108e-06, + "loss": 0.0, + "num_input_tokens_seen": 59699704, + "step": 102895 + }, + { + "epoch": 15.326184092940125, + "grad_norm": 0.0003903545730281621, + "learning_rate": 7.867135691303975e-06, + "loss": 0.0, + "num_input_tokens_seen": 59702584, + "step": 102900 + }, + { + "epoch": 15.326928805481085, + "grad_norm": 0.010720505379140377, + "learning_rate": 7.864769454061163e-06, + "loss": 0.0, + "num_input_tokens_seen": 59705592, + "step": 102905 + }, + { + "epoch": 15.327673518022044, + "grad_norm": 0.0005056546069681644, + "learning_rate": 7.862403506300664e-06, + "loss": 0.0002, + "num_input_tokens_seen": 59708600, + "step": 102910 + }, + { + "epoch": 15.328418230563003, + "grad_norm": 0.0003516322758514434, + "learning_rate": 7.86003784806244e-06, + "loss": 0.0, + "num_input_tokens_seen": 59711320, + "step": 102915 + }, + { + "epoch": 15.329162943103961, + "grad_norm": 0.0001423522480763495, + "learning_rate": 7.857672479386458e-06, + "loss": 0.0, + "num_input_tokens_seen": 59714136, + "step": 102920 + }, + { + "epoch": 15.329907655644922, + "grad_norm": 0.0014864779077470303, + "learning_rate": 7.855307400312667e-06, + "loss": 0.0, + "num_input_tokens_seen": 59717176, + "step": 102925 + }, + { + "epoch": 15.33065236818588, + "grad_norm": 0.0010918116895481944, + "learning_rate": 7.85294261088104e-06, + "loss": 0.0, + "num_input_tokens_seen": 59720056, + "step": 102930 + }, + { + "epoch": 15.33139708072684, + "grad_norm": 0.0011850965674966574, + "learning_rate": 7.850578111131513e-06, + "loss": 0.0009, + "num_input_tokens_seen": 59722968, + "step": 102935 + }, + { + "epoch": 15.332141793267798, + "grad_norm": 0.00013076691539026797, + "learning_rate": 7.848213901104045e-06, + "loss": 0.0, + "num_input_tokens_seen": 59725784, + "step": 102940 + }, + { + "epoch": 15.332886505808759, + "grad_norm": 5.076485831523314e-05, + "learning_rate": 7.845849980838574e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59728472, + "step": 102945 + }, + { + "epoch": 15.333631218349717, + "grad_norm": 8.476859875372611e-06, + "learning_rate": 7.843486350375023e-06, + "loss": 0.0, + "num_input_tokens_seen": 59731320, + "step": 102950 + }, + { + "epoch": 15.334375930890676, + "grad_norm": 0.00017201658920384943, + "learning_rate": 7.84112300975334e-06, + "loss": 0.0025, + "num_input_tokens_seen": 59734168, + "step": 102955 + }, + { + "epoch": 15.335120643431635, + "grad_norm": 0.0011945056030526757, + "learning_rate": 7.838759959013439e-06, + "loss": 0.1159, + "num_input_tokens_seen": 59737016, + "step": 102960 + }, + { + "epoch": 15.335865355972594, + "grad_norm": 0.0018764351261779666, + "learning_rate": 7.836397198195252e-06, + "loss": 0.0, + "num_input_tokens_seen": 59739960, + "step": 102965 + }, + { + "epoch": 15.336610068513554, + "grad_norm": 0.0024427692405879498, + "learning_rate": 7.83403472733869e-06, + "loss": 0.0, + "num_input_tokens_seen": 59742904, + "step": 102970 + }, + { + "epoch": 15.337354781054513, + "grad_norm": 0.006404083222150803, + "learning_rate": 7.83167254648366e-06, + "loss": 0.0, + "num_input_tokens_seen": 59745560, + "step": 102975 + }, + { + "epoch": 15.338099493595472, + "grad_norm": 0.00015727734717074782, + "learning_rate": 7.829310655670077e-06, + "loss": 0.0, + "num_input_tokens_seen": 59748600, + "step": 102980 + }, + { + "epoch": 15.33884420613643, + "grad_norm": 115.65184783935547, + "learning_rate": 7.82694905493784e-06, + "loss": 0.2, + "num_input_tokens_seen": 59751512, + "step": 102985 + }, + { + "epoch": 15.339588918677391, + "grad_norm": 0.002351053524762392, + "learning_rate": 7.824587744326847e-06, + "loss": 0.0075, + "num_input_tokens_seen": 59754392, + "step": 102990 + }, + { + "epoch": 15.34033363121835, + "grad_norm": 6.134825525805354e-05, + "learning_rate": 7.822226723876976e-06, + "loss": 0.0, + "num_input_tokens_seen": 59757240, + "step": 102995 + }, + { + "epoch": 15.341078343759309, + "grad_norm": 0.0004071496950928122, + "learning_rate": 7.819865993628139e-06, + "loss": 0.0, + "num_input_tokens_seen": 59760280, + "step": 103000 + }, + { + "epoch": 15.341823056300267, + "grad_norm": 4.9660007789498195e-05, + "learning_rate": 7.817505553620194e-06, + "loss": 0.0, + "num_input_tokens_seen": 59763000, + "step": 103005 + }, + { + "epoch": 15.342567768841228, + "grad_norm": 2.1482692318386398e-05, + "learning_rate": 7.815145403893037e-06, + "loss": 0.0, + "num_input_tokens_seen": 59765880, + "step": 103010 + }, + { + "epoch": 15.343312481382187, + "grad_norm": 0.00011551279021659866, + "learning_rate": 7.812785544486526e-06, + "loss": 0.0, + "num_input_tokens_seen": 59768920, + "step": 103015 + }, + { + "epoch": 15.344057193923145, + "grad_norm": 0.00017709248641040176, + "learning_rate": 7.81042597544054e-06, + "loss": 0.107, + "num_input_tokens_seen": 59771896, + "step": 103020 + }, + { + "epoch": 15.344801906464104, + "grad_norm": 0.0012004624586552382, + "learning_rate": 7.808066696794938e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59775064, + "step": 103025 + }, + { + "epoch": 15.345546619005065, + "grad_norm": 7.609901513205841e-05, + "learning_rate": 7.805707708589569e-06, + "loss": 0.0, + "num_input_tokens_seen": 59777912, + "step": 103030 + }, + { + "epoch": 15.346291331546023, + "grad_norm": 0.0006400640122592449, + "learning_rate": 7.8033490108643e-06, + "loss": 0.0, + "num_input_tokens_seen": 59780824, + "step": 103035 + }, + { + "epoch": 15.347036044086982, + "grad_norm": 0.002290559932589531, + "learning_rate": 7.80099060365897e-06, + "loss": 0.0, + "num_input_tokens_seen": 59783640, + "step": 103040 + }, + { + "epoch": 15.347780756627941, + "grad_norm": 0.0027949826326221228, + "learning_rate": 7.798632487013427e-06, + "loss": 0.0, + "num_input_tokens_seen": 59786776, + "step": 103045 + }, + { + "epoch": 15.348525469168901, + "grad_norm": 0.0006239217473194003, + "learning_rate": 7.796274660967496e-06, + "loss": 0.02, + "num_input_tokens_seen": 59790296, + "step": 103050 + }, + { + "epoch": 15.34927018170986, + "grad_norm": 5.7363278756383806e-05, + "learning_rate": 7.793917125561027e-06, + "loss": 0.0, + "num_input_tokens_seen": 59793112, + "step": 103055 + }, + { + "epoch": 15.350014894250819, + "grad_norm": 0.0005084877484478056, + "learning_rate": 7.79155988083384e-06, + "loss": 0.0005, + "num_input_tokens_seen": 59796120, + "step": 103060 + }, + { + "epoch": 15.350759606791778, + "grad_norm": 0.0004671690403483808, + "learning_rate": 7.78920292682575e-06, + "loss": 0.0, + "num_input_tokens_seen": 59799128, + "step": 103065 + }, + { + "epoch": 15.351504319332738, + "grad_norm": 3.819978155661374e-05, + "learning_rate": 7.786846263576594e-06, + "loss": 0.0, + "num_input_tokens_seen": 59801976, + "step": 103070 + }, + { + "epoch": 15.352249031873697, + "grad_norm": 0.00027253522421233356, + "learning_rate": 7.784489891126167e-06, + "loss": 0.0, + "num_input_tokens_seen": 59805176, + "step": 103075 + }, + { + "epoch": 15.352993744414656, + "grad_norm": 1.8005197489401326e-05, + "learning_rate": 7.782133809514297e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59807832, + "step": 103080 + }, + { + "epoch": 15.353738456955615, + "grad_norm": 0.0002551936195231974, + "learning_rate": 7.779778018780765e-06, + "loss": 0.0, + "num_input_tokens_seen": 59810680, + "step": 103085 + }, + { + "epoch": 15.354483169496575, + "grad_norm": 0.0019101918442174792, + "learning_rate": 7.77742251896539e-06, + "loss": 0.0223, + "num_input_tokens_seen": 59813592, + "step": 103090 + }, + { + "epoch": 15.355227882037534, + "grad_norm": 2.3109740141080692e-05, + "learning_rate": 7.775067310107953e-06, + "loss": 0.0, + "num_input_tokens_seen": 59816408, + "step": 103095 + }, + { + "epoch": 15.355972594578493, + "grad_norm": 8.738057658774778e-06, + "learning_rate": 7.772712392248251e-06, + "loss": 0.0, + "num_input_tokens_seen": 59819544, + "step": 103100 + }, + { + "epoch": 15.356717307119451, + "grad_norm": 6.258388748392463e-05, + "learning_rate": 7.770357765426068e-06, + "loss": 0.0, + "num_input_tokens_seen": 59822616, + "step": 103105 + }, + { + "epoch": 15.357462019660412, + "grad_norm": 0.0001182908090413548, + "learning_rate": 7.768003429681175e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59825528, + "step": 103110 + }, + { + "epoch": 15.35820673220137, + "grad_norm": 1.9939621779485606e-05, + "learning_rate": 7.765649385053353e-06, + "loss": 0.0, + "num_input_tokens_seen": 59828376, + "step": 103115 + }, + { + "epoch": 15.35895144474233, + "grad_norm": 0.000880615902133286, + "learning_rate": 7.76329563158236e-06, + "loss": 0.0, + "num_input_tokens_seen": 59831192, + "step": 103120 + }, + { + "epoch": 15.359696157283288, + "grad_norm": 6.082380059524439e-05, + "learning_rate": 7.760942169307975e-06, + "loss": 0.0, + "num_input_tokens_seen": 59834168, + "step": 103125 + }, + { + "epoch": 15.360440869824249, + "grad_norm": 0.00013771826343145221, + "learning_rate": 7.758588998269944e-06, + "loss": 0.0002, + "num_input_tokens_seen": 59837432, + "step": 103130 + }, + { + "epoch": 15.361185582365207, + "grad_norm": 0.0045860824175179005, + "learning_rate": 7.756236118508036e-06, + "loss": 0.0, + "num_input_tokens_seen": 59840184, + "step": 103135 + }, + { + "epoch": 15.361930294906166, + "grad_norm": 0.02215493842959404, + "learning_rate": 7.753883530061987e-06, + "loss": 0.0292, + "num_input_tokens_seen": 59843224, + "step": 103140 + }, + { + "epoch": 15.362675007447125, + "grad_norm": 0.00014046589785721153, + "learning_rate": 7.751531232971554e-06, + "loss": 0.0, + "num_input_tokens_seen": 59846264, + "step": 103145 + }, + { + "epoch": 15.363419719988084, + "grad_norm": 0.00029515393543988466, + "learning_rate": 7.749179227276471e-06, + "loss": 0.0001, + "num_input_tokens_seen": 59849144, + "step": 103150 + }, + { + "epoch": 15.364164432529044, + "grad_norm": 0.0006385680171661079, + "learning_rate": 7.746827513016464e-06, + "loss": 0.0, + "num_input_tokens_seen": 59852024, + "step": 103155 + }, + { + "epoch": 15.364909145070003, + "grad_norm": 0.004962884820997715, + "learning_rate": 7.744476090231275e-06, + "loss": 0.0, + "num_input_tokens_seen": 59855032, + "step": 103160 + }, + { + "epoch": 15.365653857610962, + "grad_norm": 0.00041454576421529055, + "learning_rate": 7.742124958960622e-06, + "loss": 0.0, + "num_input_tokens_seen": 59857816, + "step": 103165 + }, + { + "epoch": 15.36639857015192, + "grad_norm": 9.313789632869884e-05, + "learning_rate": 7.739774119244233e-06, + "loss": 0.0, + "num_input_tokens_seen": 59860728, + "step": 103170 + }, + { + "epoch": 15.367143282692881, + "grad_norm": 0.0003238499630242586, + "learning_rate": 7.737423571121818e-06, + "loss": 0.0, + "num_input_tokens_seen": 59863448, + "step": 103175 + }, + { + "epoch": 15.36788799523384, + "grad_norm": 0.0009149261168204248, + "learning_rate": 7.73507331463309e-06, + "loss": 0.0017, + "num_input_tokens_seen": 59866264, + "step": 103180 + }, + { + "epoch": 15.368632707774799, + "grad_norm": 0.003147375537082553, + "learning_rate": 7.732723349817747e-06, + "loss": 0.0, + "num_input_tokens_seen": 59869176, + "step": 103185 + }, + { + "epoch": 15.369377420315757, + "grad_norm": 0.0006165379891172051, + "learning_rate": 7.730373676715488e-06, + "loss": 0.0, + "num_input_tokens_seen": 59871960, + "step": 103190 + }, + { + "epoch": 15.370122132856718, + "grad_norm": 0.00015663196973036975, + "learning_rate": 7.728024295366018e-06, + "loss": 0.2138, + "num_input_tokens_seen": 59874872, + "step": 103195 + }, + { + "epoch": 15.370866845397677, + "grad_norm": 5.285615407046862e-05, + "learning_rate": 7.725675205809019e-06, + "loss": 0.0, + "num_input_tokens_seen": 59877464, + "step": 103200 + }, + { + "epoch": 15.371611557938635, + "grad_norm": 0.0003626077086664736, + "learning_rate": 7.723326408084186e-06, + "loss": 0.0, + "num_input_tokens_seen": 59880504, + "step": 103205 + }, + { + "epoch": 15.372356270479594, + "grad_norm": 9.030084765981883e-05, + "learning_rate": 7.720977902231189e-06, + "loss": 0.0, + "num_input_tokens_seen": 59883320, + "step": 103210 + }, + { + "epoch": 15.373100983020555, + "grad_norm": 0.00010422131890663877, + "learning_rate": 7.718629688289713e-06, + "loss": 0.0, + "num_input_tokens_seen": 59886136, + "step": 103215 + }, + { + "epoch": 15.373845695561513, + "grad_norm": 2.9533768611145206e-05, + "learning_rate": 7.716281766299419e-06, + "loss": 0.0, + "num_input_tokens_seen": 59889304, + "step": 103220 + }, + { + "epoch": 15.374590408102472, + "grad_norm": 0.0026891955640167, + "learning_rate": 7.713934136299985e-06, + "loss": 0.0, + "num_input_tokens_seen": 59892088, + "step": 103225 + }, + { + "epoch": 15.375335120643431, + "grad_norm": 0.01479940302670002, + "learning_rate": 7.711586798331066e-06, + "loss": 0.0, + "num_input_tokens_seen": 59895000, + "step": 103230 + }, + { + "epoch": 15.376079833184392, + "grad_norm": 5.553948358283378e-05, + "learning_rate": 7.70923975243231e-06, + "loss": 0.0, + "num_input_tokens_seen": 59897752, + "step": 103235 + }, + { + "epoch": 15.37682454572535, + "grad_norm": 0.0003551245608832687, + "learning_rate": 7.70689299864338e-06, + "loss": 0.0, + "num_input_tokens_seen": 59900984, + "step": 103240 + }, + { + "epoch": 15.377569258266309, + "grad_norm": 5.532150316867046e-05, + "learning_rate": 7.704546537003918e-06, + "loss": 0.0, + "num_input_tokens_seen": 59903928, + "step": 103245 + }, + { + "epoch": 15.378313970807268, + "grad_norm": 3.575741720851511e-05, + "learning_rate": 7.702200367553563e-06, + "loss": 0.0, + "num_input_tokens_seen": 59906616, + "step": 103250 + }, + { + "epoch": 15.379058683348228, + "grad_norm": 6.7712135205511e-05, + "learning_rate": 7.699854490331948e-06, + "loss": 0.0, + "num_input_tokens_seen": 59909400, + "step": 103255 + }, + { + "epoch": 15.379803395889187, + "grad_norm": 0.00010622264380799606, + "learning_rate": 7.697508905378712e-06, + "loss": 0.0, + "num_input_tokens_seen": 59912504, + "step": 103260 + }, + { + "epoch": 15.380548108430146, + "grad_norm": 0.00130240258295089, + "learning_rate": 7.69516361273348e-06, + "loss": 0.0, + "num_input_tokens_seen": 59915224, + "step": 103265 + }, + { + "epoch": 15.381292820971105, + "grad_norm": 0.022308316081762314, + "learning_rate": 7.692818612435862e-06, + "loss": 0.0, + "num_input_tokens_seen": 59918456, + "step": 103270 + }, + { + "epoch": 15.382037533512065, + "grad_norm": 8.925551810534671e-05, + "learning_rate": 7.690473904525491e-06, + "loss": 0.0, + "num_input_tokens_seen": 59921272, + "step": 103275 + }, + { + "epoch": 15.382782246053024, + "grad_norm": 0.005667903460562229, + "learning_rate": 7.688129489041963e-06, + "loss": 0.0, + "num_input_tokens_seen": 59924120, + "step": 103280 + }, + { + "epoch": 15.383526958593983, + "grad_norm": 1.215054089698242e-05, + "learning_rate": 7.685785366024901e-06, + "loss": 0.0, + "num_input_tokens_seen": 59927064, + "step": 103285 + }, + { + "epoch": 15.384271671134941, + "grad_norm": 1.9622078980319202e-05, + "learning_rate": 7.683441535513888e-06, + "loss": 0.0, + "num_input_tokens_seen": 59929848, + "step": 103290 + }, + { + "epoch": 15.3850163836759, + "grad_norm": 3.443727837293409e-05, + "learning_rate": 7.681097997548539e-06, + "loss": 0.0, + "num_input_tokens_seen": 59932856, + "step": 103295 + }, + { + "epoch": 15.38576109621686, + "grad_norm": 6.825305717939045e-06, + "learning_rate": 7.678754752168438e-06, + "loss": 0.0, + "num_input_tokens_seen": 59935864, + "step": 103300 + }, + { + "epoch": 15.38650580875782, + "grad_norm": 0.005761174019426107, + "learning_rate": 7.676411799413163e-06, + "loss": 0.0682, + "num_input_tokens_seen": 59938904, + "step": 103305 + }, + { + "epoch": 15.387250521298778, + "grad_norm": 0.0004983097896911204, + "learning_rate": 7.674069139322312e-06, + "loss": 0.0, + "num_input_tokens_seen": 59941816, + "step": 103310 + }, + { + "epoch": 15.387995233839739, + "grad_norm": 5.547181444853777e-06, + "learning_rate": 7.671726771935453e-06, + "loss": 0.0, + "num_input_tokens_seen": 59944792, + "step": 103315 + }, + { + "epoch": 15.388739946380698, + "grad_norm": 0.0035644294694066048, + "learning_rate": 7.669384697292158e-06, + "loss": 0.0, + "num_input_tokens_seen": 59947896, + "step": 103320 + }, + { + "epoch": 15.389484658921656, + "grad_norm": 0.0006253598839975893, + "learning_rate": 7.667042915431987e-06, + "loss": 0.0002, + "num_input_tokens_seen": 59950712, + "step": 103325 + }, + { + "epoch": 15.390229371462615, + "grad_norm": 0.000224001458263956, + "learning_rate": 7.66470142639452e-06, + "loss": 0.0, + "num_input_tokens_seen": 59953752, + "step": 103330 + }, + { + "epoch": 15.390974084003574, + "grad_norm": 0.00010123341053258628, + "learning_rate": 7.662360230219293e-06, + "loss": 0.0084, + "num_input_tokens_seen": 59956824, + "step": 103335 + }, + { + "epoch": 15.391718796544534, + "grad_norm": 6.231698036193848, + "learning_rate": 7.660019326945874e-06, + "loss": 0.006, + "num_input_tokens_seen": 59959576, + "step": 103340 + }, + { + "epoch": 15.392463509085493, + "grad_norm": 0.0001261801371583715, + "learning_rate": 7.657678716613808e-06, + "loss": 0.0, + "num_input_tokens_seen": 59962232, + "step": 103345 + }, + { + "epoch": 15.393208221626452, + "grad_norm": 0.023347996175289154, + "learning_rate": 7.655338399262627e-06, + "loss": 0.0, + "num_input_tokens_seen": 59964856, + "step": 103350 + }, + { + "epoch": 15.39395293416741, + "grad_norm": 0.00017037181532941759, + "learning_rate": 7.652998374931882e-06, + "loss": 0.017, + "num_input_tokens_seen": 59967960, + "step": 103355 + }, + { + "epoch": 15.394697646708371, + "grad_norm": 0.0013949754647910595, + "learning_rate": 7.65065864366109e-06, + "loss": 0.0, + "num_input_tokens_seen": 59970808, + "step": 103360 + }, + { + "epoch": 15.39544235924933, + "grad_norm": 0.000997927156277001, + "learning_rate": 7.648319205489798e-06, + "loss": 0.0, + "num_input_tokens_seen": 59973656, + "step": 103365 + }, + { + "epoch": 15.396187071790289, + "grad_norm": 15.189732551574707, + "learning_rate": 7.645980060457517e-06, + "loss": 0.0014, + "num_input_tokens_seen": 59977048, + "step": 103370 + }, + { + "epoch": 15.396931784331247, + "grad_norm": 0.0007866615778766572, + "learning_rate": 7.643641208603764e-06, + "loss": 0.0, + "num_input_tokens_seen": 59980024, + "step": 103375 + }, + { + "epoch": 15.397676496872208, + "grad_norm": 2.922915700764861e-05, + "learning_rate": 7.641302649968043e-06, + "loss": 0.0, + "num_input_tokens_seen": 59983000, + "step": 103380 + }, + { + "epoch": 15.398421209413167, + "grad_norm": 0.00045575309195555747, + "learning_rate": 7.638964384589881e-06, + "loss": 0.029, + "num_input_tokens_seen": 59985848, + "step": 103385 + }, + { + "epoch": 15.399165921954125, + "grad_norm": 1.199373036797624e-05, + "learning_rate": 7.63662641250877e-06, + "loss": 0.0, + "num_input_tokens_seen": 59988664, + "step": 103390 + }, + { + "epoch": 15.399910634495084, + "grad_norm": 0.0006169801927171648, + "learning_rate": 7.6342887337642e-06, + "loss": 0.0336, + "num_input_tokens_seen": 59991672, + "step": 103395 + }, + { + "epoch": 15.400655347036045, + "grad_norm": 0.00026827590772882104, + "learning_rate": 7.631951348395683e-06, + "loss": 0.0, + "num_input_tokens_seen": 59994360, + "step": 103400 + }, + { + "epoch": 15.401400059577004, + "grad_norm": 3.21331899613142e-05, + "learning_rate": 7.629614256442685e-06, + "loss": 0.0, + "num_input_tokens_seen": 59997464, + "step": 103405 + }, + { + "epoch": 15.402144772117962, + "grad_norm": 2.7109526854474097e-05, + "learning_rate": 7.6272774579447065e-06, + "loss": 0.0376, + "num_input_tokens_seen": 60000472, + "step": 103410 + }, + { + "epoch": 15.402889484658921, + "grad_norm": 0.0003779045073315501, + "learning_rate": 7.6249409529412145e-06, + "loss": 0.0, + "num_input_tokens_seen": 60003448, + "step": 103415 + }, + { + "epoch": 15.403634197199882, + "grad_norm": 0.00019584214896894991, + "learning_rate": 7.622604741471692e-06, + "loss": 0.1439, + "num_input_tokens_seen": 60006552, + "step": 103420 + }, + { + "epoch": 15.40437890974084, + "grad_norm": 0.0005632626707665622, + "learning_rate": 7.620268823575599e-06, + "loss": 0.0, + "num_input_tokens_seen": 60009176, + "step": 103425 + }, + { + "epoch": 15.405123622281799, + "grad_norm": 1.591725231264718e-05, + "learning_rate": 7.617933199292396e-06, + "loss": 0.0, + "num_input_tokens_seen": 60011704, + "step": 103430 + }, + { + "epoch": 15.405868334822758, + "grad_norm": 0.0012881079455837607, + "learning_rate": 7.61559786866155e-06, + "loss": 0.0422, + "num_input_tokens_seen": 60014776, + "step": 103435 + }, + { + "epoch": 15.406613047363718, + "grad_norm": 0.00014718731108587235, + "learning_rate": 7.613262831722509e-06, + "loss": 0.0, + "num_input_tokens_seen": 60017848, + "step": 103440 + }, + { + "epoch": 15.407357759904677, + "grad_norm": 3.5343196941539645e-05, + "learning_rate": 7.610928088514724e-06, + "loss": 0.0, + "num_input_tokens_seen": 60020792, + "step": 103445 + }, + { + "epoch": 15.408102472445636, + "grad_norm": 2.4426678919553524e-06, + "learning_rate": 7.608593639077627e-06, + "loss": 0.0, + "num_input_tokens_seen": 60023544, + "step": 103450 + }, + { + "epoch": 15.408847184986595, + "grad_norm": 6.994839350227267e-05, + "learning_rate": 7.60625948345067e-06, + "loss": 0.0, + "num_input_tokens_seen": 60026424, + "step": 103455 + }, + { + "epoch": 15.409591897527555, + "grad_norm": 6.680825026705861e-05, + "learning_rate": 7.603925621673275e-06, + "loss": 0.0, + "num_input_tokens_seen": 60029720, + "step": 103460 + }, + { + "epoch": 15.410336610068514, + "grad_norm": 8.237306610681117e-05, + "learning_rate": 7.601592053784884e-06, + "loss": 0.0, + "num_input_tokens_seen": 60032728, + "step": 103465 + }, + { + "epoch": 15.411081322609473, + "grad_norm": 2.7212368877371773e-05, + "learning_rate": 7.599258779824911e-06, + "loss": 0.0, + "num_input_tokens_seen": 60035448, + "step": 103470 + }, + { + "epoch": 15.411826035150431, + "grad_norm": 6.281585228862241e-05, + "learning_rate": 7.596925799832769e-06, + "loss": 0.0, + "num_input_tokens_seen": 60038104, + "step": 103475 + }, + { + "epoch": 15.41257074769139, + "grad_norm": 0.00014670181553810835, + "learning_rate": 7.594593113847887e-06, + "loss": 0.0, + "num_input_tokens_seen": 60040760, + "step": 103480 + }, + { + "epoch": 15.41331546023235, + "grad_norm": 0.007040924858301878, + "learning_rate": 7.592260721909655e-06, + "loss": 0.0, + "num_input_tokens_seen": 60043640, + "step": 103485 + }, + { + "epoch": 15.41406017277331, + "grad_norm": 0.00015304746921174228, + "learning_rate": 7.589928624057494e-06, + "loss": 0.0289, + "num_input_tokens_seen": 60046392, + "step": 103490 + }, + { + "epoch": 15.414804885314268, + "grad_norm": 2.9405253371805884e-05, + "learning_rate": 7.587596820330783e-06, + "loss": 0.1005, + "num_input_tokens_seen": 60049336, + "step": 103495 + }, + { + "epoch": 15.415549597855227, + "grad_norm": 0.0016798089491203427, + "learning_rate": 7.585265310768938e-06, + "loss": 0.0, + "num_input_tokens_seen": 60052568, + "step": 103500 + }, + { + "epoch": 15.416294310396188, + "grad_norm": 0.02147459052503109, + "learning_rate": 7.582934095411337e-06, + "loss": 0.0, + "num_input_tokens_seen": 60055416, + "step": 103505 + }, + { + "epoch": 15.417039022937146, + "grad_norm": 0.0008176640258170664, + "learning_rate": 7.58060317429736e-06, + "loss": 0.0, + "num_input_tokens_seen": 60058232, + "step": 103510 + }, + { + "epoch": 15.417783735478105, + "grad_norm": 8.159162098309025e-05, + "learning_rate": 7.5782725474663894e-06, + "loss": 0.0, + "num_input_tokens_seen": 60061080, + "step": 103515 + }, + { + "epoch": 15.418528448019064, + "grad_norm": 0.00067380431573838, + "learning_rate": 7.575942214957787e-06, + "loss": 0.0, + "num_input_tokens_seen": 60064024, + "step": 103520 + }, + { + "epoch": 15.419273160560024, + "grad_norm": 0.0046013169921934605, + "learning_rate": 7.573612176810943e-06, + "loss": 0.0, + "num_input_tokens_seen": 60066840, + "step": 103525 + }, + { + "epoch": 15.420017873100983, + "grad_norm": 5.491403862833977e-05, + "learning_rate": 7.5712824330651995e-06, + "loss": 0.0, + "num_input_tokens_seen": 60069592, + "step": 103530 + }, + { + "epoch": 15.420762585641942, + "grad_norm": 0.0004977218341082335, + "learning_rate": 7.568952983759936e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60072344, + "step": 103535 + }, + { + "epoch": 15.4215072981829, + "grad_norm": 0.0006642311927862465, + "learning_rate": 7.566623828934485e-06, + "loss": 0.0, + "num_input_tokens_seen": 60075256, + "step": 103540 + }, + { + "epoch": 15.422252010723861, + "grad_norm": 0.00033601876930333674, + "learning_rate": 7.5642949686282165e-06, + "loss": 0.0592, + "num_input_tokens_seen": 60078072, + "step": 103545 + }, + { + "epoch": 15.42299672326482, + "grad_norm": 6.0048550949431956e-05, + "learning_rate": 7.561966402880461e-06, + "loss": 0.0, + "num_input_tokens_seen": 60080856, + "step": 103550 + }, + { + "epoch": 15.423741435805779, + "grad_norm": 4.686038664658554e-05, + "learning_rate": 7.559638131730554e-06, + "loss": 0.0002, + "num_input_tokens_seen": 60083672, + "step": 103555 + }, + { + "epoch": 15.424486148346737, + "grad_norm": 5.625473568215966e-05, + "learning_rate": 7.557310155217842e-06, + "loss": 0.0, + "num_input_tokens_seen": 60086776, + "step": 103560 + }, + { + "epoch": 15.425230860887698, + "grad_norm": 0.0009420167189091444, + "learning_rate": 7.554982473381639e-06, + "loss": 0.0, + "num_input_tokens_seen": 60090104, + "step": 103565 + }, + { + "epoch": 15.425975573428657, + "grad_norm": 0.00029191322391852736, + "learning_rate": 7.552655086261287e-06, + "loss": 0.0, + "num_input_tokens_seen": 60092600, + "step": 103570 + }, + { + "epoch": 15.426720285969616, + "grad_norm": 0.0034797906409949064, + "learning_rate": 7.550327993896092e-06, + "loss": 0.0002, + "num_input_tokens_seen": 60095192, + "step": 103575 + }, + { + "epoch": 15.427464998510574, + "grad_norm": 0.0024703750386834145, + "learning_rate": 7.548001196325372e-06, + "loss": 0.0, + "num_input_tokens_seen": 60098136, + "step": 103580 + }, + { + "epoch": 15.428209711051535, + "grad_norm": 0.001405031536705792, + "learning_rate": 7.545674693588434e-06, + "loss": 0.0, + "num_input_tokens_seen": 60100696, + "step": 103585 + }, + { + "epoch": 15.428954423592494, + "grad_norm": 0.10970757156610489, + "learning_rate": 7.543348485724572e-06, + "loss": 0.0, + "num_input_tokens_seen": 60103288, + "step": 103590 + }, + { + "epoch": 15.429699136133452, + "grad_norm": 0.00022007335792295635, + "learning_rate": 7.541022572773107e-06, + "loss": 0.0, + "num_input_tokens_seen": 60106424, + "step": 103595 + }, + { + "epoch": 15.430443848674411, + "grad_norm": 0.00014606928743887693, + "learning_rate": 7.538696954773311e-06, + "loss": 0.0, + "num_input_tokens_seen": 60109304, + "step": 103600 + }, + { + "epoch": 15.431188561215372, + "grad_norm": 9.716688509797677e-05, + "learning_rate": 7.536371631764491e-06, + "loss": 0.0, + "num_input_tokens_seen": 60112440, + "step": 103605 + }, + { + "epoch": 15.43193327375633, + "grad_norm": 0.0001879808696685359, + "learning_rate": 7.534046603785916e-06, + "loss": 0.0, + "num_input_tokens_seen": 60115160, + "step": 103610 + }, + { + "epoch": 15.43267798629729, + "grad_norm": 6.308286538114771e-05, + "learning_rate": 7.531721870876879e-06, + "loss": 0.0, + "num_input_tokens_seen": 60118264, + "step": 103615 + }, + { + "epoch": 15.433422698838248, + "grad_norm": 0.00017724820645526052, + "learning_rate": 7.529397433076638e-06, + "loss": 0.0284, + "num_input_tokens_seen": 60120888, + "step": 103620 + }, + { + "epoch": 15.434167411379208, + "grad_norm": 0.00013556712656281888, + "learning_rate": 7.52707329042448e-06, + "loss": 0.0, + "num_input_tokens_seen": 60123608, + "step": 103625 + }, + { + "epoch": 15.434912123920167, + "grad_norm": 24.578168869018555, + "learning_rate": 7.524749442959661e-06, + "loss": 0.0057, + "num_input_tokens_seen": 60126744, + "step": 103630 + }, + { + "epoch": 15.435656836461126, + "grad_norm": 0.00013857056910637766, + "learning_rate": 7.522425890721432e-06, + "loss": 0.005, + "num_input_tokens_seen": 60129592, + "step": 103635 + }, + { + "epoch": 15.436401549002085, + "grad_norm": 2.9068161893519573e-05, + "learning_rate": 7.52010263374906e-06, + "loss": 0.0, + "num_input_tokens_seen": 60132472, + "step": 103640 + }, + { + "epoch": 15.437146261543045, + "grad_norm": 0.0001978669606614858, + "learning_rate": 7.51777967208179e-06, + "loss": 0.0, + "num_input_tokens_seen": 60135224, + "step": 103645 + }, + { + "epoch": 15.437890974084004, + "grad_norm": 2.366751323279459e-05, + "learning_rate": 7.515457005758864e-06, + "loss": 0.0, + "num_input_tokens_seen": 60137848, + "step": 103650 + }, + { + "epoch": 15.438635686624963, + "grad_norm": 238.2144317626953, + "learning_rate": 7.5131346348195105e-06, + "loss": 0.1035, + "num_input_tokens_seen": 60140600, + "step": 103655 + }, + { + "epoch": 15.439380399165922, + "grad_norm": 0.0006475057452917099, + "learning_rate": 7.510812559302985e-06, + "loss": 0.0, + "num_input_tokens_seen": 60143640, + "step": 103660 + }, + { + "epoch": 15.44012511170688, + "grad_norm": 13.084464073181152, + "learning_rate": 7.508490779248506e-06, + "loss": 0.0009, + "num_input_tokens_seen": 60146392, + "step": 103665 + }, + { + "epoch": 15.44086982424784, + "grad_norm": 0.00017164697055704892, + "learning_rate": 7.5061692946952896e-06, + "loss": 0.0, + "num_input_tokens_seen": 60148984, + "step": 103670 + }, + { + "epoch": 15.4416145367888, + "grad_norm": 7.19463569112122e-05, + "learning_rate": 7.503848105682571e-06, + "loss": 0.0, + "num_input_tokens_seen": 60152056, + "step": 103675 + }, + { + "epoch": 15.442359249329758, + "grad_norm": 0.0006707796710543334, + "learning_rate": 7.501527212249549e-06, + "loss": 0.0, + "num_input_tokens_seen": 60155384, + "step": 103680 + }, + { + "epoch": 15.443103961870717, + "grad_norm": 9.286078238801565e-06, + "learning_rate": 7.4992066144354475e-06, + "loss": 0.0, + "num_input_tokens_seen": 60158104, + "step": 103685 + }, + { + "epoch": 15.443848674411678, + "grad_norm": 5.966154731140705e-06, + "learning_rate": 7.496886312279455e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60161048, + "step": 103690 + }, + { + "epoch": 15.444593386952636, + "grad_norm": 0.00017539433611091226, + "learning_rate": 7.494566305820788e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60164088, + "step": 103695 + }, + { + "epoch": 15.445338099493595, + "grad_norm": 6.70587396598421e-05, + "learning_rate": 7.492246595098629e-06, + "loss": 0.0, + "num_input_tokens_seen": 60166872, + "step": 103700 + }, + { + "epoch": 15.446082812034554, + "grad_norm": 3.403279697522521e-05, + "learning_rate": 7.489927180152173e-06, + "loss": 0.0, + "num_input_tokens_seen": 60169624, + "step": 103705 + }, + { + "epoch": 15.446827524575514, + "grad_norm": 0.00016169865557458252, + "learning_rate": 7.487608061020599e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60172344, + "step": 103710 + }, + { + "epoch": 15.447572237116473, + "grad_norm": 0.0010944841196760535, + "learning_rate": 7.485289237743079e-06, + "loss": 0.0, + "num_input_tokens_seen": 60175160, + "step": 103715 + }, + { + "epoch": 15.448316949657432, + "grad_norm": 0.0002685956424102187, + "learning_rate": 7.482970710358806e-06, + "loss": 0.0, + "num_input_tokens_seen": 60177944, + "step": 103720 + }, + { + "epoch": 15.44906166219839, + "grad_norm": 7.413686398649588e-05, + "learning_rate": 7.48065247890693e-06, + "loss": 0.0, + "num_input_tokens_seen": 60180664, + "step": 103725 + }, + { + "epoch": 15.449806374739351, + "grad_norm": 8.350589632755145e-05, + "learning_rate": 7.478334543426632e-06, + "loss": 0.0, + "num_input_tokens_seen": 60183576, + "step": 103730 + }, + { + "epoch": 15.45055108728031, + "grad_norm": 4.806429205927998e-05, + "learning_rate": 7.476016903957058e-06, + "loss": 0.0, + "num_input_tokens_seen": 60186520, + "step": 103735 + }, + { + "epoch": 15.451295799821269, + "grad_norm": 1.1453102160885464e-05, + "learning_rate": 7.473699560537376e-06, + "loss": 0.3438, + "num_input_tokens_seen": 60189464, + "step": 103740 + }, + { + "epoch": 15.452040512362228, + "grad_norm": 0.0001219805417349562, + "learning_rate": 7.471382513206718e-06, + "loss": 0.0, + "num_input_tokens_seen": 60192088, + "step": 103745 + }, + { + "epoch": 15.452785224903188, + "grad_norm": 0.013144117780029774, + "learning_rate": 7.469065762004243e-06, + "loss": 0.0, + "num_input_tokens_seen": 60195032, + "step": 103750 + }, + { + "epoch": 15.453529937444147, + "grad_norm": 2.617024802020751e-05, + "learning_rate": 7.466749306969087e-06, + "loss": 0.0, + "num_input_tokens_seen": 60198040, + "step": 103755 + }, + { + "epoch": 15.454274649985106, + "grad_norm": 0.018187740817666054, + "learning_rate": 7.464433148140371e-06, + "loss": 0.0, + "num_input_tokens_seen": 60200920, + "step": 103760 + }, + { + "epoch": 15.455019362526064, + "grad_norm": 0.00020370061974972486, + "learning_rate": 7.462117285557246e-06, + "loss": 0.0, + "num_input_tokens_seen": 60203896, + "step": 103765 + }, + { + "epoch": 15.455764075067025, + "grad_norm": 0.0004362973559182137, + "learning_rate": 7.459801719258821e-06, + "loss": 0.0, + "num_input_tokens_seen": 60206488, + "step": 103770 + }, + { + "epoch": 15.456508787607984, + "grad_norm": 0.0005783055094070733, + "learning_rate": 7.457486449284221e-06, + "loss": 0.0, + "num_input_tokens_seen": 60209400, + "step": 103775 + }, + { + "epoch": 15.457253500148942, + "grad_norm": 0.004357870202511549, + "learning_rate": 7.455171475672551e-06, + "loss": 0.0, + "num_input_tokens_seen": 60212696, + "step": 103780 + }, + { + "epoch": 15.457998212689901, + "grad_norm": 0.0006694543408229947, + "learning_rate": 7.4528567984629344e-06, + "loss": 0.0033, + "num_input_tokens_seen": 60215640, + "step": 103785 + }, + { + "epoch": 15.458742925230862, + "grad_norm": 0.00033475819509476423, + "learning_rate": 7.450542417694467e-06, + "loss": 0.0, + "num_input_tokens_seen": 60218200, + "step": 103790 + }, + { + "epoch": 15.45948763777182, + "grad_norm": 0.0005647373618558049, + "learning_rate": 7.448228333406241e-06, + "loss": 0.0047, + "num_input_tokens_seen": 60221336, + "step": 103795 + }, + { + "epoch": 15.46023235031278, + "grad_norm": 5.2526509534800425e-05, + "learning_rate": 7.445914545637367e-06, + "loss": 0.0, + "num_input_tokens_seen": 60225368, + "step": 103800 + }, + { + "epoch": 15.460977062853738, + "grad_norm": 0.00018209553672932088, + "learning_rate": 7.443601054426919e-06, + "loss": 0.0003, + "num_input_tokens_seen": 60228376, + "step": 103805 + }, + { + "epoch": 15.461721775394698, + "grad_norm": 3.385046511539258e-05, + "learning_rate": 7.441287859813995e-06, + "loss": 0.0, + "num_input_tokens_seen": 60231000, + "step": 103810 + }, + { + "epoch": 15.462466487935657, + "grad_norm": 1.2733356015814934e-05, + "learning_rate": 7.438974961837655e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60233880, + "step": 103815 + }, + { + "epoch": 15.463211200476616, + "grad_norm": 0.0011901517864316702, + "learning_rate": 7.436662360536997e-06, + "loss": 0.0, + "num_input_tokens_seen": 60236824, + "step": 103820 + }, + { + "epoch": 15.463955913017575, + "grad_norm": 0.002302034990862012, + "learning_rate": 7.43435005595107e-06, + "loss": 0.0002, + "num_input_tokens_seen": 60239864, + "step": 103825 + }, + { + "epoch": 15.464700625558535, + "grad_norm": 1.4225855920813046e-05, + "learning_rate": 7.432038048118953e-06, + "loss": 0.008, + "num_input_tokens_seen": 60242584, + "step": 103830 + }, + { + "epoch": 15.465445338099494, + "grad_norm": 0.00017505222058389336, + "learning_rate": 7.429726337079695e-06, + "loss": 0.0, + "num_input_tokens_seen": 60245400, + "step": 103835 + }, + { + "epoch": 15.466190050640453, + "grad_norm": 1.8005113815888762e-05, + "learning_rate": 7.427414922872356e-06, + "loss": 0.0, + "num_input_tokens_seen": 60248440, + "step": 103840 + }, + { + "epoch": 15.466934763181412, + "grad_norm": 0.0009077109862118959, + "learning_rate": 7.4251038055359825e-06, + "loss": 0.0, + "num_input_tokens_seen": 60251160, + "step": 103845 + }, + { + "epoch": 15.46767947572237, + "grad_norm": 0.0006337217637337744, + "learning_rate": 7.422792985109608e-06, + "loss": 0.0, + "num_input_tokens_seen": 60254424, + "step": 103850 + }, + { + "epoch": 15.46842418826333, + "grad_norm": 0.00016237089585047215, + "learning_rate": 7.420482461632289e-06, + "loss": 0.0, + "num_input_tokens_seen": 60257496, + "step": 103855 + }, + { + "epoch": 15.46916890080429, + "grad_norm": 0.00027545183547772467, + "learning_rate": 7.418172235143045e-06, + "loss": 0.0, + "num_input_tokens_seen": 60260344, + "step": 103860 + }, + { + "epoch": 15.469913613345248, + "grad_norm": 0.0009552660631015897, + "learning_rate": 7.415862305680921e-06, + "loss": 0.0, + "num_input_tokens_seen": 60263064, + "step": 103865 + }, + { + "epoch": 15.470658325886207, + "grad_norm": 0.00013819921878166497, + "learning_rate": 7.413552673284929e-06, + "loss": 0.0, + "num_input_tokens_seen": 60265848, + "step": 103870 + }, + { + "epoch": 15.471403038427168, + "grad_norm": 1.8423928850097582e-05, + "learning_rate": 7.411243337994084e-06, + "loss": 0.0, + "num_input_tokens_seen": 60268504, + "step": 103875 + }, + { + "epoch": 15.472147750968126, + "grad_norm": 4.8712699936004356e-05, + "learning_rate": 7.4089342998474145e-06, + "loss": 0.0, + "num_input_tokens_seen": 60271576, + "step": 103880 + }, + { + "epoch": 15.472892463509085, + "grad_norm": 1.768065703799948e-05, + "learning_rate": 7.406625558883912e-06, + "loss": 0.0, + "num_input_tokens_seen": 60274648, + "step": 103885 + }, + { + "epoch": 15.473637176050044, + "grad_norm": 0.0001293687819270417, + "learning_rate": 7.404317115142598e-06, + "loss": 0.0, + "num_input_tokens_seen": 60277688, + "step": 103890 + }, + { + "epoch": 15.474381888591004, + "grad_norm": 7.53643544157967e-05, + "learning_rate": 7.402008968662455e-06, + "loss": 0.0, + "num_input_tokens_seen": 60280632, + "step": 103895 + }, + { + "epoch": 15.475126601131963, + "grad_norm": 0.0003531594411469996, + "learning_rate": 7.399701119482494e-06, + "loss": 0.0, + "num_input_tokens_seen": 60283704, + "step": 103900 + }, + { + "epoch": 15.475871313672922, + "grad_norm": 0.0033791507594287395, + "learning_rate": 7.397393567641694e-06, + "loss": 0.0, + "num_input_tokens_seen": 60286680, + "step": 103905 + }, + { + "epoch": 15.47661602621388, + "grad_norm": 8.119183621602133e-05, + "learning_rate": 7.395086313179037e-06, + "loss": 0.0, + "num_input_tokens_seen": 60289592, + "step": 103910 + }, + { + "epoch": 15.477360738754841, + "grad_norm": 2.0769066395587288e-05, + "learning_rate": 7.392779356133506e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60292280, + "step": 103915 + }, + { + "epoch": 15.4781054512958, + "grad_norm": 0.004722032230347395, + "learning_rate": 7.390472696544065e-06, + "loss": 0.0, + "num_input_tokens_seen": 60295000, + "step": 103920 + }, + { + "epoch": 15.478850163836759, + "grad_norm": 0.0002913082716986537, + "learning_rate": 7.388166334449697e-06, + "loss": 0.0, + "num_input_tokens_seen": 60297976, + "step": 103925 + }, + { + "epoch": 15.479594876377718, + "grad_norm": 0.0011846277629956603, + "learning_rate": 7.3858602698893495e-06, + "loss": 0.1938, + "num_input_tokens_seen": 60300760, + "step": 103930 + }, + { + "epoch": 15.480339588918678, + "grad_norm": 9.667263657320291e-05, + "learning_rate": 7.383554502902001e-06, + "loss": 0.0, + "num_input_tokens_seen": 60303960, + "step": 103935 + }, + { + "epoch": 15.481084301459637, + "grad_norm": 1.8713006284087896e-05, + "learning_rate": 7.381249033526585e-06, + "loss": 0.0, + "num_input_tokens_seen": 60306456, + "step": 103940 + }, + { + "epoch": 15.481829014000596, + "grad_norm": 0.006940951105207205, + "learning_rate": 7.37894386180207e-06, + "loss": 0.0219, + "num_input_tokens_seen": 60309144, + "step": 103945 + }, + { + "epoch": 15.482573726541554, + "grad_norm": 9.34443887672387e-05, + "learning_rate": 7.376638987767387e-06, + "loss": 0.0, + "num_input_tokens_seen": 60312216, + "step": 103950 + }, + { + "epoch": 15.483318439082515, + "grad_norm": 0.0007203016430139542, + "learning_rate": 7.37433441146147e-06, + "loss": 0.0, + "num_input_tokens_seen": 60315096, + "step": 103955 + }, + { + "epoch": 15.484063151623474, + "grad_norm": 0.0013399786548689008, + "learning_rate": 7.372030132923266e-06, + "loss": 0.0, + "num_input_tokens_seen": 60317912, + "step": 103960 + }, + { + "epoch": 15.484807864164432, + "grad_norm": 2.3484664779971354e-05, + "learning_rate": 7.369726152191692e-06, + "loss": 0.0, + "num_input_tokens_seen": 60320920, + "step": 103965 + }, + { + "epoch": 15.485552576705391, + "grad_norm": 0.0002306013775523752, + "learning_rate": 7.367422469305679e-06, + "loss": 0.0478, + "num_input_tokens_seen": 60323704, + "step": 103970 + }, + { + "epoch": 15.486297289246352, + "grad_norm": 1.3538950042857323e-05, + "learning_rate": 7.365119084304145e-06, + "loss": 0.0, + "num_input_tokens_seen": 60326328, + "step": 103975 + }, + { + "epoch": 15.48704200178731, + "grad_norm": 7.574512710561976e-05, + "learning_rate": 7.362815997226e-06, + "loss": 0.0, + "num_input_tokens_seen": 60329336, + "step": 103980 + }, + { + "epoch": 15.48778671432827, + "grad_norm": 0.0002304384543094784, + "learning_rate": 7.360513208110148e-06, + "loss": 0.0, + "num_input_tokens_seen": 60332472, + "step": 103985 + }, + { + "epoch": 15.488531426869228, + "grad_norm": 9.689530270406976e-05, + "learning_rate": 7.3582107169955005e-06, + "loss": 0.0, + "num_input_tokens_seen": 60335256, + "step": 103990 + }, + { + "epoch": 15.489276139410187, + "grad_norm": 3.0233215511543676e-05, + "learning_rate": 7.355908523920957e-06, + "loss": 0.0, + "num_input_tokens_seen": 60338328, + "step": 103995 + }, + { + "epoch": 15.490020851951147, + "grad_norm": 1.5375297152786516e-05, + "learning_rate": 7.353606628925397e-06, + "loss": 0.0, + "num_input_tokens_seen": 60341432, + "step": 104000 + }, + { + "epoch": 15.490765564492106, + "grad_norm": 0.0012437354307621717, + "learning_rate": 7.351305032047726e-06, + "loss": 0.0, + "num_input_tokens_seen": 60344376, + "step": 104005 + }, + { + "epoch": 15.491510277033065, + "grad_norm": 5.6083172239596024e-05, + "learning_rate": 7.349003733326809e-06, + "loss": 0.0, + "num_input_tokens_seen": 60347224, + "step": 104010 + }, + { + "epoch": 15.492254989574024, + "grad_norm": 0.0009625438251532614, + "learning_rate": 7.346702732801544e-06, + "loss": 0.0, + "num_input_tokens_seen": 60350040, + "step": 104015 + }, + { + "epoch": 15.492999702114984, + "grad_norm": 0.002147684572264552, + "learning_rate": 7.344402030510786e-06, + "loss": 0.0, + "num_input_tokens_seen": 60352920, + "step": 104020 + }, + { + "epoch": 15.493744414655943, + "grad_norm": 0.00027025851886719465, + "learning_rate": 7.34210162649342e-06, + "loss": 0.0, + "num_input_tokens_seen": 60355864, + "step": 104025 + }, + { + "epoch": 15.494489127196902, + "grad_norm": 0.0007570580346509814, + "learning_rate": 7.3398015207883006e-06, + "loss": 0.0, + "num_input_tokens_seen": 60358840, + "step": 104030 + }, + { + "epoch": 15.49523383973786, + "grad_norm": 9.480504559178371e-06, + "learning_rate": 7.337501713434283e-06, + "loss": 0.0, + "num_input_tokens_seen": 60361656, + "step": 104035 + }, + { + "epoch": 15.495978552278821, + "grad_norm": 7.3663431976456195e-06, + "learning_rate": 7.3352022044702266e-06, + "loss": 0.0, + "num_input_tokens_seen": 60364440, + "step": 104040 + }, + { + "epoch": 15.49672326481978, + "grad_norm": 0.00022024489589966834, + "learning_rate": 7.332902993934965e-06, + "loss": 0.0376, + "num_input_tokens_seen": 60367224, + "step": 104045 + }, + { + "epoch": 15.497467977360738, + "grad_norm": 2.5427503715036437e-05, + "learning_rate": 7.33060408186736e-06, + "loss": 0.0, + "num_input_tokens_seen": 60370264, + "step": 104050 + }, + { + "epoch": 15.498212689901697, + "grad_norm": 3.936912435165141e-06, + "learning_rate": 7.328305468306229e-06, + "loss": 0.0, + "num_input_tokens_seen": 60373144, + "step": 104055 + }, + { + "epoch": 15.498957402442658, + "grad_norm": 4.522048584476579e-06, + "learning_rate": 7.326007153290429e-06, + "loss": 0.0918, + "num_input_tokens_seen": 60375992, + "step": 104060 + }, + { + "epoch": 15.499702114983616, + "grad_norm": 0.0005139854620210826, + "learning_rate": 7.323709136858764e-06, + "loss": 0.0, + "num_input_tokens_seen": 60378968, + "step": 104065 + }, + { + "epoch": 15.500446827524575, + "grad_norm": 0.0002547484473325312, + "learning_rate": 7.321411419050078e-06, + "loss": 0.0, + "num_input_tokens_seen": 60381784, + "step": 104070 + }, + { + "epoch": 15.501191540065534, + "grad_norm": 0.0003288733132649213, + "learning_rate": 7.319113999903176e-06, + "loss": 0.0, + "num_input_tokens_seen": 60384600, + "step": 104075 + }, + { + "epoch": 15.501936252606495, + "grad_norm": 0.0009175734012387693, + "learning_rate": 7.31681687945687e-06, + "loss": 0.0, + "num_input_tokens_seen": 60387672, + "step": 104080 + }, + { + "epoch": 15.502680965147453, + "grad_norm": 0.000221795795368962, + "learning_rate": 7.314520057749974e-06, + "loss": 0.0, + "num_input_tokens_seen": 60390456, + "step": 104085 + }, + { + "epoch": 15.503425677688412, + "grad_norm": 0.0007588026928715408, + "learning_rate": 7.312223534821281e-06, + "loss": 0.0, + "num_input_tokens_seen": 60393272, + "step": 104090 + }, + { + "epoch": 15.50417039022937, + "grad_norm": 0.0003334595530759543, + "learning_rate": 7.3099273107096e-06, + "loss": 0.0, + "num_input_tokens_seen": 60396088, + "step": 104095 + }, + { + "epoch": 15.504915102770331, + "grad_norm": 0.0009872165974229574, + "learning_rate": 7.307631385453717e-06, + "loss": 0.0, + "num_input_tokens_seen": 60399032, + "step": 104100 + }, + { + "epoch": 15.50565981531129, + "grad_norm": 3.312313128844835e-05, + "learning_rate": 7.305335759092424e-06, + "loss": 0.0, + "num_input_tokens_seen": 60401784, + "step": 104105 + }, + { + "epoch": 15.506404527852249, + "grad_norm": 0.0004419610486365855, + "learning_rate": 7.303040431664496e-06, + "loss": 0.0, + "num_input_tokens_seen": 60404856, + "step": 104110 + }, + { + "epoch": 15.507149240393208, + "grad_norm": 0.00020733402925543487, + "learning_rate": 7.300745403208705e-06, + "loss": 0.0, + "num_input_tokens_seen": 60407800, + "step": 104115 + }, + { + "epoch": 15.507893952934168, + "grad_norm": 0.014680055901408195, + "learning_rate": 7.298450673763843e-06, + "loss": 0.0, + "num_input_tokens_seen": 60410616, + "step": 104120 + }, + { + "epoch": 15.508638665475127, + "grad_norm": 0.0002107104373862967, + "learning_rate": 7.296156243368657e-06, + "loss": 0.0, + "num_input_tokens_seen": 60413720, + "step": 104125 + }, + { + "epoch": 15.509383378016086, + "grad_norm": 0.0005852053291164339, + "learning_rate": 7.293862112061925e-06, + "loss": 0.0, + "num_input_tokens_seen": 60416504, + "step": 104130 + }, + { + "epoch": 15.510128090557044, + "grad_norm": 0.00019841825996991247, + "learning_rate": 7.291568279882388e-06, + "loss": 0.0, + "num_input_tokens_seen": 60419192, + "step": 104135 + }, + { + "epoch": 15.510872803098005, + "grad_norm": 0.0020731317345052958, + "learning_rate": 7.289274746868818e-06, + "loss": 0.0, + "num_input_tokens_seen": 60422104, + "step": 104140 + }, + { + "epoch": 15.511617515638964, + "grad_norm": 0.010717242024838924, + "learning_rate": 7.28698151305994e-06, + "loss": 0.0, + "num_input_tokens_seen": 60424888, + "step": 104145 + }, + { + "epoch": 15.512362228179922, + "grad_norm": 0.0017568262992426753, + "learning_rate": 7.284688578494514e-06, + "loss": 0.0, + "num_input_tokens_seen": 60428792, + "step": 104150 + }, + { + "epoch": 15.513106940720881, + "grad_norm": 3.716810169862583e-05, + "learning_rate": 7.2823959432112705e-06, + "loss": 0.0, + "num_input_tokens_seen": 60431736, + "step": 104155 + }, + { + "epoch": 15.513851653261842, + "grad_norm": 0.00010866847151191905, + "learning_rate": 7.280103607248934e-06, + "loss": 0.0, + "num_input_tokens_seen": 60434616, + "step": 104160 + }, + { + "epoch": 15.5145963658028, + "grad_norm": 0.0007472784491255879, + "learning_rate": 7.277811570646242e-06, + "loss": 0.0, + "num_input_tokens_seen": 60437432, + "step": 104165 + }, + { + "epoch": 15.51534107834376, + "grad_norm": 6.215188503265381, + "learning_rate": 7.275519833441915e-06, + "loss": 0.0028, + "num_input_tokens_seen": 60440120, + "step": 104170 + }, + { + "epoch": 15.516085790884718, + "grad_norm": 0.0003712278266903013, + "learning_rate": 7.273228395674664e-06, + "loss": 0.0, + "num_input_tokens_seen": 60442776, + "step": 104175 + }, + { + "epoch": 15.516830503425677, + "grad_norm": 5.746189708588645e-05, + "learning_rate": 7.270937257383195e-06, + "loss": 0.0, + "num_input_tokens_seen": 60445784, + "step": 104180 + }, + { + "epoch": 15.517575215966637, + "grad_norm": 26.394142150878906, + "learning_rate": 7.268646418606229e-06, + "loss": 0.0775, + "num_input_tokens_seen": 60448504, + "step": 104185 + }, + { + "epoch": 15.518319928507596, + "grad_norm": 0.00014997916878201067, + "learning_rate": 7.266355879382461e-06, + "loss": 0.0, + "num_input_tokens_seen": 60451192, + "step": 104190 + }, + { + "epoch": 15.519064641048555, + "grad_norm": 0.000698399671819061, + "learning_rate": 7.2640656397505805e-06, + "loss": 0.0944, + "num_input_tokens_seen": 60454360, + "step": 104195 + }, + { + "epoch": 15.519809353589514, + "grad_norm": 13.935420989990234, + "learning_rate": 7.26177569974929e-06, + "loss": 0.0163, + "num_input_tokens_seen": 60457016, + "step": 104200 + }, + { + "epoch": 15.520554066130474, + "grad_norm": 4.353998156148009e-05, + "learning_rate": 7.259486059417265e-06, + "loss": 0.0, + "num_input_tokens_seen": 60459960, + "step": 104205 + }, + { + "epoch": 15.521298778671433, + "grad_norm": 2.1248284610919654e-05, + "learning_rate": 7.2571967187932e-06, + "loss": 0.0007, + "num_input_tokens_seen": 60462712, + "step": 104210 + }, + { + "epoch": 15.522043491212392, + "grad_norm": 0.00817665085196495, + "learning_rate": 7.2549076779157565e-06, + "loss": 0.0, + "num_input_tokens_seen": 60465368, + "step": 104215 + }, + { + "epoch": 15.52278820375335, + "grad_norm": 0.0009972096886485815, + "learning_rate": 7.252618936823618e-06, + "loss": 0.1036, + "num_input_tokens_seen": 60468184, + "step": 104220 + }, + { + "epoch": 15.523532916294311, + "grad_norm": 4.2920542909996584e-05, + "learning_rate": 7.250330495555438e-06, + "loss": 0.0, + "num_input_tokens_seen": 60470968, + "step": 104225 + }, + { + "epoch": 15.52427762883527, + "grad_norm": 0.0002970442292280495, + "learning_rate": 7.248042354149892e-06, + "loss": 0.0004, + "num_input_tokens_seen": 60473784, + "step": 104230 + }, + { + "epoch": 15.525022341376228, + "grad_norm": 6.846705946372822e-05, + "learning_rate": 7.2457545126456275e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60476888, + "step": 104235 + }, + { + "epoch": 15.525767053917187, + "grad_norm": 0.0005131324869580567, + "learning_rate": 7.243466971081297e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60479768, + "step": 104240 + }, + { + "epoch": 15.526511766458148, + "grad_norm": 0.0036480058915913105, + "learning_rate": 7.2411797294955455e-06, + "loss": 0.0, + "num_input_tokens_seen": 60482456, + "step": 104245 + }, + { + "epoch": 15.527256478999107, + "grad_norm": 0.0001207953246193938, + "learning_rate": 7.238892787927004e-06, + "loss": 0.0196, + "num_input_tokens_seen": 60485464, + "step": 104250 + }, + { + "epoch": 15.528001191540065, + "grad_norm": 0.0002693014976102859, + "learning_rate": 7.2366061464143265e-06, + "loss": 0.0504, + "num_input_tokens_seen": 60488408, + "step": 104255 + }, + { + "epoch": 15.528745904081024, + "grad_norm": 0.0007669935584999621, + "learning_rate": 7.234319804996126e-06, + "loss": 0.0, + "num_input_tokens_seen": 60491256, + "step": 104260 + }, + { + "epoch": 15.529490616621985, + "grad_norm": 0.0011186774354428053, + "learning_rate": 7.232033763711044e-06, + "loss": 0.0, + "num_input_tokens_seen": 60494200, + "step": 104265 + }, + { + "epoch": 15.530235329162943, + "grad_norm": 0.000590097566600889, + "learning_rate": 7.229748022597693e-06, + "loss": 0.0088, + "num_input_tokens_seen": 60497400, + "step": 104270 + }, + { + "epoch": 15.530980041703902, + "grad_norm": 0.00013107250561006367, + "learning_rate": 7.22746258169468e-06, + "loss": 0.0, + "num_input_tokens_seen": 60500216, + "step": 104275 + }, + { + "epoch": 15.53172475424486, + "grad_norm": 0.004496132954955101, + "learning_rate": 7.225177441040632e-06, + "loss": 0.0, + "num_input_tokens_seen": 60503128, + "step": 104280 + }, + { + "epoch": 15.532469466785821, + "grad_norm": 0.0003387103497516364, + "learning_rate": 7.2228926006741385e-06, + "loss": 0.0, + "num_input_tokens_seen": 60506072, + "step": 104285 + }, + { + "epoch": 15.53321417932678, + "grad_norm": 0.0002021206310018897, + "learning_rate": 7.220608060633813e-06, + "loss": 0.0, + "num_input_tokens_seen": 60508920, + "step": 104290 + }, + { + "epoch": 15.533958891867739, + "grad_norm": 0.036074768751859665, + "learning_rate": 7.218323820958237e-06, + "loss": 0.0, + "num_input_tokens_seen": 60511896, + "step": 104295 + }, + { + "epoch": 15.534703604408698, + "grad_norm": 1.9499777408782393e-05, + "learning_rate": 7.2160398816860155e-06, + "loss": 0.0, + "num_input_tokens_seen": 60514968, + "step": 104300 + }, + { + "epoch": 15.535448316949658, + "grad_norm": 0.0009408602491021156, + "learning_rate": 7.213756242855724e-06, + "loss": 0.0, + "num_input_tokens_seen": 60517912, + "step": 104305 + }, + { + "epoch": 15.536193029490617, + "grad_norm": 685.74658203125, + "learning_rate": 7.211472904505945e-06, + "loss": 0.0329, + "num_input_tokens_seen": 60520664, + "step": 104310 + }, + { + "epoch": 15.536937742031576, + "grad_norm": 0.00043280923273414373, + "learning_rate": 7.20918986667525e-06, + "loss": 0.0, + "num_input_tokens_seen": 60523448, + "step": 104315 + }, + { + "epoch": 15.537682454572534, + "grad_norm": 0.0004566549032460898, + "learning_rate": 7.206907129402205e-06, + "loss": 0.0, + "num_input_tokens_seen": 60526360, + "step": 104320 + }, + { + "epoch": 15.538427167113493, + "grad_norm": 8.543311560060829e-05, + "learning_rate": 7.204624692725387e-06, + "loss": 0.0, + "num_input_tokens_seen": 60529304, + "step": 104325 + }, + { + "epoch": 15.539171879654454, + "grad_norm": 3.618869959609583e-05, + "learning_rate": 7.202342556683339e-06, + "loss": 0.0, + "num_input_tokens_seen": 60532120, + "step": 104330 + }, + { + "epoch": 15.539916592195413, + "grad_norm": 0.00022514820739161223, + "learning_rate": 7.200060721314636e-06, + "loss": 0.0, + "num_input_tokens_seen": 60535000, + "step": 104335 + }, + { + "epoch": 15.540661304736371, + "grad_norm": 0.00032911018934100866, + "learning_rate": 7.1977791866578045e-06, + "loss": 0.0, + "num_input_tokens_seen": 60538136, + "step": 104340 + }, + { + "epoch": 15.541406017277332, + "grad_norm": 0.00022430019453167915, + "learning_rate": 7.195497952751409e-06, + "loss": 0.0, + "num_input_tokens_seen": 60541144, + "step": 104345 + }, + { + "epoch": 15.54215072981829, + "grad_norm": 0.0057271732948720455, + "learning_rate": 7.1932170196339745e-06, + "loss": 0.0, + "num_input_tokens_seen": 60544376, + "step": 104350 + }, + { + "epoch": 15.54289544235925, + "grad_norm": 0.19899263978004456, + "learning_rate": 7.190936387344047e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60547128, + "step": 104355 + }, + { + "epoch": 15.543640154900208, + "grad_norm": 0.011034758761525154, + "learning_rate": 7.188656055920149e-06, + "loss": 0.0, + "num_input_tokens_seen": 60550072, + "step": 104360 + }, + { + "epoch": 15.544384867441167, + "grad_norm": 9.973945270758122e-05, + "learning_rate": 7.186376025400804e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60552760, + "step": 104365 + }, + { + "epoch": 15.545129579982127, + "grad_norm": 8.419324876740575e-05, + "learning_rate": 7.18409629582453e-06, + "loss": 0.0002, + "num_input_tokens_seen": 60555544, + "step": 104370 + }, + { + "epoch": 15.545874292523086, + "grad_norm": 0.0007032921421341598, + "learning_rate": 7.181816867229835e-06, + "loss": 0.0, + "num_input_tokens_seen": 60558392, + "step": 104375 + }, + { + "epoch": 15.546619005064045, + "grad_norm": 5.688513920176774e-05, + "learning_rate": 7.179537739655243e-06, + "loss": 0.0, + "num_input_tokens_seen": 60561464, + "step": 104380 + }, + { + "epoch": 15.547363717605004, + "grad_norm": 0.001164219924248755, + "learning_rate": 7.17725891313924e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60564216, + "step": 104385 + }, + { + "epoch": 15.548108430145964, + "grad_norm": 9.780367690837011e-05, + "learning_rate": 7.17498038772034e-06, + "loss": 0.0, + "num_input_tokens_seen": 60566936, + "step": 104390 + }, + { + "epoch": 15.548853142686923, + "grad_norm": 0.007483094464987516, + "learning_rate": 7.172702163437034e-06, + "loss": 0.0, + "num_input_tokens_seen": 60569976, + "step": 104395 + }, + { + "epoch": 15.549597855227882, + "grad_norm": 9.099289854930248e-06, + "learning_rate": 7.170424240327794e-06, + "loss": 0.0, + "num_input_tokens_seen": 60572920, + "step": 104400 + }, + { + "epoch": 15.55034256776884, + "grad_norm": 0.003858878742903471, + "learning_rate": 7.168146618431127e-06, + "loss": 0.0, + "num_input_tokens_seen": 60576248, + "step": 104405 + }, + { + "epoch": 15.551087280309801, + "grad_norm": 0.0036872616037726402, + "learning_rate": 7.165869297785488e-06, + "loss": 0.0032, + "num_input_tokens_seen": 60579000, + "step": 104410 + }, + { + "epoch": 15.55183199285076, + "grad_norm": 0.1969020962715149, + "learning_rate": 7.163592278429371e-06, + "loss": 0.0002, + "num_input_tokens_seen": 60582040, + "step": 104415 + }, + { + "epoch": 15.552576705391719, + "grad_norm": 0.00010165260755456984, + "learning_rate": 7.161315560401224e-06, + "loss": 0.0004, + "num_input_tokens_seen": 60585176, + "step": 104420 + }, + { + "epoch": 15.553321417932677, + "grad_norm": 3.30164039041847e-05, + "learning_rate": 7.159039143739532e-06, + "loss": 0.0, + "num_input_tokens_seen": 60588088, + "step": 104425 + }, + { + "epoch": 15.554066130473638, + "grad_norm": 1.849565029144287, + "learning_rate": 7.1567630284827384e-06, + "loss": 0.0023, + "num_input_tokens_seen": 60590840, + "step": 104430 + }, + { + "epoch": 15.554810843014597, + "grad_norm": 0.006662626285105944, + "learning_rate": 7.1544872146693e-06, + "loss": 0.0, + "num_input_tokens_seen": 60593688, + "step": 104435 + }, + { + "epoch": 15.555555555555555, + "grad_norm": 8.471621185890399e-06, + "learning_rate": 7.1522117023376606e-06, + "loss": 0.0, + "num_input_tokens_seen": 60596440, + "step": 104440 + }, + { + "epoch": 15.556300268096514, + "grad_norm": 0.0009295943891629577, + "learning_rate": 7.149936491526258e-06, + "loss": 0.0, + "num_input_tokens_seen": 60599096, + "step": 104445 + }, + { + "epoch": 15.557044980637475, + "grad_norm": 1.8616214219946414e-05, + "learning_rate": 7.147661582273546e-06, + "loss": 0.0, + "num_input_tokens_seen": 60601912, + "step": 104450 + }, + { + "epoch": 15.557789693178433, + "grad_norm": 4.4118205551058054e-05, + "learning_rate": 7.145386974617937e-06, + "loss": 0.0, + "num_input_tokens_seen": 60604984, + "step": 104455 + }, + { + "epoch": 15.558534405719392, + "grad_norm": 0.0002605411282274872, + "learning_rate": 7.143112668597876e-06, + "loss": 0.0, + "num_input_tokens_seen": 60607704, + "step": 104460 + }, + { + "epoch": 15.559279118260351, + "grad_norm": 0.00030568247893825173, + "learning_rate": 7.140838664251773e-06, + "loss": 0.0, + "num_input_tokens_seen": 60610584, + "step": 104465 + }, + { + "epoch": 15.560023830801311, + "grad_norm": 1.751139097905252e-05, + "learning_rate": 7.138564961618055e-06, + "loss": 0.0, + "num_input_tokens_seen": 60613400, + "step": 104470 + }, + { + "epoch": 15.56076854334227, + "grad_norm": 1.5972078472259454e-05, + "learning_rate": 7.1362915607351285e-06, + "loss": 0.0, + "num_input_tokens_seen": 60616344, + "step": 104475 + }, + { + "epoch": 15.561513255883229, + "grad_norm": 0.0019865822978317738, + "learning_rate": 7.1340184616413926e-06, + "loss": 0.0, + "num_input_tokens_seen": 60619224, + "step": 104480 + }, + { + "epoch": 15.562257968424188, + "grad_norm": 30.091934204101562, + "learning_rate": 7.131745664375264e-06, + "loss": 0.1859, + "num_input_tokens_seen": 60622680, + "step": 104485 + }, + { + "epoch": 15.563002680965148, + "grad_norm": 0.00040627780253998935, + "learning_rate": 7.129473168975123e-06, + "loss": 0.0, + "num_input_tokens_seen": 60625560, + "step": 104490 + }, + { + "epoch": 15.563747393506107, + "grad_norm": 0.0004038088663946837, + "learning_rate": 7.127200975479381e-06, + "loss": 0.0, + "num_input_tokens_seen": 60628344, + "step": 104495 + }, + { + "epoch": 15.564492106047066, + "grad_norm": 8.828046702546999e-05, + "learning_rate": 7.12492908392641e-06, + "loss": 0.0, + "num_input_tokens_seen": 60631224, + "step": 104500 + }, + { + "epoch": 15.565236818588025, + "grad_norm": 1.3773815226159059e-05, + "learning_rate": 7.122657494354596e-06, + "loss": 0.0, + "num_input_tokens_seen": 60634168, + "step": 104505 + }, + { + "epoch": 15.565981531128983, + "grad_norm": 5.042761404183693e-05, + "learning_rate": 7.120386206802307e-06, + "loss": 0.0163, + "num_input_tokens_seen": 60637048, + "step": 104510 + }, + { + "epoch": 15.566726243669944, + "grad_norm": 7.331959204748273e-05, + "learning_rate": 7.1181152213079275e-06, + "loss": 0.0, + "num_input_tokens_seen": 60639960, + "step": 104515 + }, + { + "epoch": 15.567470956210903, + "grad_norm": 4.1353599954163656e-05, + "learning_rate": 7.115844537909819e-06, + "loss": 0.0, + "num_input_tokens_seen": 60642680, + "step": 104520 + }, + { + "epoch": 15.568215668751861, + "grad_norm": 0.0005833701579831541, + "learning_rate": 7.11357415664633e-06, + "loss": 0.0, + "num_input_tokens_seen": 60645880, + "step": 104525 + }, + { + "epoch": 15.568960381292822, + "grad_norm": 0.0006236631888896227, + "learning_rate": 7.111304077555836e-06, + "loss": 0.0, + "num_input_tokens_seen": 60648760, + "step": 104530 + }, + { + "epoch": 15.56970509383378, + "grad_norm": 0.363174170255661, + "learning_rate": 7.1090343006766704e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60651736, + "step": 104535 + }, + { + "epoch": 15.57044980637474, + "grad_norm": 6.605601083720103e-05, + "learning_rate": 7.106764826047196e-06, + "loss": 0.2844, + "num_input_tokens_seen": 60654712, + "step": 104540 + }, + { + "epoch": 15.571194518915698, + "grad_norm": 4.4148851884528995e-05, + "learning_rate": 7.104495653705734e-06, + "loss": 0.0, + "num_input_tokens_seen": 60657976, + "step": 104545 + }, + { + "epoch": 15.571939231456657, + "grad_norm": 0.0011969935148954391, + "learning_rate": 7.102226783690638e-06, + "loss": 0.0, + "num_input_tokens_seen": 60660664, + "step": 104550 + }, + { + "epoch": 15.572683943997617, + "grad_norm": 0.05407193303108215, + "learning_rate": 7.099958216040231e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60663608, + "step": 104555 + }, + { + "epoch": 15.573428656538576, + "grad_norm": 0.0003214507596567273, + "learning_rate": 7.09768995079283e-06, + "loss": 0.0, + "num_input_tokens_seen": 60666552, + "step": 104560 + }, + { + "epoch": 15.574173369079535, + "grad_norm": 8.502977289026603e-05, + "learning_rate": 7.095421987986766e-06, + "loss": 0.1875, + "num_input_tokens_seen": 60669368, + "step": 104565 + }, + { + "epoch": 15.574918081620494, + "grad_norm": 3.473195465630852e-05, + "learning_rate": 7.093154327660354e-06, + "loss": 0.0004, + "num_input_tokens_seen": 60672216, + "step": 104570 + }, + { + "epoch": 15.575662794161454, + "grad_norm": 3.6571454984368756e-05, + "learning_rate": 7.090886969851898e-06, + "loss": 0.0, + "num_input_tokens_seen": 60675032, + "step": 104575 + }, + { + "epoch": 15.576407506702413, + "grad_norm": 5.013809277443215e-05, + "learning_rate": 7.088619914599698e-06, + "loss": 0.0943, + "num_input_tokens_seen": 60677944, + "step": 104580 + }, + { + "epoch": 15.577152219243372, + "grad_norm": 0.00027809475432150066, + "learning_rate": 7.086353161942066e-06, + "loss": 0.0, + "num_input_tokens_seen": 60680984, + "step": 104585 + }, + { + "epoch": 15.57789693178433, + "grad_norm": 0.00023239056463353336, + "learning_rate": 7.084086711917287e-06, + "loss": 0.0, + "num_input_tokens_seen": 60683736, + "step": 104590 + }, + { + "epoch": 15.578641644325291, + "grad_norm": 0.00018347262812312692, + "learning_rate": 7.081820564563657e-06, + "loss": 0.0, + "num_input_tokens_seen": 60686840, + "step": 104595 + }, + { + "epoch": 15.57938635686625, + "grad_norm": 0.00011488284508232027, + "learning_rate": 7.0795547199194624e-06, + "loss": 0.0, + "num_input_tokens_seen": 60689656, + "step": 104600 + }, + { + "epoch": 15.580131069407209, + "grad_norm": 0.0014248320367187262, + "learning_rate": 7.077289178022967e-06, + "loss": 0.0, + "num_input_tokens_seen": 60692632, + "step": 104605 + }, + { + "epoch": 15.580875781948167, + "grad_norm": 0.002034827135503292, + "learning_rate": 7.075023938912461e-06, + "loss": 0.0, + "num_input_tokens_seen": 60695512, + "step": 104610 + }, + { + "epoch": 15.581620494489128, + "grad_norm": 0.005727014504373074, + "learning_rate": 7.0727590026262e-06, + "loss": 0.0, + "num_input_tokens_seen": 60698200, + "step": 104615 + }, + { + "epoch": 15.582365207030087, + "grad_norm": 0.004035463090986013, + "learning_rate": 7.070494369202465e-06, + "loss": 0.0, + "num_input_tokens_seen": 60700856, + "step": 104620 + }, + { + "epoch": 15.583109919571045, + "grad_norm": 0.003982569556683302, + "learning_rate": 7.068230038679496e-06, + "loss": 0.0128, + "num_input_tokens_seen": 60703864, + "step": 104625 + }, + { + "epoch": 15.583854632112004, + "grad_norm": 0.0005193124525249004, + "learning_rate": 7.065966011095565e-06, + "loss": 0.0, + "num_input_tokens_seen": 60706776, + "step": 104630 + }, + { + "epoch": 15.584599344652965, + "grad_norm": 2.7739924917113967e-05, + "learning_rate": 7.06370228648891e-06, + "loss": 0.0, + "num_input_tokens_seen": 60709720, + "step": 104635 + }, + { + "epoch": 15.585344057193923, + "grad_norm": 2.5685469154268503e-05, + "learning_rate": 7.061438864897774e-06, + "loss": 0.0, + "num_input_tokens_seen": 60712792, + "step": 104640 + }, + { + "epoch": 15.586088769734882, + "grad_norm": 0.00025501829804852605, + "learning_rate": 7.059175746360397e-06, + "loss": 0.0, + "num_input_tokens_seen": 60715832, + "step": 104645 + }, + { + "epoch": 15.586833482275841, + "grad_norm": 0.0005324443918652833, + "learning_rate": 7.056912930915005e-06, + "loss": 0.0, + "num_input_tokens_seen": 60718808, + "step": 104650 + }, + { + "epoch": 15.587578194816802, + "grad_norm": 9.256224075215869e-06, + "learning_rate": 7.054650418599837e-06, + "loss": 0.0, + "num_input_tokens_seen": 60721912, + "step": 104655 + }, + { + "epoch": 15.58832290735776, + "grad_norm": 0.001008068909868598, + "learning_rate": 7.052388209453106e-06, + "loss": 0.0005, + "num_input_tokens_seen": 60724696, + "step": 104660 + }, + { + "epoch": 15.589067619898719, + "grad_norm": 0.00020857597701251507, + "learning_rate": 7.0501263035130435e-06, + "loss": 0.0063, + "num_input_tokens_seen": 60728024, + "step": 104665 + }, + { + "epoch": 15.589812332439678, + "grad_norm": 0.000991014763712883, + "learning_rate": 7.0478647008178435e-06, + "loss": 0.0, + "num_input_tokens_seen": 60731160, + "step": 104670 + }, + { + "epoch": 15.590557044980638, + "grad_norm": 6.778322858735919e-05, + "learning_rate": 7.045603401405735e-06, + "loss": 0.0, + "num_input_tokens_seen": 60734200, + "step": 104675 + }, + { + "epoch": 15.591301757521597, + "grad_norm": 0.010774480178952217, + "learning_rate": 7.043342405314907e-06, + "loss": 0.0, + "num_input_tokens_seen": 60736920, + "step": 104680 + }, + { + "epoch": 15.592046470062556, + "grad_norm": 0.0015477812848985195, + "learning_rate": 7.04108171258355e-06, + "loss": 0.0078, + "num_input_tokens_seen": 60739608, + "step": 104685 + }, + { + "epoch": 15.592791182603515, + "grad_norm": 3.665907570393756e-06, + "learning_rate": 7.038821323249875e-06, + "loss": 0.0, + "num_input_tokens_seen": 60742936, + "step": 104690 + }, + { + "epoch": 15.593535895144473, + "grad_norm": 0.0002943449653685093, + "learning_rate": 7.036561237352057e-06, + "loss": 0.0945, + "num_input_tokens_seen": 60746168, + "step": 104695 + }, + { + "epoch": 15.594280607685434, + "grad_norm": 1.9878507373505272e-05, + "learning_rate": 7.03430145492828e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60749368, + "step": 104700 + }, + { + "epoch": 15.595025320226393, + "grad_norm": 1.878096372820437e-05, + "learning_rate": 7.032041976016712e-06, + "loss": 0.0, + "num_input_tokens_seen": 60752152, + "step": 104705 + }, + { + "epoch": 15.595770032767351, + "grad_norm": 0.00025717157404869795, + "learning_rate": 7.02978280065554e-06, + "loss": 0.0, + "num_input_tokens_seen": 60755128, + "step": 104710 + }, + { + "epoch": 15.59651474530831, + "grad_norm": 0.0005285318475216627, + "learning_rate": 7.027523928882926e-06, + "loss": 0.0, + "num_input_tokens_seen": 60757752, + "step": 104715 + }, + { + "epoch": 15.59725945784927, + "grad_norm": 6.0210604715393856e-05, + "learning_rate": 7.025265360737021e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60761016, + "step": 104720 + }, + { + "epoch": 15.59800417039023, + "grad_norm": 25.68221664428711, + "learning_rate": 7.023007096255996e-06, + "loss": 0.0913, + "num_input_tokens_seen": 60764376, + "step": 104725 + }, + { + "epoch": 15.598748882931188, + "grad_norm": 3.6559562431648374e-05, + "learning_rate": 7.020749135477986e-06, + "loss": 0.0, + "num_input_tokens_seen": 60767064, + "step": 104730 + }, + { + "epoch": 15.599493595472147, + "grad_norm": 0.06795977801084518, + "learning_rate": 7.0184914784411555e-06, + "loss": 0.0975, + "num_input_tokens_seen": 60769752, + "step": 104735 + }, + { + "epoch": 15.600238308013108, + "grad_norm": 0.16097392141819, + "learning_rate": 7.0162341251836264e-06, + "loss": 0.0, + "num_input_tokens_seen": 60772760, + "step": 104740 + }, + { + "epoch": 15.600983020554066, + "grad_norm": 0.0031241746619343758, + "learning_rate": 7.013977075743553e-06, + "loss": 0.0, + "num_input_tokens_seen": 60775640, + "step": 104745 + }, + { + "epoch": 15.601727733095025, + "grad_norm": 3.036569751202478e-06, + "learning_rate": 7.01172033015905e-06, + "loss": 0.0906, + "num_input_tokens_seen": 60778360, + "step": 104750 + }, + { + "epoch": 15.602472445635984, + "grad_norm": 0.0001813755661714822, + "learning_rate": 7.009463888468254e-06, + "loss": 0.0, + "num_input_tokens_seen": 60781080, + "step": 104755 + }, + { + "epoch": 15.603217158176944, + "grad_norm": 0.006343222223222256, + "learning_rate": 7.0072077507092825e-06, + "loss": 0.0003, + "num_input_tokens_seen": 60784024, + "step": 104760 + }, + { + "epoch": 15.603961870717903, + "grad_norm": 0.002556050196290016, + "learning_rate": 7.004951916920249e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60786904, + "step": 104765 + }, + { + "epoch": 15.604706583258862, + "grad_norm": 0.0006140038603916764, + "learning_rate": 7.002696387139265e-06, + "loss": 0.0, + "num_input_tokens_seen": 60790040, + "step": 104770 + }, + { + "epoch": 15.60545129579982, + "grad_norm": 0.00487724132835865, + "learning_rate": 7.000441161404425e-06, + "loss": 0.0111, + "num_input_tokens_seen": 60792824, + "step": 104775 + }, + { + "epoch": 15.606196008340781, + "grad_norm": 0.0007648521568626165, + "learning_rate": 6.998186239753846e-06, + "loss": 0.0, + "num_input_tokens_seen": 60795640, + "step": 104780 + }, + { + "epoch": 15.60694072088174, + "grad_norm": 9.769973075890448e-06, + "learning_rate": 6.995931622225605e-06, + "loss": 0.0, + "num_input_tokens_seen": 60798488, + "step": 104785 + }, + { + "epoch": 15.607685433422699, + "grad_norm": 0.00015200034249573946, + "learning_rate": 6.99367730885781e-06, + "loss": 0.0, + "num_input_tokens_seen": 60801464, + "step": 104790 + }, + { + "epoch": 15.608430145963657, + "grad_norm": 0.0058043538592755795, + "learning_rate": 6.991423299688535e-06, + "loss": 0.0044, + "num_input_tokens_seen": 60804248, + "step": 104795 + }, + { + "epoch": 15.609174858504618, + "grad_norm": 4.190571416984312e-06, + "learning_rate": 6.989169594755854e-06, + "loss": 0.0, + "num_input_tokens_seen": 60807032, + "step": 104800 + }, + { + "epoch": 15.609919571045577, + "grad_norm": 0.03564071282744408, + "learning_rate": 6.9869161940978535e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60809688, + "step": 104805 + }, + { + "epoch": 15.610664283586535, + "grad_norm": 6.4004088926594704e-06, + "learning_rate": 6.984663097752589e-06, + "loss": 0.0, + "num_input_tokens_seen": 60812472, + "step": 104810 + }, + { + "epoch": 15.611408996127494, + "grad_norm": 0.00015241237997543067, + "learning_rate": 6.982410305758138e-06, + "loss": 0.0616, + "num_input_tokens_seen": 60815640, + "step": 104815 + }, + { + "epoch": 15.612153708668455, + "grad_norm": 0.0017536442028358579, + "learning_rate": 6.980157818152547e-06, + "loss": 0.0, + "num_input_tokens_seen": 60818424, + "step": 104820 + }, + { + "epoch": 15.612898421209414, + "grad_norm": 0.0018625255906954408, + "learning_rate": 6.97790563497388e-06, + "loss": 0.0, + "num_input_tokens_seen": 60821432, + "step": 104825 + }, + { + "epoch": 15.613643133750372, + "grad_norm": 0.005691687576472759, + "learning_rate": 6.9756537562601835e-06, + "loss": 0.0001, + "num_input_tokens_seen": 60824440, + "step": 104830 + }, + { + "epoch": 15.614387846291331, + "grad_norm": 3.535434007062577e-05, + "learning_rate": 6.973402182049496e-06, + "loss": 0.0, + "num_input_tokens_seen": 60827192, + "step": 104835 + }, + { + "epoch": 15.615132558832292, + "grad_norm": 5.0940103392349556e-05, + "learning_rate": 6.971150912379859e-06, + "loss": 0.0428, + "num_input_tokens_seen": 60830360, + "step": 104840 + }, + { + "epoch": 15.61587727137325, + "grad_norm": 0.0004502551455516368, + "learning_rate": 6.968899947289295e-06, + "loss": 0.0, + "num_input_tokens_seen": 60833368, + "step": 104845 + }, + { + "epoch": 15.616621983914209, + "grad_norm": 0.006101813167333603, + "learning_rate": 6.966649286815846e-06, + "loss": 0.0, + "num_input_tokens_seen": 60836408, + "step": 104850 + }, + { + "epoch": 15.617366696455168, + "grad_norm": 0.0003471617237664759, + "learning_rate": 6.9643989309975235e-06, + "loss": 0.3, + "num_input_tokens_seen": 60839384, + "step": 104855 + }, + { + "epoch": 15.618111408996128, + "grad_norm": 1.0804689736687578e-05, + "learning_rate": 6.962148879872357e-06, + "loss": 0.0, + "num_input_tokens_seen": 60842104, + "step": 104860 + }, + { + "epoch": 15.618856121537087, + "grad_norm": 9.602031786926091e-05, + "learning_rate": 6.9598991334783485e-06, + "loss": 0.0, + "num_input_tokens_seen": 60844920, + "step": 104865 + }, + { + "epoch": 15.619600834078046, + "grad_norm": 2.4073191525531e-05, + "learning_rate": 6.957649691853513e-06, + "loss": 0.0, + "num_input_tokens_seen": 60847896, + "step": 104870 + }, + { + "epoch": 15.620345546619005, + "grad_norm": 0.04239373654127121, + "learning_rate": 6.955400555035849e-06, + "loss": 0.0, + "num_input_tokens_seen": 60850872, + "step": 104875 + }, + { + "epoch": 15.621090259159963, + "grad_norm": 0.00022958921908866614, + "learning_rate": 6.953151723063345e-06, + "loss": 0.0013, + "num_input_tokens_seen": 60853784, + "step": 104880 + }, + { + "epoch": 15.621834971700924, + "grad_norm": 5.124932067701593e-05, + "learning_rate": 6.95090319597401e-06, + "loss": 0.0, + "num_input_tokens_seen": 60856536, + "step": 104885 + }, + { + "epoch": 15.622579684241883, + "grad_norm": 165.90701293945312, + "learning_rate": 6.94865497380581e-06, + "loss": 0.2469, + "num_input_tokens_seen": 60859224, + "step": 104890 + }, + { + "epoch": 15.623324396782841, + "grad_norm": 2.0813315131817944e-05, + "learning_rate": 6.9464070565967486e-06, + "loss": 0.0, + "num_input_tokens_seen": 60861848, + "step": 104895 + }, + { + "epoch": 15.6240691093238, + "grad_norm": 8.870341844158247e-05, + "learning_rate": 6.94415944438479e-06, + "loss": 0.0, + "num_input_tokens_seen": 60864824, + "step": 104900 + }, + { + "epoch": 15.62481382186476, + "grad_norm": 5.380716174840927e-06, + "learning_rate": 6.941912137207907e-06, + "loss": 0.0, + "num_input_tokens_seen": 60867832, + "step": 104905 + }, + { + "epoch": 15.62555853440572, + "grad_norm": 0.011423555202782154, + "learning_rate": 6.939665135104056e-06, + "loss": 0.0, + "num_input_tokens_seen": 60870744, + "step": 104910 + }, + { + "epoch": 15.626303246946678, + "grad_norm": 8.74830293469131e-05, + "learning_rate": 6.9374184381112155e-06, + "loss": 0.175, + "num_input_tokens_seen": 60873592, + "step": 104915 + }, + { + "epoch": 15.627047959487637, + "grad_norm": 1.9211520339013077e-05, + "learning_rate": 6.935172046267333e-06, + "loss": 0.0, + "num_input_tokens_seen": 60876344, + "step": 104920 + }, + { + "epoch": 15.627792672028598, + "grad_norm": 1.8207059838459827e-05, + "learning_rate": 6.932925959610351e-06, + "loss": 0.0, + "num_input_tokens_seen": 60878904, + "step": 104925 + }, + { + "epoch": 15.628537384569556, + "grad_norm": 6.303757982095703e-05, + "learning_rate": 6.930680178178228e-06, + "loss": 0.0, + "num_input_tokens_seen": 60882168, + "step": 104930 + }, + { + "epoch": 15.629282097110515, + "grad_norm": 0.000959621393121779, + "learning_rate": 6.928434702008893e-06, + "loss": 0.0, + "num_input_tokens_seen": 60885208, + "step": 104935 + }, + { + "epoch": 15.630026809651474, + "grad_norm": 5.772434178652475e-06, + "learning_rate": 6.9261895311402925e-06, + "loss": 0.0, + "num_input_tokens_seen": 60888152, + "step": 104940 + }, + { + "epoch": 15.630771522192434, + "grad_norm": 0.017119569703936577, + "learning_rate": 6.923944665610344e-06, + "loss": 0.0, + "num_input_tokens_seen": 60891192, + "step": 104945 + }, + { + "epoch": 15.631516234733393, + "grad_norm": 1.337042613158701e-05, + "learning_rate": 6.921700105456985e-06, + "loss": 0.0, + "num_input_tokens_seen": 60894232, + "step": 104950 + }, + { + "epoch": 15.632260947274352, + "grad_norm": 0.015352653339505196, + "learning_rate": 6.919455850718123e-06, + "loss": 0.0, + "num_input_tokens_seen": 60897016, + "step": 104955 + }, + { + "epoch": 15.63300565981531, + "grad_norm": 1.8908331185230054e-05, + "learning_rate": 6.917211901431683e-06, + "loss": 0.1221, + "num_input_tokens_seen": 60899704, + "step": 104960 + }, + { + "epoch": 15.633750372356271, + "grad_norm": 0.003221728838980198, + "learning_rate": 6.914968257635573e-06, + "loss": 0.0, + "num_input_tokens_seen": 60902424, + "step": 104965 + }, + { + "epoch": 15.63449508489723, + "grad_norm": 3.9007587474770844e-05, + "learning_rate": 6.912724919367691e-06, + "loss": 0.0, + "num_input_tokens_seen": 60905368, + "step": 104970 + }, + { + "epoch": 15.635239797438189, + "grad_norm": 0.009760835207998753, + "learning_rate": 6.91048188666594e-06, + "loss": 0.0, + "num_input_tokens_seen": 60908248, + "step": 104975 + }, + { + "epoch": 15.635984509979147, + "grad_norm": 2.777827285171952e-05, + "learning_rate": 6.908239159568203e-06, + "loss": 0.0, + "num_input_tokens_seen": 60911064, + "step": 104980 + }, + { + "epoch": 15.636729222520108, + "grad_norm": 0.0001320220617344603, + "learning_rate": 6.9059967381123854e-06, + "loss": 0.0, + "num_input_tokens_seen": 60913816, + "step": 104985 + }, + { + "epoch": 15.637473935061067, + "grad_norm": 9.936301648849621e-05, + "learning_rate": 6.903754622336358e-06, + "loss": 0.0, + "num_input_tokens_seen": 60916504, + "step": 104990 + }, + { + "epoch": 15.638218647602026, + "grad_norm": 0.0010389165254309773, + "learning_rate": 6.90151281227801e-06, + "loss": 0.0, + "num_input_tokens_seen": 60919352, + "step": 104995 + }, + { + "epoch": 15.638963360142984, + "grad_norm": 0.0002998245763592422, + "learning_rate": 6.899271307975208e-06, + "loss": 0.0, + "num_input_tokens_seen": 60922264, + "step": 105000 + }, + { + "epoch": 15.639708072683945, + "grad_norm": 0.0011826130794361234, + "learning_rate": 6.897030109465813e-06, + "loss": 0.0, + "num_input_tokens_seen": 60925112, + "step": 105005 + }, + { + "epoch": 15.640452785224904, + "grad_norm": 1.1778830412367824e-05, + "learning_rate": 6.894789216787703e-06, + "loss": 0.0, + "num_input_tokens_seen": 60927896, + "step": 105010 + }, + { + "epoch": 15.641197497765862, + "grad_norm": 0.0008979436825029552, + "learning_rate": 6.892548629978721e-06, + "loss": 0.0, + "num_input_tokens_seen": 60930872, + "step": 105015 + }, + { + "epoch": 15.641942210306821, + "grad_norm": 0.0001317040587309748, + "learning_rate": 6.890308349076732e-06, + "loss": 0.0, + "num_input_tokens_seen": 60933944, + "step": 105020 + }, + { + "epoch": 15.64268692284778, + "grad_norm": 0.0006643409142270684, + "learning_rate": 6.88806837411958e-06, + "loss": 0.0713, + "num_input_tokens_seen": 60936856, + "step": 105025 + }, + { + "epoch": 15.64343163538874, + "grad_norm": 2.9175120289437473e-05, + "learning_rate": 6.885828705145103e-06, + "loss": 0.0, + "num_input_tokens_seen": 60939768, + "step": 105030 + }, + { + "epoch": 15.6441763479297, + "grad_norm": 9.170806151814759e-05, + "learning_rate": 6.883589342191132e-06, + "loss": 0.0004, + "num_input_tokens_seen": 60942424, + "step": 105035 + }, + { + "epoch": 15.644921060470658, + "grad_norm": 0.0004058730264659971, + "learning_rate": 6.881350285295515e-06, + "loss": 0.0, + "num_input_tokens_seen": 60945400, + "step": 105040 + }, + { + "epoch": 15.645665773011618, + "grad_norm": 2.295803824381437e-05, + "learning_rate": 6.879111534496069e-06, + "loss": 0.0, + "num_input_tokens_seen": 60948440, + "step": 105045 + }, + { + "epoch": 15.646410485552577, + "grad_norm": 0.0005580263095907867, + "learning_rate": 6.87687308983061e-06, + "loss": 0.0, + "num_input_tokens_seen": 60951448, + "step": 105050 + }, + { + "epoch": 15.647155198093536, + "grad_norm": 0.0009334096685051918, + "learning_rate": 6.874634951336967e-06, + "loss": 0.0116, + "num_input_tokens_seen": 60954328, + "step": 105055 + }, + { + "epoch": 15.647899910634495, + "grad_norm": 3.445737820584327e-05, + "learning_rate": 6.872397119052937e-06, + "loss": 0.0033, + "num_input_tokens_seen": 60957368, + "step": 105060 + }, + { + "epoch": 15.648644623175453, + "grad_norm": 0.0011965119047090411, + "learning_rate": 6.870159593016343e-06, + "loss": 0.0071, + "num_input_tokens_seen": 60960600, + "step": 105065 + }, + { + "epoch": 15.649389335716414, + "grad_norm": 8.190815424313769e-05, + "learning_rate": 6.867922373264968e-06, + "loss": 0.0, + "num_input_tokens_seen": 60963576, + "step": 105070 + }, + { + "epoch": 15.650134048257373, + "grad_norm": 4.040567728225142e-05, + "learning_rate": 6.865685459836621e-06, + "loss": 0.0, + "num_input_tokens_seen": 60966392, + "step": 105075 + }, + { + "epoch": 15.650878760798332, + "grad_norm": 1.4171471775625832e-05, + "learning_rate": 6.8634488527690915e-06, + "loss": 0.0, + "num_input_tokens_seen": 60969272, + "step": 105080 + }, + { + "epoch": 15.65162347333929, + "grad_norm": 0.00018702751549426466, + "learning_rate": 6.861212552100149e-06, + "loss": 0.0, + "num_input_tokens_seen": 60972056, + "step": 105085 + }, + { + "epoch": 15.65236818588025, + "grad_norm": 6.253437459236011e-05, + "learning_rate": 6.858976557867594e-06, + "loss": 0.0, + "num_input_tokens_seen": 60975032, + "step": 105090 + }, + { + "epoch": 15.65311289842121, + "grad_norm": 0.00012060355220455676, + "learning_rate": 6.856740870109194e-06, + "loss": 0.0, + "num_input_tokens_seen": 60977912, + "step": 105095 + }, + { + "epoch": 15.653857610962168, + "grad_norm": 0.00020901170501019806, + "learning_rate": 6.854505488862714e-06, + "loss": 0.0002, + "num_input_tokens_seen": 60980792, + "step": 105100 + }, + { + "epoch": 15.654602323503127, + "grad_norm": 0.010333728045225143, + "learning_rate": 6.852270414165915e-06, + "loss": 0.1008, + "num_input_tokens_seen": 60983736, + "step": 105105 + }, + { + "epoch": 15.655347036044088, + "grad_norm": 0.0020468488801270723, + "learning_rate": 6.850035646056571e-06, + "loss": 0.0, + "num_input_tokens_seen": 60986840, + "step": 105110 + }, + { + "epoch": 15.656091748585046, + "grad_norm": 0.0001413302670698613, + "learning_rate": 6.847801184572422e-06, + "loss": 0.0, + "num_input_tokens_seen": 60989720, + "step": 105115 + }, + { + "epoch": 15.656836461126005, + "grad_norm": 5.879069249203894e-06, + "learning_rate": 6.845567029751229e-06, + "loss": 0.0, + "num_input_tokens_seen": 60992824, + "step": 105120 + }, + { + "epoch": 15.657581173666964, + "grad_norm": 0.0007809415110386908, + "learning_rate": 6.843333181630729e-06, + "loss": 0.0, + "num_input_tokens_seen": 60995704, + "step": 105125 + }, + { + "epoch": 15.658325886207924, + "grad_norm": 3.121732152067125e-05, + "learning_rate": 6.841099640248655e-06, + "loss": 0.0, + "num_input_tokens_seen": 60998744, + "step": 105130 + }, + { + "epoch": 15.659070598748883, + "grad_norm": 3.966343865613453e-05, + "learning_rate": 6.838866405642752e-06, + "loss": 0.0002, + "num_input_tokens_seen": 61001368, + "step": 105135 + }, + { + "epoch": 15.659815311289842, + "grad_norm": 0.004412936046719551, + "learning_rate": 6.836633477850737e-06, + "loss": 0.0, + "num_input_tokens_seen": 61004696, + "step": 105140 + }, + { + "epoch": 15.6605600238308, + "grad_norm": 0.013600755482912064, + "learning_rate": 6.834400856910348e-06, + "loss": 0.0002, + "num_input_tokens_seen": 61007736, + "step": 105145 + }, + { + "epoch": 15.661304736371761, + "grad_norm": 0.0010017885360866785, + "learning_rate": 6.832168542859283e-06, + "loss": 0.0316, + "num_input_tokens_seen": 61010712, + "step": 105150 + }, + { + "epoch": 15.66204944891272, + "grad_norm": 0.0003392221697140485, + "learning_rate": 6.829936535735273e-06, + "loss": 0.0126, + "num_input_tokens_seen": 61013432, + "step": 105155 + }, + { + "epoch": 15.662794161453679, + "grad_norm": 0.00023492933542001992, + "learning_rate": 6.827704835576021e-06, + "loss": 0.0, + "num_input_tokens_seen": 61016120, + "step": 105160 + }, + { + "epoch": 15.663538873994638, + "grad_norm": 0.0003351316845510155, + "learning_rate": 6.8254734424192255e-06, + "loss": 0.0, + "num_input_tokens_seen": 61019032, + "step": 105165 + }, + { + "epoch": 15.664283586535598, + "grad_norm": 0.0001207431050715968, + "learning_rate": 6.823242356302584e-06, + "loss": 0.0, + "num_input_tokens_seen": 61021912, + "step": 105170 + }, + { + "epoch": 15.665028299076557, + "grad_norm": 0.0001162219705292955, + "learning_rate": 6.821011577263781e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61024888, + "step": 105175 + }, + { + "epoch": 15.665773011617516, + "grad_norm": 0.00023922986292745918, + "learning_rate": 6.8187811053405185e-06, + "loss": 0.0, + "num_input_tokens_seen": 61027544, + "step": 105180 + }, + { + "epoch": 15.666517724158474, + "grad_norm": 4.619709216058254e-05, + "learning_rate": 6.816550940570463e-06, + "loss": 0.0, + "num_input_tokens_seen": 61030520, + "step": 105185 + }, + { + "epoch": 15.667262436699435, + "grad_norm": 0.00022148406424093992, + "learning_rate": 6.8143210829913065e-06, + "loss": 0.0, + "num_input_tokens_seen": 61033528, + "step": 105190 + }, + { + "epoch": 15.668007149240394, + "grad_norm": 5.807786146760918e-05, + "learning_rate": 6.812091532640705e-06, + "loss": 0.0, + "num_input_tokens_seen": 61036568, + "step": 105195 + }, + { + "epoch": 15.668751861781352, + "grad_norm": 1.4602764167648274e-05, + "learning_rate": 6.80986228955634e-06, + "loss": 0.0011, + "num_input_tokens_seen": 61039352, + "step": 105200 + }, + { + "epoch": 15.669496574322311, + "grad_norm": 0.00013509904965758324, + "learning_rate": 6.807633353775861e-06, + "loss": 0.0, + "num_input_tokens_seen": 61042296, + "step": 105205 + }, + { + "epoch": 15.67024128686327, + "grad_norm": 0.00046436523552984, + "learning_rate": 6.805404725336923e-06, + "loss": 0.0666, + "num_input_tokens_seen": 61045432, + "step": 105210 + }, + { + "epoch": 15.67098599940423, + "grad_norm": 7.901390745246317e-06, + "learning_rate": 6.803176404277184e-06, + "loss": 0.0, + "num_input_tokens_seen": 61048056, + "step": 105215 + }, + { + "epoch": 15.67173071194519, + "grad_norm": 0.015025981701910496, + "learning_rate": 6.800948390634279e-06, + "loss": 0.0, + "num_input_tokens_seen": 61051288, + "step": 105220 + }, + { + "epoch": 15.672475424486148, + "grad_norm": 0.012109840288758278, + "learning_rate": 6.798720684445861e-06, + "loss": 0.0, + "num_input_tokens_seen": 61054072, + "step": 105225 + }, + { + "epoch": 15.673220137027108, + "grad_norm": 1.2613621038326528e-05, + "learning_rate": 6.796493285749561e-06, + "loss": 0.0, + "num_input_tokens_seen": 61056952, + "step": 105230 + }, + { + "epoch": 15.673964849568067, + "grad_norm": 5.687935117748566e-05, + "learning_rate": 6.794266194583005e-06, + "loss": 0.0, + "num_input_tokens_seen": 61059576, + "step": 105235 + }, + { + "epoch": 15.674709562109026, + "grad_norm": 0.0003034943947568536, + "learning_rate": 6.792039410983817e-06, + "loss": 0.0, + "num_input_tokens_seen": 61062584, + "step": 105240 + }, + { + "epoch": 15.675454274649985, + "grad_norm": 3.7260077078826725e-05, + "learning_rate": 6.789812934989609e-06, + "loss": 0.0, + "num_input_tokens_seen": 61065624, + "step": 105245 + }, + { + "epoch": 15.676198987190944, + "grad_norm": 0.0001756451529217884, + "learning_rate": 6.78758676663801e-06, + "loss": 0.0, + "num_input_tokens_seen": 61068600, + "step": 105250 + }, + { + "epoch": 15.676943699731904, + "grad_norm": 0.0010078160557895899, + "learning_rate": 6.785360905966617e-06, + "loss": 0.0, + "num_input_tokens_seen": 61071512, + "step": 105255 + }, + { + "epoch": 15.677688412272863, + "grad_norm": 0.0013164146803319454, + "learning_rate": 6.783135353013045e-06, + "loss": 0.022, + "num_input_tokens_seen": 61074680, + "step": 105260 + }, + { + "epoch": 15.678433124813822, + "grad_norm": 5.394523395807482e-05, + "learning_rate": 6.7809101078148805e-06, + "loss": 0.0, + "num_input_tokens_seen": 61077784, + "step": 105265 + }, + { + "epoch": 15.67917783735478, + "grad_norm": 0.0001968185679288581, + "learning_rate": 6.7786851704097295e-06, + "loss": 0.0, + "num_input_tokens_seen": 61080824, + "step": 105270 + }, + { + "epoch": 15.67992254989574, + "grad_norm": 0.0005409190198406577, + "learning_rate": 6.776460540835167e-06, + "loss": 0.0, + "num_input_tokens_seen": 61084024, + "step": 105275 + }, + { + "epoch": 15.6806672624367, + "grad_norm": 1.0375886631663889e-05, + "learning_rate": 6.774236219128788e-06, + "loss": 0.0091, + "num_input_tokens_seen": 61087032, + "step": 105280 + }, + { + "epoch": 15.681411974977658, + "grad_norm": 450.40020751953125, + "learning_rate": 6.772012205328166e-06, + "loss": 0.0571, + "num_input_tokens_seen": 61089784, + "step": 105285 + }, + { + "epoch": 15.682156687518617, + "grad_norm": 5.374694592319429e-05, + "learning_rate": 6.769788499470861e-06, + "loss": 0.0, + "num_input_tokens_seen": 61092856, + "step": 105290 + }, + { + "epoch": 15.682901400059578, + "grad_norm": 5.723561844206415e-05, + "learning_rate": 6.767565101594461e-06, + "loss": 0.0, + "num_input_tokens_seen": 61095576, + "step": 105295 + }, + { + "epoch": 15.683646112600536, + "grad_norm": 0.0007965769618749619, + "learning_rate": 6.765342011736517e-06, + "loss": 0.0, + "num_input_tokens_seen": 61098424, + "step": 105300 + }, + { + "epoch": 15.684390825141495, + "grad_norm": 0.003742248984053731, + "learning_rate": 6.763119229934589e-06, + "loss": 0.0, + "num_input_tokens_seen": 61101624, + "step": 105305 + }, + { + "epoch": 15.685135537682454, + "grad_norm": 0.0001914871681947261, + "learning_rate": 6.760896756226215e-06, + "loss": 0.0, + "num_input_tokens_seen": 61104376, + "step": 105310 + }, + { + "epoch": 15.685880250223414, + "grad_norm": 4.206660742056556e-05, + "learning_rate": 6.758674590648964e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61107032, + "step": 105315 + }, + { + "epoch": 15.686624962764373, + "grad_norm": 2.080259218928404e-05, + "learning_rate": 6.756452733240365e-06, + "loss": 0.0, + "num_input_tokens_seen": 61110392, + "step": 105320 + }, + { + "epoch": 15.687369675305332, + "grad_norm": 0.00018966525385621935, + "learning_rate": 6.7542311840379455e-06, + "loss": 0.0285, + "num_input_tokens_seen": 61113272, + "step": 105325 + }, + { + "epoch": 15.68811438784629, + "grad_norm": 0.0003002431767527014, + "learning_rate": 6.752009943079257e-06, + "loss": 0.0, + "num_input_tokens_seen": 61115928, + "step": 105330 + }, + { + "epoch": 15.688859100387251, + "grad_norm": 3.5810669942293316e-05, + "learning_rate": 6.749789010401805e-06, + "loss": 0.0, + "num_input_tokens_seen": 61118712, + "step": 105335 + }, + { + "epoch": 15.68960381292821, + "grad_norm": 0.00034784767194651067, + "learning_rate": 6.7475683860431266e-06, + "loss": 0.0, + "num_input_tokens_seen": 61121496, + "step": 105340 + }, + { + "epoch": 15.690348525469169, + "grad_norm": 0.12908531725406647, + "learning_rate": 6.745348070040722e-06, + "loss": 0.0002, + "num_input_tokens_seen": 61124408, + "step": 105345 + }, + { + "epoch": 15.691093238010128, + "grad_norm": 0.0017235432751476765, + "learning_rate": 6.743128062432113e-06, + "loss": 0.0, + "num_input_tokens_seen": 61127544, + "step": 105350 + }, + { + "epoch": 15.691837950551088, + "grad_norm": 5.9826656070072204e-05, + "learning_rate": 6.740908363254805e-06, + "loss": 0.0, + "num_input_tokens_seen": 61130296, + "step": 105355 + }, + { + "epoch": 15.692582663092047, + "grad_norm": 8.425705163972452e-05, + "learning_rate": 6.7386889725462894e-06, + "loss": 0.0559, + "num_input_tokens_seen": 61133304, + "step": 105360 + }, + { + "epoch": 15.693327375633006, + "grad_norm": 3.947937875636853e-05, + "learning_rate": 6.736469890344058e-06, + "loss": 0.0, + "num_input_tokens_seen": 61136344, + "step": 105365 + }, + { + "epoch": 15.694072088173964, + "grad_norm": 0.00012172038987046108, + "learning_rate": 6.734251116685611e-06, + "loss": 0.0, + "num_input_tokens_seen": 61139224, + "step": 105370 + }, + { + "epoch": 15.694816800714925, + "grad_norm": 0.0007076215115375817, + "learning_rate": 6.732032651608427e-06, + "loss": 0.0, + "num_input_tokens_seen": 61142264, + "step": 105375 + }, + { + "epoch": 15.695561513255884, + "grad_norm": 0.0001226504537044093, + "learning_rate": 6.7298144951499774e-06, + "loss": 0.0, + "num_input_tokens_seen": 61145144, + "step": 105380 + }, + { + "epoch": 15.696306225796842, + "grad_norm": 2.1501538867596537e-05, + "learning_rate": 6.727596647347753e-06, + "loss": 0.0, + "num_input_tokens_seen": 61148216, + "step": 105385 + }, + { + "epoch": 15.697050938337801, + "grad_norm": 8.416073251282796e-05, + "learning_rate": 6.725379108239202e-06, + "loss": 0.0, + "num_input_tokens_seen": 61151160, + "step": 105390 + }, + { + "epoch": 15.69779565087876, + "grad_norm": 0.0006744179991073906, + "learning_rate": 6.723161877861805e-06, + "loss": 0.0, + "num_input_tokens_seen": 61154008, + "step": 105395 + }, + { + "epoch": 15.69854036341972, + "grad_norm": 0.0007761595770716667, + "learning_rate": 6.720944956253012e-06, + "loss": 0.0, + "num_input_tokens_seen": 61156920, + "step": 105400 + }, + { + "epoch": 15.69928507596068, + "grad_norm": 6.304655107669532e-05, + "learning_rate": 6.71872834345027e-06, + "loss": 0.0, + "num_input_tokens_seen": 61159608, + "step": 105405 + }, + { + "epoch": 15.700029788501638, + "grad_norm": 4.720949073089287e-05, + "learning_rate": 6.716512039491038e-06, + "loss": 0.0, + "num_input_tokens_seen": 61162488, + "step": 105410 + }, + { + "epoch": 15.700774501042597, + "grad_norm": 9.470775694353506e-06, + "learning_rate": 6.714296044412746e-06, + "loss": 0.0, + "num_input_tokens_seen": 61165464, + "step": 105415 + }, + { + "epoch": 15.701519213583557, + "grad_norm": 0.001097892178222537, + "learning_rate": 6.712080358252845e-06, + "loss": 0.0, + "num_input_tokens_seen": 61168376, + "step": 105420 + }, + { + "epoch": 15.702263926124516, + "grad_norm": 0.0009687633137218654, + "learning_rate": 6.709864981048761e-06, + "loss": 0.0, + "num_input_tokens_seen": 61171320, + "step": 105425 + }, + { + "epoch": 15.703008638665475, + "grad_norm": 0.004237488377839327, + "learning_rate": 6.707649912837919e-06, + "loss": 0.001, + "num_input_tokens_seen": 61174232, + "step": 105430 + }, + { + "epoch": 15.703753351206434, + "grad_norm": 0.00011551623174455017, + "learning_rate": 6.70543515365773e-06, + "loss": 0.0, + "num_input_tokens_seen": 61177368, + "step": 105435 + }, + { + "epoch": 15.704498063747394, + "grad_norm": 1.7293352357228287e-05, + "learning_rate": 6.703220703545629e-06, + "loss": 0.0, + "num_input_tokens_seen": 61180280, + "step": 105440 + }, + { + "epoch": 15.705242776288353, + "grad_norm": 0.0011630620574578643, + "learning_rate": 6.701006562539019e-06, + "loss": 0.0007, + "num_input_tokens_seen": 61183480, + "step": 105445 + }, + { + "epoch": 15.705987488829312, + "grad_norm": 1.0878020475502126e-05, + "learning_rate": 6.698792730675296e-06, + "loss": 0.0, + "num_input_tokens_seen": 61186392, + "step": 105450 + }, + { + "epoch": 15.70673220137027, + "grad_norm": 0.0002548869524616748, + "learning_rate": 6.6965792079918765e-06, + "loss": 0.0, + "num_input_tokens_seen": 61189336, + "step": 105455 + }, + { + "epoch": 15.707476913911231, + "grad_norm": 0.0004063734959345311, + "learning_rate": 6.694365994526142e-06, + "loss": 0.0, + "num_input_tokens_seen": 61192216, + "step": 105460 + }, + { + "epoch": 15.70822162645219, + "grad_norm": 1.6741660833358765, + "learning_rate": 6.692153090315498e-06, + "loss": 0.0026, + "num_input_tokens_seen": 61194840, + "step": 105465 + }, + { + "epoch": 15.708966338993148, + "grad_norm": 0.018690189346671104, + "learning_rate": 6.689940495397309e-06, + "loss": 0.0, + "num_input_tokens_seen": 61197720, + "step": 105470 + }, + { + "epoch": 15.709711051534107, + "grad_norm": 0.0012097092112526298, + "learning_rate": 6.687728209808977e-06, + "loss": 0.0, + "num_input_tokens_seen": 61200824, + "step": 105475 + }, + { + "epoch": 15.710455764075068, + "grad_norm": 8.024841372389346e-05, + "learning_rate": 6.6855162335878626e-06, + "loss": 0.0, + "num_input_tokens_seen": 61204216, + "step": 105480 + }, + { + "epoch": 15.711200476616026, + "grad_norm": 0.28162291646003723, + "learning_rate": 6.683304566771331e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61207064, + "step": 105485 + }, + { + "epoch": 15.711945189156985, + "grad_norm": 0.0002920753904618323, + "learning_rate": 6.68109320939676e-06, + "loss": 0.0, + "num_input_tokens_seen": 61209944, + "step": 105490 + }, + { + "epoch": 15.712689901697944, + "grad_norm": 1.958379289135337e-05, + "learning_rate": 6.678882161501502e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61212664, + "step": 105495 + }, + { + "epoch": 15.713434614238905, + "grad_norm": 7.106174598447978e-05, + "learning_rate": 6.676671423122907e-06, + "loss": 0.0005, + "num_input_tokens_seen": 61215416, + "step": 105500 + }, + { + "epoch": 15.714179326779863, + "grad_norm": 8.701057959115133e-05, + "learning_rate": 6.674460994298317e-06, + "loss": 0.0, + "num_input_tokens_seen": 61218232, + "step": 105505 + }, + { + "epoch": 15.714924039320822, + "grad_norm": 2.0353809304651804e-05, + "learning_rate": 6.672250875065095e-06, + "loss": 0.0003, + "num_input_tokens_seen": 61221400, + "step": 105510 + }, + { + "epoch": 15.71566875186178, + "grad_norm": 0.0001354966516373679, + "learning_rate": 6.670041065460555e-06, + "loss": 0.0, + "num_input_tokens_seen": 61224248, + "step": 105515 + }, + { + "epoch": 15.716413464402741, + "grad_norm": 2.9619955967064016e-05, + "learning_rate": 6.667831565522051e-06, + "loss": 0.0002, + "num_input_tokens_seen": 61227480, + "step": 105520 + }, + { + "epoch": 15.7171581769437, + "grad_norm": 0.0002625377965159714, + "learning_rate": 6.665622375286901e-06, + "loss": 0.0, + "num_input_tokens_seen": 61230296, + "step": 105525 + }, + { + "epoch": 15.717902889484659, + "grad_norm": 0.00020209146896377206, + "learning_rate": 6.66341349479242e-06, + "loss": 0.0, + "num_input_tokens_seen": 61233016, + "step": 105530 + }, + { + "epoch": 15.718647602025618, + "grad_norm": 0.00023756877635605633, + "learning_rate": 6.661204924075937e-06, + "loss": 0.0, + "num_input_tokens_seen": 61236184, + "step": 105535 + }, + { + "epoch": 15.719392314566576, + "grad_norm": 11.53159236907959, + "learning_rate": 6.658996663174752e-06, + "loss": 0.1221, + "num_input_tokens_seen": 61238744, + "step": 105540 + }, + { + "epoch": 15.720137027107537, + "grad_norm": 0.002562494482845068, + "learning_rate": 6.656788712126183e-06, + "loss": 0.0, + "num_input_tokens_seen": 61241560, + "step": 105545 + }, + { + "epoch": 15.720881739648496, + "grad_norm": 0.003355900989845395, + "learning_rate": 6.654581070967519e-06, + "loss": 0.0, + "num_input_tokens_seen": 61244312, + "step": 105550 + }, + { + "epoch": 15.721626452189454, + "grad_norm": 0.0019108171109110117, + "learning_rate": 6.6523737397360705e-06, + "loss": 0.0, + "num_input_tokens_seen": 61247160, + "step": 105555 + }, + { + "epoch": 15.722371164730415, + "grad_norm": 5.268888344289735e-05, + "learning_rate": 6.65016671846912e-06, + "loss": 0.0, + "num_input_tokens_seen": 61250040, + "step": 105560 + }, + { + "epoch": 15.723115877271374, + "grad_norm": 0.004645011853426695, + "learning_rate": 6.647960007203952e-06, + "loss": 0.0, + "num_input_tokens_seen": 61252664, + "step": 105565 + }, + { + "epoch": 15.723860589812332, + "grad_norm": 0.00017020876111928374, + "learning_rate": 6.645753605977847e-06, + "loss": 0.0, + "num_input_tokens_seen": 61255352, + "step": 105570 + }, + { + "epoch": 15.724605302353291, + "grad_norm": 6.9976019859313965, + "learning_rate": 6.643547514828075e-06, + "loss": 0.0032, + "num_input_tokens_seen": 61258232, + "step": 105575 + }, + { + "epoch": 15.72535001489425, + "grad_norm": 6.930180097697303e-05, + "learning_rate": 6.641341733791917e-06, + "loss": 0.0, + "num_input_tokens_seen": 61261176, + "step": 105580 + }, + { + "epoch": 15.72609472743521, + "grad_norm": 1.598665585333947e-05, + "learning_rate": 6.639136262906625e-06, + "loss": 0.0, + "num_input_tokens_seen": 61263896, + "step": 105585 + }, + { + "epoch": 15.72683943997617, + "grad_norm": 0.11082340776920319, + "learning_rate": 6.636931102209471e-06, + "loss": 0.0, + "num_input_tokens_seen": 61266744, + "step": 105590 + }, + { + "epoch": 15.727584152517128, + "grad_norm": 0.004022948909550905, + "learning_rate": 6.634726251737697e-06, + "loss": 0.0861, + "num_input_tokens_seen": 61269400, + "step": 105595 + }, + { + "epoch": 15.728328865058087, + "grad_norm": 6.310812750598416e-05, + "learning_rate": 6.632521711528564e-06, + "loss": 0.0, + "num_input_tokens_seen": 61272280, + "step": 105600 + }, + { + "epoch": 15.729073577599047, + "grad_norm": 0.0002542755100876093, + "learning_rate": 6.630317481619308e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61275288, + "step": 105605 + }, + { + "epoch": 15.729818290140006, + "grad_norm": 0.0003826922329608351, + "learning_rate": 6.628113562047161e-06, + "loss": 0.0, + "num_input_tokens_seen": 61278104, + "step": 105610 + }, + { + "epoch": 15.730563002680965, + "grad_norm": 6.871936784591526e-05, + "learning_rate": 6.625909952849368e-06, + "loss": 0.0, + "num_input_tokens_seen": 61280856, + "step": 105615 + }, + { + "epoch": 15.731307715221924, + "grad_norm": 5.550109199248254e-05, + "learning_rate": 6.62370665406315e-06, + "loss": 0.0, + "num_input_tokens_seen": 61283832, + "step": 105620 + }, + { + "epoch": 15.732052427762884, + "grad_norm": 3.9636438486923e-06, + "learning_rate": 6.621503665725734e-06, + "loss": 0.0, + "num_input_tokens_seen": 61286776, + "step": 105625 + }, + { + "epoch": 15.732797140303843, + "grad_norm": 0.000347659777617082, + "learning_rate": 6.619300987874336e-06, + "loss": 0.0, + "num_input_tokens_seen": 61289400, + "step": 105630 + }, + { + "epoch": 15.733541852844802, + "grad_norm": 7.835999713279307e-05, + "learning_rate": 6.617098620546166e-06, + "loss": 0.0, + "num_input_tokens_seen": 61291992, + "step": 105635 + }, + { + "epoch": 15.73428656538576, + "grad_norm": 3.484501939965412e-05, + "learning_rate": 6.614896563778425e-06, + "loss": 0.0067, + "num_input_tokens_seen": 61294648, + "step": 105640 + }, + { + "epoch": 15.735031277926721, + "grad_norm": 0.00017596683755982667, + "learning_rate": 6.6126948176083284e-06, + "loss": 0.0, + "num_input_tokens_seen": 61297528, + "step": 105645 + }, + { + "epoch": 15.73577599046768, + "grad_norm": 0.00018299036310054362, + "learning_rate": 6.610493382073063e-06, + "loss": 0.0, + "num_input_tokens_seen": 61300152, + "step": 105650 + }, + { + "epoch": 15.736520703008638, + "grad_norm": 0.0026672978419810534, + "learning_rate": 6.6082922572098135e-06, + "loss": 0.0028, + "num_input_tokens_seen": 61303032, + "step": 105655 + }, + { + "epoch": 15.737265415549597, + "grad_norm": 0.0005008724401704967, + "learning_rate": 6.6060914430557845e-06, + "loss": 0.0, + "num_input_tokens_seen": 61306072, + "step": 105660 + }, + { + "epoch": 15.738010128090558, + "grad_norm": 4.7361114411614835e-05, + "learning_rate": 6.603890939648136e-06, + "loss": 0.0, + "num_input_tokens_seen": 61309176, + "step": 105665 + }, + { + "epoch": 15.738754840631517, + "grad_norm": 6.750435568392277e-05, + "learning_rate": 6.601690747024061e-06, + "loss": 0.0, + "num_input_tokens_seen": 61312184, + "step": 105670 + }, + { + "epoch": 15.739499553172475, + "grad_norm": 0.00020734354620799422, + "learning_rate": 6.599490865220714e-06, + "loss": 0.0, + "num_input_tokens_seen": 61315672, + "step": 105675 + }, + { + "epoch": 15.740244265713434, + "grad_norm": 1.0198246854997706e-05, + "learning_rate": 6.597291294275276e-06, + "loss": 0.0, + "num_input_tokens_seen": 61318488, + "step": 105680 + }, + { + "epoch": 15.740988978254395, + "grad_norm": 6.0027738072676584e-05, + "learning_rate": 6.595092034224898e-06, + "loss": 0.0, + "num_input_tokens_seen": 61321176, + "step": 105685 + }, + { + "epoch": 15.741733690795353, + "grad_norm": 0.0003939622547477484, + "learning_rate": 6.592893085106733e-06, + "loss": 0.0, + "num_input_tokens_seen": 61324152, + "step": 105690 + }, + { + "epoch": 15.742478403336312, + "grad_norm": 0.010030069388449192, + "learning_rate": 6.590694446957924e-06, + "loss": 0.0, + "num_input_tokens_seen": 61326904, + "step": 105695 + }, + { + "epoch": 15.74322311587727, + "grad_norm": 0.0003099566383752972, + "learning_rate": 6.588496119815629e-06, + "loss": 0.0, + "num_input_tokens_seen": 61329784, + "step": 105700 + }, + { + "epoch": 15.743967828418231, + "grad_norm": 0.00014176435070112348, + "learning_rate": 6.5862981037169816e-06, + "loss": 0.0, + "num_input_tokens_seen": 61332760, + "step": 105705 + }, + { + "epoch": 15.74471254095919, + "grad_norm": 0.00010978749924106523, + "learning_rate": 6.584100398699103e-06, + "loss": 0.0, + "num_input_tokens_seen": 61335544, + "step": 105710 + }, + { + "epoch": 15.745457253500149, + "grad_norm": 3.9889222534839064e-05, + "learning_rate": 6.581903004799139e-06, + "loss": 0.0, + "num_input_tokens_seen": 61338168, + "step": 105715 + }, + { + "epoch": 15.746201966041108, + "grad_norm": 0.00011571119830477983, + "learning_rate": 6.5797059220541965e-06, + "loss": 0.0, + "num_input_tokens_seen": 61341240, + "step": 105720 + }, + { + "epoch": 15.746946678582066, + "grad_norm": 5.9032456192653626e-05, + "learning_rate": 6.57750915050141e-06, + "loss": 0.0004, + "num_input_tokens_seen": 61343896, + "step": 105725 + }, + { + "epoch": 15.747691391123027, + "grad_norm": 0.0006023237365297973, + "learning_rate": 6.575312690177882e-06, + "loss": 0.0, + "num_input_tokens_seen": 61346488, + "step": 105730 + }, + { + "epoch": 15.748436103663986, + "grad_norm": 0.006837570108473301, + "learning_rate": 6.573116541120714e-06, + "loss": 0.0, + "num_input_tokens_seen": 61349368, + "step": 105735 + }, + { + "epoch": 15.749180816204944, + "grad_norm": 0.0004700675490312278, + "learning_rate": 6.570920703367017e-06, + "loss": 0.0, + "num_input_tokens_seen": 61352344, + "step": 105740 + }, + { + "epoch": 15.749925528745905, + "grad_norm": 1.8541251847636886e-05, + "learning_rate": 6.5687251769538795e-06, + "loss": 0.0, + "num_input_tokens_seen": 61354968, + "step": 105745 + }, + { + "epoch": 15.750670241286864, + "grad_norm": 0.0056242262944579124, + "learning_rate": 6.566529961918405e-06, + "loss": 0.0025, + "num_input_tokens_seen": 61357944, + "step": 105750 + }, + { + "epoch": 15.751414953827823, + "grad_norm": 0.0004899841733276844, + "learning_rate": 6.564335058297674e-06, + "loss": 0.0, + "num_input_tokens_seen": 61360728, + "step": 105755 + }, + { + "epoch": 15.752159666368781, + "grad_norm": 0.0004420671903062612, + "learning_rate": 6.56214046612876e-06, + "loss": 0.0, + "num_input_tokens_seen": 61363512, + "step": 105760 + }, + { + "epoch": 15.75290437890974, + "grad_norm": 3.374268089828547e-06, + "learning_rate": 6.5599461854487485e-06, + "loss": 0.0, + "num_input_tokens_seen": 61366392, + "step": 105765 + }, + { + "epoch": 15.7536490914507, + "grad_norm": 5.761007785797119, + "learning_rate": 6.557752216294693e-06, + "loss": 0.0032, + "num_input_tokens_seen": 61369304, + "step": 105770 + }, + { + "epoch": 15.75439380399166, + "grad_norm": 0.0015717780916020274, + "learning_rate": 6.555558558703681e-06, + "loss": 0.0, + "num_input_tokens_seen": 61372056, + "step": 105775 + }, + { + "epoch": 15.755138516532618, + "grad_norm": 3.106378790107556e-05, + "learning_rate": 6.5533652127127505e-06, + "loss": 0.0, + "num_input_tokens_seen": 61375160, + "step": 105780 + }, + { + "epoch": 15.755883229073577, + "grad_norm": 4.392941264086403e-06, + "learning_rate": 6.551172178358975e-06, + "loss": 0.0, + "num_input_tokens_seen": 61378168, + "step": 105785 + }, + { + "epoch": 15.756627941614537, + "grad_norm": 1.571172833791934e-05, + "learning_rate": 6.548979455679388e-06, + "loss": 0.0, + "num_input_tokens_seen": 61381016, + "step": 105790 + }, + { + "epoch": 15.757372654155496, + "grad_norm": 0.004448717460036278, + "learning_rate": 6.5467870447110475e-06, + "loss": 0.0003, + "num_input_tokens_seen": 61383928, + "step": 105795 + }, + { + "epoch": 15.758117366696455, + "grad_norm": 0.00012326144496910274, + "learning_rate": 6.544594945490978e-06, + "loss": 0.0, + "num_input_tokens_seen": 61386808, + "step": 105800 + }, + { + "epoch": 15.758862079237414, + "grad_norm": 0.0001187633752124384, + "learning_rate": 6.542403158056226e-06, + "loss": 0.0, + "num_input_tokens_seen": 61389528, + "step": 105805 + }, + { + "epoch": 15.759606791778374, + "grad_norm": 7.95559462858364e-05, + "learning_rate": 6.540211682443814e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61392536, + "step": 105810 + }, + { + "epoch": 15.760351504319333, + "grad_norm": 1.2140058970544487e-05, + "learning_rate": 6.5380205186907545e-06, + "loss": 0.0, + "num_input_tokens_seen": 61395448, + "step": 105815 + }, + { + "epoch": 15.761096216860292, + "grad_norm": 1.94507392734522e-05, + "learning_rate": 6.535829666834084e-06, + "loss": 0.0, + "num_input_tokens_seen": 61398008, + "step": 105820 + }, + { + "epoch": 15.76184092940125, + "grad_norm": 0.0023764462675899267, + "learning_rate": 6.533639126910804e-06, + "loss": 0.0, + "num_input_tokens_seen": 61400792, + "step": 105825 + }, + { + "epoch": 15.762585641942211, + "grad_norm": 5.3100488912605215e-06, + "learning_rate": 6.531448898957923e-06, + "loss": 0.0, + "num_input_tokens_seen": 61403512, + "step": 105830 + }, + { + "epoch": 15.76333035448317, + "grad_norm": 0.00013615807984024286, + "learning_rate": 6.529258983012437e-06, + "loss": 0.0, + "num_input_tokens_seen": 61406232, + "step": 105835 + }, + { + "epoch": 15.764075067024129, + "grad_norm": 0.001750321825966239, + "learning_rate": 6.527069379111353e-06, + "loss": 0.0, + "num_input_tokens_seen": 61409080, + "step": 105840 + }, + { + "epoch": 15.764819779565087, + "grad_norm": 0.0014620592119172215, + "learning_rate": 6.5248800872916574e-06, + "loss": 0.0, + "num_input_tokens_seen": 61411960, + "step": 105845 + }, + { + "epoch": 15.765564492106048, + "grad_norm": 1.0530889085202944e-05, + "learning_rate": 6.5226911075903255e-06, + "loss": 0.0, + "num_input_tokens_seen": 61414904, + "step": 105850 + }, + { + "epoch": 15.766309204647007, + "grad_norm": 7.560275077819824, + "learning_rate": 6.5205024400443575e-06, + "loss": 0.0376, + "num_input_tokens_seen": 61417496, + "step": 105855 + }, + { + "epoch": 15.767053917187965, + "grad_norm": 0.00035866021062247455, + "learning_rate": 6.51831408469071e-06, + "loss": 0.0, + "num_input_tokens_seen": 61420504, + "step": 105860 + }, + { + "epoch": 15.767798629728924, + "grad_norm": 0.0025343827437609434, + "learning_rate": 6.516126041566373e-06, + "loss": 0.0, + "num_input_tokens_seen": 61423608, + "step": 105865 + }, + { + "epoch": 15.768543342269885, + "grad_norm": 0.00021526686032302678, + "learning_rate": 6.51393831070829e-06, + "loss": 0.0, + "num_input_tokens_seen": 61426520, + "step": 105870 + }, + { + "epoch": 15.769288054810843, + "grad_norm": 0.0001065868855221197, + "learning_rate": 6.511750892153439e-06, + "loss": 0.0, + "num_input_tokens_seen": 61429176, + "step": 105875 + }, + { + "epoch": 15.770032767351802, + "grad_norm": 1.832179623306729e-05, + "learning_rate": 6.50956378593876e-06, + "loss": 0.0, + "num_input_tokens_seen": 61432120, + "step": 105880 + }, + { + "epoch": 15.770777479892761, + "grad_norm": 0.00010006062075262889, + "learning_rate": 6.507376992101214e-06, + "loss": 0.0, + "num_input_tokens_seen": 61434904, + "step": 105885 + }, + { + "epoch": 15.771522192433721, + "grad_norm": 0.0005070858169347048, + "learning_rate": 6.505190510677739e-06, + "loss": 0.0005, + "num_input_tokens_seen": 61437752, + "step": 105890 + }, + { + "epoch": 15.77226690497468, + "grad_norm": 192.6223907470703, + "learning_rate": 6.503004341705271e-06, + "loss": 0.2875, + "num_input_tokens_seen": 61440664, + "step": 105895 + }, + { + "epoch": 15.773011617515639, + "grad_norm": 0.0002967000473290682, + "learning_rate": 6.5008184852207475e-06, + "loss": 0.0, + "num_input_tokens_seen": 61443512, + "step": 105900 + }, + { + "epoch": 15.773756330056598, + "grad_norm": 0.00014819727221038193, + "learning_rate": 6.498632941261088e-06, + "loss": 0.0, + "num_input_tokens_seen": 61446456, + "step": 105905 + }, + { + "epoch": 15.774501042597556, + "grad_norm": 0.0015229200944304466, + "learning_rate": 6.496447709863227e-06, + "loss": 0.0, + "num_input_tokens_seen": 61449368, + "step": 105910 + }, + { + "epoch": 15.775245755138517, + "grad_norm": 2.4807932277326472e-05, + "learning_rate": 6.494262791064065e-06, + "loss": 0.0, + "num_input_tokens_seen": 61452248, + "step": 105915 + }, + { + "epoch": 15.775990467679476, + "grad_norm": 0.00016467935347463936, + "learning_rate": 6.4920781849005355e-06, + "loss": 0.0, + "num_input_tokens_seen": 61455064, + "step": 105920 + }, + { + "epoch": 15.776735180220435, + "grad_norm": 0.0002150628570234403, + "learning_rate": 6.489893891409535e-06, + "loss": 0.0, + "num_input_tokens_seen": 61457944, + "step": 105925 + }, + { + "epoch": 15.777479892761393, + "grad_norm": 2.368620152992662e-05, + "learning_rate": 6.487709910627957e-06, + "loss": 0.0, + "num_input_tokens_seen": 61460632, + "step": 105930 + }, + { + "epoch": 15.778224605302354, + "grad_norm": 3.3116562008217443e-06, + "learning_rate": 6.48552624259271e-06, + "loss": 0.0, + "num_input_tokens_seen": 61463288, + "step": 105935 + }, + { + "epoch": 15.778969317843313, + "grad_norm": 6.863762973807752e-05, + "learning_rate": 6.483342887340674e-06, + "loss": 0.0, + "num_input_tokens_seen": 61466392, + "step": 105940 + }, + { + "epoch": 15.779714030384271, + "grad_norm": 0.00018941341841127723, + "learning_rate": 6.481159844908746e-06, + "loss": 0.0, + "num_input_tokens_seen": 61469400, + "step": 105945 + }, + { + "epoch": 15.78045874292523, + "grad_norm": 8.298937609652057e-05, + "learning_rate": 6.478977115333796e-06, + "loss": 0.0, + "num_input_tokens_seen": 61472344, + "step": 105950 + }, + { + "epoch": 15.78120345546619, + "grad_norm": 0.00010692527575884014, + "learning_rate": 6.4767946986527105e-06, + "loss": 0.0, + "num_input_tokens_seen": 61475128, + "step": 105955 + }, + { + "epoch": 15.78194816800715, + "grad_norm": 9.46114869293524e-06, + "learning_rate": 6.474612594902351e-06, + "loss": 0.0, + "num_input_tokens_seen": 61478360, + "step": 105960 + }, + { + "epoch": 15.782692880548108, + "grad_norm": 5.110056008561514e-05, + "learning_rate": 6.472430804119584e-06, + "loss": 0.0, + "num_input_tokens_seen": 61481400, + "step": 105965 + }, + { + "epoch": 15.783437593089067, + "grad_norm": 0.0013223427813500166, + "learning_rate": 6.470249326341269e-06, + "loss": 0.0, + "num_input_tokens_seen": 61484184, + "step": 105970 + }, + { + "epoch": 15.784182305630027, + "grad_norm": 6.533015039167367e-06, + "learning_rate": 6.468068161604251e-06, + "loss": 0.0, + "num_input_tokens_seen": 61487256, + "step": 105975 + }, + { + "epoch": 15.784927018170986, + "grad_norm": 0.0009102378971874714, + "learning_rate": 6.465887309945393e-06, + "loss": 0.0, + "num_input_tokens_seen": 61490200, + "step": 105980 + }, + { + "epoch": 15.785671730711945, + "grad_norm": 4.224883468850749e-06, + "learning_rate": 6.4637067714015255e-06, + "loss": 0.0778, + "num_input_tokens_seen": 61493496, + "step": 105985 + }, + { + "epoch": 15.786416443252904, + "grad_norm": 0.00035901361843571067, + "learning_rate": 6.4615265460095e-06, + "loss": 0.0, + "num_input_tokens_seen": 61496088, + "step": 105990 + }, + { + "epoch": 15.787161155793864, + "grad_norm": 0.00037037901347503066, + "learning_rate": 6.459346633806132e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61499000, + "step": 105995 + }, + { + "epoch": 15.787905868334823, + "grad_norm": 2.0565257727866992e-05, + "learning_rate": 6.457167034828268e-06, + "loss": 0.0, + "num_input_tokens_seen": 61501784, + "step": 106000 + }, + { + "epoch": 15.788650580875782, + "grad_norm": 0.0003671915619634092, + "learning_rate": 6.45498774911272e-06, + "loss": 0.0, + "num_input_tokens_seen": 61504664, + "step": 106005 + }, + { + "epoch": 15.78939529341674, + "grad_norm": 0.0003785437729675323, + "learning_rate": 6.452808776696301e-06, + "loss": 0.0, + "num_input_tokens_seen": 61507480, + "step": 106010 + }, + { + "epoch": 15.790140005957701, + "grad_norm": 0.0044069490395486355, + "learning_rate": 6.450630117615833e-06, + "loss": 0.0, + "num_input_tokens_seen": 61510360, + "step": 106015 + }, + { + "epoch": 15.79088471849866, + "grad_norm": 0.00014530951739288867, + "learning_rate": 6.448451771908115e-06, + "loss": 0.0, + "num_input_tokens_seen": 61513336, + "step": 106020 + }, + { + "epoch": 15.791629431039619, + "grad_norm": 2.9532563985412708e-06, + "learning_rate": 6.446273739609943e-06, + "loss": 0.0, + "num_input_tokens_seen": 61516472, + "step": 106025 + }, + { + "epoch": 15.792374143580577, + "grad_norm": 0.00022083510702941567, + "learning_rate": 6.444096020758125e-06, + "loss": 0.0, + "num_input_tokens_seen": 61519480, + "step": 106030 + }, + { + "epoch": 15.793118856121538, + "grad_norm": 6.182795004860964e-06, + "learning_rate": 6.4419186153894475e-06, + "loss": 0.0, + "num_input_tokens_seen": 61521976, + "step": 106035 + }, + { + "epoch": 15.793863568662497, + "grad_norm": 0.0006465785554610193, + "learning_rate": 6.439741523540685e-06, + "loss": 0.0, + "num_input_tokens_seen": 61524760, + "step": 106040 + }, + { + "epoch": 15.794608281203455, + "grad_norm": 2.9115435609128326e-06, + "learning_rate": 6.437564745248634e-06, + "loss": 0.0, + "num_input_tokens_seen": 61527352, + "step": 106045 + }, + { + "epoch": 15.795352993744414, + "grad_norm": 9.154972758551594e-06, + "learning_rate": 6.435388280550062e-06, + "loss": 0.0, + "num_input_tokens_seen": 61530232, + "step": 106050 + }, + { + "epoch": 15.796097706285373, + "grad_norm": 6.854459206806496e-05, + "learning_rate": 6.4332121294817245e-06, + "loss": 0.0, + "num_input_tokens_seen": 61532888, + "step": 106055 + }, + { + "epoch": 15.796842418826333, + "grad_norm": 3.2355968869524077e-05, + "learning_rate": 6.431036292080409e-06, + "loss": 0.0, + "num_input_tokens_seen": 61535768, + "step": 106060 + }, + { + "epoch": 15.797587131367292, + "grad_norm": 5.074533692095429e-05, + "learning_rate": 6.428860768382855e-06, + "loss": 0.0, + "num_input_tokens_seen": 61538712, + "step": 106065 + }, + { + "epoch": 15.798331843908251, + "grad_norm": 5.049142419011332e-05, + "learning_rate": 6.42668555842583e-06, + "loss": 0.0, + "num_input_tokens_seen": 61541432, + "step": 106070 + }, + { + "epoch": 15.799076556449211, + "grad_norm": 3.899774674209766e-05, + "learning_rate": 6.4245106622460665e-06, + "loss": 0.0, + "num_input_tokens_seen": 61544024, + "step": 106075 + }, + { + "epoch": 15.79982126899017, + "grad_norm": 1.2752788279613014e-05, + "learning_rate": 6.422336079880325e-06, + "loss": 0.0, + "num_input_tokens_seen": 61546616, + "step": 106080 + }, + { + "epoch": 15.800565981531129, + "grad_norm": 4.473378339753253e-06, + "learning_rate": 6.420161811365336e-06, + "loss": 0.0, + "num_input_tokens_seen": 61549848, + "step": 106085 + }, + { + "epoch": 15.801310694072088, + "grad_norm": 0.00010065315291285515, + "learning_rate": 6.417987856737825e-06, + "loss": 0.0, + "num_input_tokens_seen": 61552536, + "step": 106090 + }, + { + "epoch": 15.802055406613047, + "grad_norm": 4.104950676264707e-06, + "learning_rate": 6.415814216034527e-06, + "loss": 0.0, + "num_input_tokens_seen": 61555320, + "step": 106095 + }, + { + "epoch": 15.802800119154007, + "grad_norm": 6.669030426564859e-06, + "learning_rate": 6.41364088929215e-06, + "loss": 0.0006, + "num_input_tokens_seen": 61558360, + "step": 106100 + }, + { + "epoch": 15.803544831694966, + "grad_norm": 4.874409114563605e-06, + "learning_rate": 6.4114678765474275e-06, + "loss": 0.0, + "num_input_tokens_seen": 61561176, + "step": 106105 + }, + { + "epoch": 15.804289544235925, + "grad_norm": 4.8941376007860526e-05, + "learning_rate": 6.409295177837058e-06, + "loss": 0.0, + "num_input_tokens_seen": 61564024, + "step": 106110 + }, + { + "epoch": 15.805034256776883, + "grad_norm": 4.5744076487608254e-05, + "learning_rate": 6.407122793197756e-06, + "loss": 0.0, + "num_input_tokens_seen": 61566808, + "step": 106115 + }, + { + "epoch": 15.805778969317844, + "grad_norm": 1.8007562175625935e-05, + "learning_rate": 6.404950722666211e-06, + "loss": 0.0003, + "num_input_tokens_seen": 61569528, + "step": 106120 + }, + { + "epoch": 15.806523681858803, + "grad_norm": 0.00012650770077016205, + "learning_rate": 6.402778966279133e-06, + "loss": 0.0, + "num_input_tokens_seen": 61572184, + "step": 106125 + }, + { + "epoch": 15.807268394399761, + "grad_norm": 1.9595810954342596e-05, + "learning_rate": 6.400607524073201e-06, + "loss": 0.0, + "num_input_tokens_seen": 61575352, + "step": 106130 + }, + { + "epoch": 15.80801310694072, + "grad_norm": 0.00013760758156422526, + "learning_rate": 6.3984363960850945e-06, + "loss": 0.0, + "num_input_tokens_seen": 61578328, + "step": 106135 + }, + { + "epoch": 15.80875781948168, + "grad_norm": 5.5375399824697524e-05, + "learning_rate": 6.396265582351508e-06, + "loss": 0.0, + "num_input_tokens_seen": 61581208, + "step": 106140 + }, + { + "epoch": 15.80950253202264, + "grad_norm": 3.283495607320219e-05, + "learning_rate": 6.394095082909099e-06, + "loss": 0.0, + "num_input_tokens_seen": 61584120, + "step": 106145 + }, + { + "epoch": 15.810247244563598, + "grad_norm": 0.0001826607040129602, + "learning_rate": 6.391924897794549e-06, + "loss": 0.0, + "num_input_tokens_seen": 61586744, + "step": 106150 + }, + { + "epoch": 15.810991957104557, + "grad_norm": 0.014007436111569405, + "learning_rate": 6.3897550270445165e-06, + "loss": 0.0, + "num_input_tokens_seen": 61589912, + "step": 106155 + }, + { + "epoch": 15.811736669645517, + "grad_norm": 0.00019475328736007214, + "learning_rate": 6.387585470695659e-06, + "loss": 0.0, + "num_input_tokens_seen": 61592760, + "step": 106160 + }, + { + "epoch": 15.812481382186476, + "grad_norm": 5.070079714641906e-05, + "learning_rate": 6.385416228784618e-06, + "loss": 0.0, + "num_input_tokens_seen": 61595736, + "step": 106165 + }, + { + "epoch": 15.813226094727435, + "grad_norm": 0.0003390810452401638, + "learning_rate": 6.383247301348061e-06, + "loss": 0.0, + "num_input_tokens_seen": 61598616, + "step": 106170 + }, + { + "epoch": 15.813970807268394, + "grad_norm": 0.0020643831230700016, + "learning_rate": 6.381078688422617e-06, + "loss": 0.0975, + "num_input_tokens_seen": 61601752, + "step": 106175 + }, + { + "epoch": 15.814715519809354, + "grad_norm": 2.9641254513990134e-05, + "learning_rate": 6.3789103900449205e-06, + "loss": 0.0, + "num_input_tokens_seen": 61604664, + "step": 106180 + }, + { + "epoch": 15.815460232350313, + "grad_norm": 0.000489661528263241, + "learning_rate": 6.3767424062516155e-06, + "loss": 0.0, + "num_input_tokens_seen": 61607672, + "step": 106185 + }, + { + "epoch": 15.816204944891272, + "grad_norm": 2.267972558911424e-05, + "learning_rate": 6.374574737079309e-06, + "loss": 0.0, + "num_input_tokens_seen": 61610904, + "step": 106190 + }, + { + "epoch": 15.81694965743223, + "grad_norm": 0.0001429601397830993, + "learning_rate": 6.372407382564641e-06, + "loss": 0.0, + "num_input_tokens_seen": 61613720, + "step": 106195 + }, + { + "epoch": 15.817694369973191, + "grad_norm": 0.008094409480690956, + "learning_rate": 6.37024034274421e-06, + "loss": 0.1037, + "num_input_tokens_seen": 61616504, + "step": 106200 + }, + { + "epoch": 15.81843908251415, + "grad_norm": 0.006436481606215239, + "learning_rate": 6.368073617654643e-06, + "loss": 0.0, + "num_input_tokens_seen": 61619608, + "step": 106205 + }, + { + "epoch": 15.819183795055109, + "grad_norm": 0.00018462372827343643, + "learning_rate": 6.365907207332536e-06, + "loss": 0.0, + "num_input_tokens_seen": 61622520, + "step": 106210 + }, + { + "epoch": 15.819928507596067, + "grad_norm": 0.000248292664764449, + "learning_rate": 6.3637411118144776e-06, + "loss": 0.0, + "num_input_tokens_seen": 61625464, + "step": 106215 + }, + { + "epoch": 15.820673220137028, + "grad_norm": 0.0019932007417082787, + "learning_rate": 6.361575331137082e-06, + "loss": 0.0, + "num_input_tokens_seen": 61628184, + "step": 106220 + }, + { + "epoch": 15.821417932677987, + "grad_norm": 0.003153536468744278, + "learning_rate": 6.359409865336927e-06, + "loss": 0.0, + "num_input_tokens_seen": 61631000, + "step": 106225 + }, + { + "epoch": 15.822162645218945, + "grad_norm": 3.745334879567963e-06, + "learning_rate": 6.357244714450597e-06, + "loss": 0.0, + "num_input_tokens_seen": 61633752, + "step": 106230 + }, + { + "epoch": 15.822907357759904, + "grad_norm": 2.6236893972964026e-05, + "learning_rate": 6.355079878514661e-06, + "loss": 0.0, + "num_input_tokens_seen": 61636600, + "step": 106235 + }, + { + "epoch": 15.823652070300863, + "grad_norm": 0.0001647731987759471, + "learning_rate": 6.352915357565712e-06, + "loss": 0.0, + "num_input_tokens_seen": 61639288, + "step": 106240 + }, + { + "epoch": 15.824396782841823, + "grad_norm": 0.0009598436881788075, + "learning_rate": 6.350751151640294e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61642424, + "step": 106245 + }, + { + "epoch": 15.825141495382782, + "grad_norm": 0.00018322662799619138, + "learning_rate": 6.348587260774991e-06, + "loss": 0.0003, + "num_input_tokens_seen": 61645176, + "step": 106250 + }, + { + "epoch": 15.825886207923741, + "grad_norm": 0.0004242541326675564, + "learning_rate": 6.346423685006348e-06, + "loss": 0.0, + "num_input_tokens_seen": 61648472, + "step": 106255 + }, + { + "epoch": 15.826630920464702, + "grad_norm": 8.200120646506548e-06, + "learning_rate": 6.344260424370912e-06, + "loss": 0.0, + "num_input_tokens_seen": 61651512, + "step": 106260 + }, + { + "epoch": 15.82737563300566, + "grad_norm": 28.508499145507812, + "learning_rate": 6.342097478905243e-06, + "loss": 0.106, + "num_input_tokens_seen": 61654360, + "step": 106265 + }, + { + "epoch": 15.828120345546619, + "grad_norm": 0.0006429634522646666, + "learning_rate": 6.339934848645868e-06, + "loss": 0.0, + "num_input_tokens_seen": 61657144, + "step": 106270 + }, + { + "epoch": 15.828865058087578, + "grad_norm": 6.843156006652862e-05, + "learning_rate": 6.337772533629333e-06, + "loss": 0.0, + "num_input_tokens_seen": 61659992, + "step": 106275 + }, + { + "epoch": 15.829609770628537, + "grad_norm": 9.648481864132918e-06, + "learning_rate": 6.335610533892156e-06, + "loss": 0.0, + "num_input_tokens_seen": 61662552, + "step": 106280 + }, + { + "epoch": 15.830354483169497, + "grad_norm": 3.0932282243156806e-05, + "learning_rate": 6.33344884947088e-06, + "loss": 0.0, + "num_input_tokens_seen": 61665560, + "step": 106285 + }, + { + "epoch": 15.831099195710456, + "grad_norm": 2.9779916076222435e-05, + "learning_rate": 6.331287480402012e-06, + "loss": 0.0, + "num_input_tokens_seen": 61668216, + "step": 106290 + }, + { + "epoch": 15.831843908251415, + "grad_norm": 3.672082311823033e-05, + "learning_rate": 6.329126426722068e-06, + "loss": 0.0, + "num_input_tokens_seen": 61671192, + "step": 106295 + }, + { + "epoch": 15.832588620792373, + "grad_norm": 4.03803787776269e-05, + "learning_rate": 6.326965688467557e-06, + "loss": 0.0, + "num_input_tokens_seen": 61674104, + "step": 106300 + }, + { + "epoch": 15.833333333333334, + "grad_norm": 8.65447236719774e-06, + "learning_rate": 6.324805265674974e-06, + "loss": 0.0, + "num_input_tokens_seen": 61677176, + "step": 106305 + }, + { + "epoch": 15.834078045874293, + "grad_norm": 7.490635198337259e-06, + "learning_rate": 6.322645158380833e-06, + "loss": 0.0, + "num_input_tokens_seen": 61679928, + "step": 106310 + }, + { + "epoch": 15.834822758415251, + "grad_norm": 1.0770200788101647e-05, + "learning_rate": 6.320485366621612e-06, + "loss": 0.0, + "num_input_tokens_seen": 61682840, + "step": 106315 + }, + { + "epoch": 15.83556747095621, + "grad_norm": 0.0006040657754056156, + "learning_rate": 6.318325890433813e-06, + "loss": 0.0, + "num_input_tokens_seen": 61685656, + "step": 106320 + }, + { + "epoch": 15.83631218349717, + "grad_norm": 0.0003924363700207323, + "learning_rate": 6.316166729853906e-06, + "loss": 0.0, + "num_input_tokens_seen": 61688312, + "step": 106325 + }, + { + "epoch": 15.83705689603813, + "grad_norm": 0.0002274183789268136, + "learning_rate": 6.314007884918377e-06, + "loss": 0.0, + "num_input_tokens_seen": 61691288, + "step": 106330 + }, + { + "epoch": 15.837801608579088, + "grad_norm": 0.00016891161794774234, + "learning_rate": 6.311849355663693e-06, + "loss": 0.0, + "num_input_tokens_seen": 61694296, + "step": 106335 + }, + { + "epoch": 15.838546321120047, + "grad_norm": 0.00011335233284626156, + "learning_rate": 6.309691142126315e-06, + "loss": 0.0, + "num_input_tokens_seen": 61697336, + "step": 106340 + }, + { + "epoch": 15.839291033661008, + "grad_norm": 4.083761086803861e-05, + "learning_rate": 6.307533244342717e-06, + "loss": 0.0, + "num_input_tokens_seen": 61700504, + "step": 106345 + }, + { + "epoch": 15.840035746201966, + "grad_norm": 0.0002979069540742785, + "learning_rate": 6.305375662349344e-06, + "loss": 0.0, + "num_input_tokens_seen": 61703416, + "step": 106350 + }, + { + "epoch": 15.840780458742925, + "grad_norm": 0.0001483041123719886, + "learning_rate": 6.303218396182644e-06, + "loss": 0.0, + "num_input_tokens_seen": 61706232, + "step": 106355 + }, + { + "epoch": 15.841525171283884, + "grad_norm": 7.83806899562478e-05, + "learning_rate": 6.301061445879072e-06, + "loss": 0.0, + "num_input_tokens_seen": 61709304, + "step": 106360 + }, + { + "epoch": 15.842269883824844, + "grad_norm": 0.00032618766999803483, + "learning_rate": 6.298904811475062e-06, + "loss": 0.0, + "num_input_tokens_seen": 61712152, + "step": 106365 + }, + { + "epoch": 15.843014596365803, + "grad_norm": 1.1080295735155232e-05, + "learning_rate": 6.296748493007051e-06, + "loss": 0.0, + "num_input_tokens_seen": 61714904, + "step": 106370 + }, + { + "epoch": 15.843759308906762, + "grad_norm": 114.48766326904297, + "learning_rate": 6.294592490511456e-06, + "loss": 0.2094, + "num_input_tokens_seen": 61717784, + "step": 106375 + }, + { + "epoch": 15.84450402144772, + "grad_norm": 0.0009335039649158716, + "learning_rate": 6.292436804024715e-06, + "loss": 0.0, + "num_input_tokens_seen": 61720568, + "step": 106380 + }, + { + "epoch": 15.845248733988681, + "grad_norm": 7.02240940881893e-05, + "learning_rate": 6.290281433583237e-06, + "loss": 0.0, + "num_input_tokens_seen": 61723384, + "step": 106385 + }, + { + "epoch": 15.84599344652964, + "grad_norm": 0.00014277479203883559, + "learning_rate": 6.288126379223444e-06, + "loss": 0.0, + "num_input_tokens_seen": 61726264, + "step": 106390 + }, + { + "epoch": 15.846738159070599, + "grad_norm": 0.005120118148624897, + "learning_rate": 6.285971640981731e-06, + "loss": 0.0, + "num_input_tokens_seen": 61729144, + "step": 106395 + }, + { + "epoch": 15.847482871611557, + "grad_norm": 0.0014970989432185888, + "learning_rate": 6.283817218894514e-06, + "loss": 0.0, + "num_input_tokens_seen": 61732088, + "step": 106400 + }, + { + "epoch": 15.848227584152518, + "grad_norm": 1.639451875234954e-05, + "learning_rate": 6.281663112998174e-06, + "loss": 0.0209, + "num_input_tokens_seen": 61735288, + "step": 106405 + }, + { + "epoch": 15.848972296693477, + "grad_norm": 0.0007298276177607477, + "learning_rate": 6.2795093233291195e-06, + "loss": 0.0, + "num_input_tokens_seen": 61738488, + "step": 106410 + }, + { + "epoch": 15.849717009234435, + "grad_norm": 0.0003626323596108705, + "learning_rate": 6.27735584992373e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61741336, + "step": 106415 + }, + { + "epoch": 15.850461721775394, + "grad_norm": 0.0004129250592086464, + "learning_rate": 6.275202692818383e-06, + "loss": 0.0, + "num_input_tokens_seen": 61744344, + "step": 106420 + }, + { + "epoch": 15.851206434316353, + "grad_norm": 3.159945845254697e-05, + "learning_rate": 6.2730498520494565e-06, + "loss": 0.2733, + "num_input_tokens_seen": 61747192, + "step": 106425 + }, + { + "epoch": 15.851951146857314, + "grad_norm": 0.0003694627375807613, + "learning_rate": 6.27089732765331e-06, + "loss": 0.0002, + "num_input_tokens_seen": 61749816, + "step": 106430 + }, + { + "epoch": 15.852695859398272, + "grad_norm": 0.002376381540670991, + "learning_rate": 6.2687451196663275e-06, + "loss": 0.0, + "num_input_tokens_seen": 61752632, + "step": 106435 + }, + { + "epoch": 15.853440571939231, + "grad_norm": 0.0030664713121950626, + "learning_rate": 6.266593228124851e-06, + "loss": 0.0, + "num_input_tokens_seen": 61755512, + "step": 106440 + }, + { + "epoch": 15.85418528448019, + "grad_norm": 0.0007021179771982133, + "learning_rate": 6.264441653065248e-06, + "loss": 0.183, + "num_input_tokens_seen": 61758040, + "step": 106445 + }, + { + "epoch": 15.85492999702115, + "grad_norm": 0.002733756322413683, + "learning_rate": 6.262290394523862e-06, + "loss": 0.0, + "num_input_tokens_seen": 61760824, + "step": 106450 + }, + { + "epoch": 15.85567470956211, + "grad_norm": 8.37850893731229e-06, + "learning_rate": 6.260139452537028e-06, + "loss": 0.0, + "num_input_tokens_seen": 61763672, + "step": 106455 + }, + { + "epoch": 15.856419422103068, + "grad_norm": 3.1130998650041874e-06, + "learning_rate": 6.257988827141101e-06, + "loss": 0.0, + "num_input_tokens_seen": 61766424, + "step": 106460 + }, + { + "epoch": 15.857164134644027, + "grad_norm": 0.004466614220291376, + "learning_rate": 6.255838518372395e-06, + "loss": 0.0, + "num_input_tokens_seen": 61769272, + "step": 106465 + }, + { + "epoch": 15.857908847184987, + "grad_norm": 0.0005088744801469147, + "learning_rate": 6.253688526267254e-06, + "loss": 0.0, + "num_input_tokens_seen": 61772344, + "step": 106470 + }, + { + "epoch": 15.858653559725946, + "grad_norm": 0.008995178155601025, + "learning_rate": 6.251538850861985e-06, + "loss": 0.0, + "num_input_tokens_seen": 61774968, + "step": 106475 + }, + { + "epoch": 15.859398272266905, + "grad_norm": 3.8400699850171804e-05, + "learning_rate": 6.24938949219292e-06, + "loss": 0.0, + "num_input_tokens_seen": 61778136, + "step": 106480 + }, + { + "epoch": 15.860142984807863, + "grad_norm": 2.0222489833831787, + "learning_rate": 6.2472404502963625e-06, + "loss": 0.0059, + "num_input_tokens_seen": 61780952, + "step": 106485 + }, + { + "epoch": 15.860887697348824, + "grad_norm": 3.8770176615798846e-05, + "learning_rate": 6.245091725208616e-06, + "loss": 0.0, + "num_input_tokens_seen": 61783992, + "step": 106490 + }, + { + "epoch": 15.861632409889783, + "grad_norm": 1.732857163005974e-05, + "learning_rate": 6.242943316965985e-06, + "loss": 0.0, + "num_input_tokens_seen": 61786744, + "step": 106495 + }, + { + "epoch": 15.862377122430741, + "grad_norm": 0.0002602175227366388, + "learning_rate": 6.2407952256047565e-06, + "loss": 0.0, + "num_input_tokens_seen": 61789720, + "step": 106500 + }, + { + "epoch": 15.8631218349717, + "grad_norm": 2.932195639004931e-05, + "learning_rate": 6.238647451161231e-06, + "loss": 0.0, + "num_input_tokens_seen": 61792376, + "step": 106505 + }, + { + "epoch": 15.86386654751266, + "grad_norm": 0.00061200832715258, + "learning_rate": 6.2364999936716825e-06, + "loss": 0.0, + "num_input_tokens_seen": 61795192, + "step": 106510 + }, + { + "epoch": 15.86461126005362, + "grad_norm": 9.374663204653189e-05, + "learning_rate": 6.234352853172404e-06, + "loss": 0.0, + "num_input_tokens_seen": 61798168, + "step": 106515 + }, + { + "epoch": 15.865355972594578, + "grad_norm": 0.04180942848324776, + "learning_rate": 6.232206029699655e-06, + "loss": 0.0, + "num_input_tokens_seen": 61801240, + "step": 106520 + }, + { + "epoch": 15.866100685135537, + "grad_norm": 4.262031325197313e-06, + "learning_rate": 6.230059523289716e-06, + "loss": 0.0, + "num_input_tokens_seen": 61804184, + "step": 106525 + }, + { + "epoch": 15.866845397676498, + "grad_norm": 5.8314279158366844e-05, + "learning_rate": 6.227913333978847e-06, + "loss": 0.0, + "num_input_tokens_seen": 61807352, + "step": 106530 + }, + { + "epoch": 15.867590110217456, + "grad_norm": 0.000533802027348429, + "learning_rate": 6.225767461803295e-06, + "loss": 0.0, + "num_input_tokens_seen": 61810488, + "step": 106535 + }, + { + "epoch": 15.868334822758415, + "grad_norm": 1.7079884855775163e-05, + "learning_rate": 6.223621906799326e-06, + "loss": 0.0016, + "num_input_tokens_seen": 61813176, + "step": 106540 + }, + { + "epoch": 15.869079535299374, + "grad_norm": 3.1177110940916464e-05, + "learning_rate": 6.221476669003176e-06, + "loss": 0.0, + "num_input_tokens_seen": 61816216, + "step": 106545 + }, + { + "epoch": 15.869824247840334, + "grad_norm": 0.004451471380889416, + "learning_rate": 6.219331748451096e-06, + "loss": 0.0762, + "num_input_tokens_seen": 61819032, + "step": 106550 + }, + { + "epoch": 15.870568960381293, + "grad_norm": 0.00010413851123303175, + "learning_rate": 6.217187145179321e-06, + "loss": 0.0, + "num_input_tokens_seen": 61822040, + "step": 106555 + }, + { + "epoch": 15.871313672922252, + "grad_norm": 0.0002031376352533698, + "learning_rate": 6.2150428592240795e-06, + "loss": 0.0302, + "num_input_tokens_seen": 61824920, + "step": 106560 + }, + { + "epoch": 15.87205838546321, + "grad_norm": 4.823101335205138e-06, + "learning_rate": 6.212898890621588e-06, + "loss": 0.0, + "num_input_tokens_seen": 61827928, + "step": 106565 + }, + { + "epoch": 15.872803098004171, + "grad_norm": 1.4840694348094985e-05, + "learning_rate": 6.210755239408083e-06, + "loss": 0.0, + "num_input_tokens_seen": 61830936, + "step": 106570 + }, + { + "epoch": 15.87354781054513, + "grad_norm": 0.00023385290114674717, + "learning_rate": 6.208611905619774e-06, + "loss": 0.0, + "num_input_tokens_seen": 61833976, + "step": 106575 + }, + { + "epoch": 15.874292523086089, + "grad_norm": 2.7979474452877184e-06, + "learning_rate": 6.20646888929286e-06, + "loss": 0.0, + "num_input_tokens_seen": 61836728, + "step": 106580 + }, + { + "epoch": 15.875037235627047, + "grad_norm": 8.805455581750721e-05, + "learning_rate": 6.204326190463558e-06, + "loss": 0.0037, + "num_input_tokens_seen": 61839608, + "step": 106585 + }, + { + "epoch": 15.875781948168008, + "grad_norm": 0.009078203700482845, + "learning_rate": 6.202183809168058e-06, + "loss": 0.0, + "num_input_tokens_seen": 61842520, + "step": 106590 + }, + { + "epoch": 15.876526660708967, + "grad_norm": 3.3694461762934225e-06, + "learning_rate": 6.200041745442561e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61845496, + "step": 106595 + }, + { + "epoch": 15.877271373249926, + "grad_norm": 0.0022973918821662664, + "learning_rate": 6.197899999323245e-06, + "loss": 0.0, + "num_input_tokens_seen": 61848408, + "step": 106600 + }, + { + "epoch": 15.878016085790884, + "grad_norm": 172.9752197265625, + "learning_rate": 6.195758570846308e-06, + "loss": 0.0352, + "num_input_tokens_seen": 61851576, + "step": 106605 + }, + { + "epoch": 15.878760798331843, + "grad_norm": 2.2194761186256073e-05, + "learning_rate": 6.193617460047918e-06, + "loss": 0.0, + "num_input_tokens_seen": 61854424, + "step": 106610 + }, + { + "epoch": 15.879505510872804, + "grad_norm": 283.1442565917969, + "learning_rate": 6.191476666964238e-06, + "loss": 0.0284, + "num_input_tokens_seen": 61857336, + "step": 106615 + }, + { + "epoch": 15.880250223413762, + "grad_norm": 6.816009408794343e-05, + "learning_rate": 6.189336191631451e-06, + "loss": 0.0, + "num_input_tokens_seen": 61860024, + "step": 106620 + }, + { + "epoch": 15.880994935954721, + "grad_norm": 0.000215949461562559, + "learning_rate": 6.1871960340857126e-06, + "loss": 0.0, + "num_input_tokens_seen": 61863032, + "step": 106625 + }, + { + "epoch": 15.88173964849568, + "grad_norm": 1.8610233382787555e-05, + "learning_rate": 6.185056194363176e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61865880, + "step": 106630 + }, + { + "epoch": 15.88248436103664, + "grad_norm": 0.00010522521188249812, + "learning_rate": 6.182916672499983e-06, + "loss": 0.0, + "num_input_tokens_seen": 61868824, + "step": 106635 + }, + { + "epoch": 15.8832290735776, + "grad_norm": 8.927837916417047e-05, + "learning_rate": 6.180777468532298e-06, + "loss": 0.0, + "num_input_tokens_seen": 61871704, + "step": 106640 + }, + { + "epoch": 15.883973786118558, + "grad_norm": 2.1847632524440996e-05, + "learning_rate": 6.1786385824962436e-06, + "loss": 0.0, + "num_input_tokens_seen": 61874616, + "step": 106645 + }, + { + "epoch": 15.884718498659517, + "grad_norm": 5.7009299780474976e-05, + "learning_rate": 6.176500014427966e-06, + "loss": 0.0499, + "num_input_tokens_seen": 61877464, + "step": 106650 + }, + { + "epoch": 15.885463211200477, + "grad_norm": 0.00018613362044561654, + "learning_rate": 6.17436176436359e-06, + "loss": 0.0, + "num_input_tokens_seen": 61880440, + "step": 106655 + }, + { + "epoch": 15.886207923741436, + "grad_norm": 1.6031473933253437e-05, + "learning_rate": 6.1722238323392325e-06, + "loss": 0.0, + "num_input_tokens_seen": 61883384, + "step": 106660 + }, + { + "epoch": 15.886952636282395, + "grad_norm": 6.431732617784292e-05, + "learning_rate": 6.1700862183910245e-06, + "loss": 0.0, + "num_input_tokens_seen": 61886232, + "step": 106665 + }, + { + "epoch": 15.887697348823353, + "grad_norm": 0.0001653764775255695, + "learning_rate": 6.167948922555064e-06, + "loss": 0.0, + "num_input_tokens_seen": 61888824, + "step": 106670 + }, + { + "epoch": 15.888442061364314, + "grad_norm": 6.472048880823422e-06, + "learning_rate": 6.165811944867475e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61891416, + "step": 106675 + }, + { + "epoch": 15.889186773905273, + "grad_norm": 4.313541285227984e-05, + "learning_rate": 6.163675285364348e-06, + "loss": 0.0, + "num_input_tokens_seen": 61894360, + "step": 106680 + }, + { + "epoch": 15.889931486446232, + "grad_norm": 0.007543889340013266, + "learning_rate": 6.161538944081779e-06, + "loss": 0.0, + "num_input_tokens_seen": 61897048, + "step": 106685 + }, + { + "epoch": 15.89067619898719, + "grad_norm": 3.360733899171464e-05, + "learning_rate": 6.1594029210558675e-06, + "loss": 0.0, + "num_input_tokens_seen": 61899736, + "step": 106690 + }, + { + "epoch": 15.89142091152815, + "grad_norm": 0.00026076461654156446, + "learning_rate": 6.157267216322696e-06, + "loss": 0.0, + "num_input_tokens_seen": 61902552, + "step": 106695 + }, + { + "epoch": 15.89216562406911, + "grad_norm": 0.0023605194874107838, + "learning_rate": 6.155131829918345e-06, + "loss": 0.0, + "num_input_tokens_seen": 61905464, + "step": 106700 + }, + { + "epoch": 15.892910336610068, + "grad_norm": 0.003423417219892144, + "learning_rate": 6.1529967618788795e-06, + "loss": 0.0, + "num_input_tokens_seen": 61908344, + "step": 106705 + }, + { + "epoch": 15.893655049151027, + "grad_norm": 1.0685973393265158e-05, + "learning_rate": 6.1508620122403885e-06, + "loss": 0.0, + "num_input_tokens_seen": 61910968, + "step": 106710 + }, + { + "epoch": 15.894399761691988, + "grad_norm": 1.2976396646990906e-05, + "learning_rate": 6.148727581038915e-06, + "loss": 0.0, + "num_input_tokens_seen": 61913944, + "step": 106715 + }, + { + "epoch": 15.895144474232946, + "grad_norm": 0.0003982977941632271, + "learning_rate": 6.146593468310541e-06, + "loss": 0.0, + "num_input_tokens_seen": 61916504, + "step": 106720 + }, + { + "epoch": 15.895889186773905, + "grad_norm": 2.353560739720706e-05, + "learning_rate": 6.144459674091299e-06, + "loss": 0.0, + "num_input_tokens_seen": 61919160, + "step": 106725 + }, + { + "epoch": 15.896633899314864, + "grad_norm": 0.00041846843669191003, + "learning_rate": 6.1423261984172535e-06, + "loss": 0.0, + "num_input_tokens_seen": 61922168, + "step": 106730 + }, + { + "epoch": 15.897378611855824, + "grad_norm": 0.00014246191130951047, + "learning_rate": 6.140193041324444e-06, + "loss": 0.1283, + "num_input_tokens_seen": 61925176, + "step": 106735 + }, + { + "epoch": 15.898123324396783, + "grad_norm": 3.4796519230440026e-06, + "learning_rate": 6.138060202848894e-06, + "loss": 0.0002, + "num_input_tokens_seen": 61927864, + "step": 106740 + }, + { + "epoch": 15.898868036937742, + "grad_norm": 3.6028552131028846e-05, + "learning_rate": 6.135927683026654e-06, + "loss": 0.0004, + "num_input_tokens_seen": 61930648, + "step": 106745 + }, + { + "epoch": 15.8996127494787, + "grad_norm": 6.673180178040639e-05, + "learning_rate": 6.133795481893745e-06, + "loss": 0.0, + "num_input_tokens_seen": 61933464, + "step": 106750 + }, + { + "epoch": 15.90035746201966, + "grad_norm": 4.585467104334384e-05, + "learning_rate": 6.1316635994861875e-06, + "loss": 0.0003, + "num_input_tokens_seen": 61936568, + "step": 106755 + }, + { + "epoch": 15.90110217456062, + "grad_norm": 0.00011626402556430548, + "learning_rate": 6.129532035839985e-06, + "loss": 0.0, + "num_input_tokens_seen": 61939320, + "step": 106760 + }, + { + "epoch": 15.901846887101579, + "grad_norm": 0.0018109207740053535, + "learning_rate": 6.12740079099117e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61942072, + "step": 106765 + }, + { + "epoch": 15.902591599642538, + "grad_norm": 2.7691728973877616e-05, + "learning_rate": 6.12526986497573e-06, + "loss": 0.0, + "num_input_tokens_seen": 61944952, + "step": 106770 + }, + { + "epoch": 15.903336312183498, + "grad_norm": 1.0600942005112302e-05, + "learning_rate": 6.12313925782968e-06, + "loss": 0.0, + "num_input_tokens_seen": 61947640, + "step": 106775 + }, + { + "epoch": 15.904081024724457, + "grad_norm": 3.7990841519786045e-05, + "learning_rate": 6.1210089695890065e-06, + "loss": 0.0, + "num_input_tokens_seen": 61950584, + "step": 106780 + }, + { + "epoch": 15.904825737265416, + "grad_norm": 0.00014062461559660733, + "learning_rate": 6.11887900028969e-06, + "loss": 0.0, + "num_input_tokens_seen": 61953464, + "step": 106785 + }, + { + "epoch": 15.905570449806374, + "grad_norm": 5.29030330653768e-05, + "learning_rate": 6.116749349967732e-06, + "loss": 0.0, + "num_input_tokens_seen": 61956728, + "step": 106790 + }, + { + "epoch": 15.906315162347333, + "grad_norm": 0.00020804246014449745, + "learning_rate": 6.114620018659093e-06, + "loss": 0.0, + "num_input_tokens_seen": 61959960, + "step": 106795 + }, + { + "epoch": 15.907059874888294, + "grad_norm": 2.7826738005387597e-05, + "learning_rate": 6.112491006399762e-06, + "loss": 0.0, + "num_input_tokens_seen": 61962872, + "step": 106800 + }, + { + "epoch": 15.907804587429252, + "grad_norm": 9.211737051373348e-05, + "learning_rate": 6.110362313225693e-06, + "loss": 0.0, + "num_input_tokens_seen": 61965560, + "step": 106805 + }, + { + "epoch": 15.908549299970211, + "grad_norm": 2.476756435498828e-06, + "learning_rate": 6.108233939172858e-06, + "loss": 0.0, + "num_input_tokens_seen": 61968664, + "step": 106810 + }, + { + "epoch": 15.90929401251117, + "grad_norm": 0.6084028482437134, + "learning_rate": 6.106105884277213e-06, + "loss": 0.0003, + "num_input_tokens_seen": 61971416, + "step": 106815 + }, + { + "epoch": 15.91003872505213, + "grad_norm": 5.879880518477876e-06, + "learning_rate": 6.1039781485747045e-06, + "loss": 0.0, + "num_input_tokens_seen": 61974264, + "step": 106820 + }, + { + "epoch": 15.91078343759309, + "grad_norm": 9.071848762687296e-05, + "learning_rate": 6.101850732101283e-06, + "loss": 0.0001, + "num_input_tokens_seen": 61977048, + "step": 106825 + }, + { + "epoch": 15.911528150134048, + "grad_norm": 3.5666469102579867e-06, + "learning_rate": 6.099723634892876e-06, + "loss": 0.0, + "num_input_tokens_seen": 61979928, + "step": 106830 + }, + { + "epoch": 15.912272862675007, + "grad_norm": 0.0006385009619407356, + "learning_rate": 6.097596856985435e-06, + "loss": 0.0, + "num_input_tokens_seen": 61983160, + "step": 106835 + }, + { + "epoch": 15.913017575215967, + "grad_norm": 9.811846393859014e-05, + "learning_rate": 6.095470398414879e-06, + "loss": 0.0, + "num_input_tokens_seen": 61985976, + "step": 106840 + }, + { + "epoch": 15.913762287756926, + "grad_norm": 0.00026632871595211327, + "learning_rate": 6.093344259217143e-06, + "loss": 0.0, + "num_input_tokens_seen": 61988824, + "step": 106845 + }, + { + "epoch": 15.914507000297885, + "grad_norm": 8.769298437982798e-05, + "learning_rate": 6.091218439428134e-06, + "loss": 0.0, + "num_input_tokens_seen": 61991704, + "step": 106850 + }, + { + "epoch": 15.915251712838844, + "grad_norm": 0.00045780467917211354, + "learning_rate": 6.089092939083779e-06, + "loss": 0.0, + "num_input_tokens_seen": 61994680, + "step": 106855 + }, + { + "epoch": 15.915996425379804, + "grad_norm": 3.737608267329051e-06, + "learning_rate": 6.0869677582199775e-06, + "loss": 0.0, + "num_input_tokens_seen": 61997944, + "step": 106860 + }, + { + "epoch": 15.916741137920763, + "grad_norm": 1.074966257874621e-05, + "learning_rate": 6.084842896872625e-06, + "loss": 0.0, + "num_input_tokens_seen": 62000504, + "step": 106865 + }, + { + "epoch": 15.917485850461722, + "grad_norm": 4.626302688848227e-05, + "learning_rate": 6.082718355077635e-06, + "loss": 0.0, + "num_input_tokens_seen": 62003320, + "step": 106870 + }, + { + "epoch": 15.91823056300268, + "grad_norm": 5.721535671909805e-06, + "learning_rate": 6.080594132870885e-06, + "loss": 0.0, + "num_input_tokens_seen": 62006232, + "step": 106875 + }, + { + "epoch": 15.918975275543641, + "grad_norm": 1.3931684406998102e-05, + "learning_rate": 6.078470230288274e-06, + "loss": 0.0008, + "num_input_tokens_seen": 62009112, + "step": 106880 + }, + { + "epoch": 15.9197199880846, + "grad_norm": 8.546100434614345e-05, + "learning_rate": 6.07634664736568e-06, + "loss": 0.0, + "num_input_tokens_seen": 62011992, + "step": 106885 + }, + { + "epoch": 15.920464700625558, + "grad_norm": 4.206158337183297e-05, + "learning_rate": 6.074223384138977e-06, + "loss": 0.0, + "num_input_tokens_seen": 62015160, + "step": 106890 + }, + { + "epoch": 15.921209413166517, + "grad_norm": 1.4336620552057866e-05, + "learning_rate": 6.072100440644033e-06, + "loss": 0.0, + "num_input_tokens_seen": 62017848, + "step": 106895 + }, + { + "epoch": 15.921954125707478, + "grad_norm": 0.00813978724181652, + "learning_rate": 6.069977816916705e-06, + "loss": 0.0, + "num_input_tokens_seen": 62020824, + "step": 106900 + }, + { + "epoch": 15.922698838248436, + "grad_norm": 7.355096022365615e-05, + "learning_rate": 6.067855512992873e-06, + "loss": 0.0, + "num_input_tokens_seen": 62023800, + "step": 106905 + }, + { + "epoch": 15.923443550789395, + "grad_norm": 3.3091986551880836e-05, + "learning_rate": 6.065733528908371e-06, + "loss": 0.0, + "num_input_tokens_seen": 62026712, + "step": 106910 + }, + { + "epoch": 15.924188263330354, + "grad_norm": 0.0001547609135741368, + "learning_rate": 6.0636118646990644e-06, + "loss": 0.0, + "num_input_tokens_seen": 62029560, + "step": 106915 + }, + { + "epoch": 15.924932975871315, + "grad_norm": 1.956081177922897e-05, + "learning_rate": 6.061490520400784e-06, + "loss": 0.0, + "num_input_tokens_seen": 62032152, + "step": 106920 + }, + { + "epoch": 15.925677688412273, + "grad_norm": 0.00022752437507733703, + "learning_rate": 6.059369496049377e-06, + "loss": 0.0, + "num_input_tokens_seen": 62034936, + "step": 106925 + }, + { + "epoch": 15.926422400953232, + "grad_norm": 6.513446714961901e-05, + "learning_rate": 6.05724879168067e-06, + "loss": 0.0, + "num_input_tokens_seen": 62037816, + "step": 106930 + }, + { + "epoch": 15.92716711349419, + "grad_norm": 5.400428108259803e-06, + "learning_rate": 6.055128407330493e-06, + "loss": 0.0, + "num_input_tokens_seen": 62040632, + "step": 106935 + }, + { + "epoch": 15.92791182603515, + "grad_norm": 0.0004444392106961459, + "learning_rate": 6.053008343034671e-06, + "loss": 0.04, + "num_input_tokens_seen": 62043736, + "step": 106940 + }, + { + "epoch": 15.92865653857611, + "grad_norm": 0.0071306293830275536, + "learning_rate": 6.0508885988290075e-06, + "loss": 0.0, + "num_input_tokens_seen": 62046456, + "step": 106945 + }, + { + "epoch": 15.929401251117069, + "grad_norm": 0.0006036516861058772, + "learning_rate": 6.04876917474933e-06, + "loss": 0.0, + "num_input_tokens_seen": 62049304, + "step": 106950 + }, + { + "epoch": 15.930145963658028, + "grad_norm": 0.00010700395796447992, + "learning_rate": 6.046650070831436e-06, + "loss": 0.0, + "num_input_tokens_seen": 62052056, + "step": 106955 + }, + { + "epoch": 15.930890676198988, + "grad_norm": 1.2241727745276876e-05, + "learning_rate": 6.044531287111124e-06, + "loss": 0.0, + "num_input_tokens_seen": 62054840, + "step": 106960 + }, + { + "epoch": 15.931635388739947, + "grad_norm": 0.00010299179120920599, + "learning_rate": 6.042412823624186e-06, + "loss": 0.0, + "num_input_tokens_seen": 62057784, + "step": 106965 + }, + { + "epoch": 15.932380101280906, + "grad_norm": 2.338092599529773e-05, + "learning_rate": 6.040294680406422e-06, + "loss": 0.0, + "num_input_tokens_seen": 62060920, + "step": 106970 + }, + { + "epoch": 15.933124813821864, + "grad_norm": 0.0001185024666483514, + "learning_rate": 6.0381768574936104e-06, + "loss": 0.0002, + "num_input_tokens_seen": 62063608, + "step": 106975 + }, + { + "epoch": 15.933869526362823, + "grad_norm": 0.003089368809014559, + "learning_rate": 6.03605935492152e-06, + "loss": 0.0, + "num_input_tokens_seen": 62066552, + "step": 106980 + }, + { + "epoch": 15.934614238903784, + "grad_norm": 0.012050006538629532, + "learning_rate": 6.0339421727259395e-06, + "loss": 0.1226, + "num_input_tokens_seen": 62069720, + "step": 106985 + }, + { + "epoch": 15.935358951444742, + "grad_norm": 3.473165998002514e-05, + "learning_rate": 6.031825310942624e-06, + "loss": 0.1283, + "num_input_tokens_seen": 62072632, + "step": 106990 + }, + { + "epoch": 15.936103663985701, + "grad_norm": 2.441184187773615e-05, + "learning_rate": 6.029708769607348e-06, + "loss": 0.0, + "num_input_tokens_seen": 62075576, + "step": 106995 + }, + { + "epoch": 15.93684837652666, + "grad_norm": 7.622381235705689e-05, + "learning_rate": 6.027592548755853e-06, + "loss": 0.0, + "num_input_tokens_seen": 62078328, + "step": 107000 + }, + { + "epoch": 15.93759308906762, + "grad_norm": 7.019665190455271e-06, + "learning_rate": 6.025476648423908e-06, + "loss": 0.0, + "num_input_tokens_seen": 62081208, + "step": 107005 + }, + { + "epoch": 15.93833780160858, + "grad_norm": 0.00010465325613040477, + "learning_rate": 6.023361068647251e-06, + "loss": 0.0, + "num_input_tokens_seen": 62084248, + "step": 107010 + }, + { + "epoch": 15.939082514149538, + "grad_norm": 0.013950477354228497, + "learning_rate": 6.021245809461615e-06, + "loss": 0.0, + "num_input_tokens_seen": 62087096, + "step": 107015 + }, + { + "epoch": 15.939827226690497, + "grad_norm": 3.212948513464653e-06, + "learning_rate": 6.0191308709027475e-06, + "loss": 0.0, + "num_input_tokens_seen": 62090232, + "step": 107020 + }, + { + "epoch": 15.940571939231457, + "grad_norm": 0.0039502824656665325, + "learning_rate": 6.017016253006372e-06, + "loss": 0.0, + "num_input_tokens_seen": 62093240, + "step": 107025 + }, + { + "epoch": 15.941316651772416, + "grad_norm": 0.0007193674682639539, + "learning_rate": 6.014901955808216e-06, + "loss": 0.0, + "num_input_tokens_seen": 62095896, + "step": 107030 + }, + { + "epoch": 15.942061364313375, + "grad_norm": 0.0014985542511567473, + "learning_rate": 6.012787979343987e-06, + "loss": 0.0, + "num_input_tokens_seen": 62098776, + "step": 107035 + }, + { + "epoch": 15.942806076854334, + "grad_norm": 2.9117691155988723e-05, + "learning_rate": 6.010674323649415e-06, + "loss": 0.0, + "num_input_tokens_seen": 62101784, + "step": 107040 + }, + { + "epoch": 15.943550789395294, + "grad_norm": 0.0014372339937835932, + "learning_rate": 6.0085609887601925e-06, + "loss": 0.0, + "num_input_tokens_seen": 62104760, + "step": 107045 + }, + { + "epoch": 15.944295501936253, + "grad_norm": 3.6922974686603993e-06, + "learning_rate": 6.0064479747120375e-06, + "loss": 0.0528, + "num_input_tokens_seen": 62107928, + "step": 107050 + }, + { + "epoch": 15.945040214477212, + "grad_norm": 0.0003088769444730133, + "learning_rate": 6.004335281540641e-06, + "loss": 0.0, + "num_input_tokens_seen": 62110520, + "step": 107055 + }, + { + "epoch": 15.94578492701817, + "grad_norm": 1.4490708053926937e-05, + "learning_rate": 6.002222909281685e-06, + "loss": 0.0, + "num_input_tokens_seen": 62113656, + "step": 107060 + }, + { + "epoch": 15.946529639559131, + "grad_norm": 1.8577313312562183e-05, + "learning_rate": 6.000110857970873e-06, + "loss": 0.0, + "num_input_tokens_seen": 62116952, + "step": 107065 + }, + { + "epoch": 15.94727435210009, + "grad_norm": 0.0010499507188796997, + "learning_rate": 5.9979991276438695e-06, + "loss": 0.0, + "num_input_tokens_seen": 62119640, + "step": 107070 + }, + { + "epoch": 15.948019064641048, + "grad_norm": 1.7872395515441895, + "learning_rate": 5.995887718336363e-06, + "loss": 0.0022, + "num_input_tokens_seen": 62122296, + "step": 107075 + }, + { + "epoch": 15.948763777182007, + "grad_norm": 0.0008396044140681624, + "learning_rate": 5.993776630084022e-06, + "loss": 0.0, + "num_input_tokens_seen": 62125144, + "step": 107080 + }, + { + "epoch": 15.949508489722968, + "grad_norm": 7.253195235534804e-06, + "learning_rate": 5.991665862922505e-06, + "loss": 0.0, + "num_input_tokens_seen": 62128088, + "step": 107085 + }, + { + "epoch": 15.950253202263927, + "grad_norm": 9.792563105293084e-06, + "learning_rate": 5.989555416887469e-06, + "loss": 0.0, + "num_input_tokens_seen": 62130936, + "step": 107090 + }, + { + "epoch": 15.950997914804885, + "grad_norm": 4.76185159641318e-05, + "learning_rate": 5.987445292014579e-06, + "loss": 0.0, + "num_input_tokens_seen": 62133848, + "step": 107095 + }, + { + "epoch": 15.951742627345844, + "grad_norm": 6.863763246656163e-06, + "learning_rate": 5.985335488339477e-06, + "loss": 0.0, + "num_input_tokens_seen": 62136792, + "step": 107100 + }, + { + "epoch": 15.952487339886805, + "grad_norm": 0.02930859662592411, + "learning_rate": 5.983226005897799e-06, + "loss": 0.0119, + "num_input_tokens_seen": 62139640, + "step": 107105 + }, + { + "epoch": 15.953232052427763, + "grad_norm": 0.0013249394251033664, + "learning_rate": 5.981116844725199e-06, + "loss": 0.0, + "num_input_tokens_seen": 62142520, + "step": 107110 + }, + { + "epoch": 15.953976764968722, + "grad_norm": 8.790631000010762e-06, + "learning_rate": 5.979008004857292e-06, + "loss": 0.0189, + "num_input_tokens_seen": 62145240, + "step": 107115 + }, + { + "epoch": 15.95472147750968, + "grad_norm": 0.00011715656728483737, + "learning_rate": 5.976899486329718e-06, + "loss": 0.0, + "num_input_tokens_seen": 62148312, + "step": 107120 + }, + { + "epoch": 15.95546619005064, + "grad_norm": 0.00010521871445234865, + "learning_rate": 5.974791289178089e-06, + "loss": 0.0, + "num_input_tokens_seen": 62151064, + "step": 107125 + }, + { + "epoch": 15.9562109025916, + "grad_norm": 5.101775059301872e-06, + "learning_rate": 5.972683413438029e-06, + "loss": 0.0, + "num_input_tokens_seen": 62153848, + "step": 107130 + }, + { + "epoch": 15.956955615132559, + "grad_norm": 1.1094528436660767, + "learning_rate": 5.970575859145144e-06, + "loss": 0.0013, + "num_input_tokens_seen": 62156760, + "step": 107135 + }, + { + "epoch": 15.957700327673518, + "grad_norm": 0.0003384553419891745, + "learning_rate": 5.968468626335033e-06, + "loss": 0.0, + "num_input_tokens_seen": 62159544, + "step": 107140 + }, + { + "epoch": 15.958445040214476, + "grad_norm": 0.00013969201245345175, + "learning_rate": 5.966361715043312e-06, + "loss": 0.0, + "num_input_tokens_seen": 62162456, + "step": 107145 + }, + { + "epoch": 15.959189752755437, + "grad_norm": 5.9251574384688865e-06, + "learning_rate": 5.96425512530556e-06, + "loss": 0.0, + "num_input_tokens_seen": 62165400, + "step": 107150 + }, + { + "epoch": 15.959934465296396, + "grad_norm": 0.00010849049431271851, + "learning_rate": 5.962148857157373e-06, + "loss": 0.0, + "num_input_tokens_seen": 62168152, + "step": 107155 + }, + { + "epoch": 15.960679177837354, + "grad_norm": 2.2808930225437507e-05, + "learning_rate": 5.960042910634325e-06, + "loss": 0.0, + "num_input_tokens_seen": 62170840, + "step": 107160 + }, + { + "epoch": 15.961423890378313, + "grad_norm": 3.433802703511901e-06, + "learning_rate": 5.9579372857720085e-06, + "loss": 0.0, + "num_input_tokens_seen": 62173592, + "step": 107165 + }, + { + "epoch": 15.962168602919274, + "grad_norm": 0.0002222574985353276, + "learning_rate": 5.9558319826059775e-06, + "loss": 0.0732, + "num_input_tokens_seen": 62176568, + "step": 107170 + }, + { + "epoch": 15.962913315460233, + "grad_norm": 0.023711973801255226, + "learning_rate": 5.953727001171819e-06, + "loss": 0.0, + "num_input_tokens_seen": 62179448, + "step": 107175 + }, + { + "epoch": 15.963658028001191, + "grad_norm": 2.482013587723486e-05, + "learning_rate": 5.951622341505086e-06, + "loss": 0.0, + "num_input_tokens_seen": 62182424, + "step": 107180 + }, + { + "epoch": 15.96440274054215, + "grad_norm": 3.969785393564962e-05, + "learning_rate": 5.949518003641325e-06, + "loss": 0.0, + "num_input_tokens_seen": 62185464, + "step": 107185 + }, + { + "epoch": 15.96514745308311, + "grad_norm": 0.0002781306393444538, + "learning_rate": 5.947413987616105e-06, + "loss": 0.0, + "num_input_tokens_seen": 62188248, + "step": 107190 + }, + { + "epoch": 15.96589216562407, + "grad_norm": 0.00036419174284674227, + "learning_rate": 5.94531029346495e-06, + "loss": 0.0057, + "num_input_tokens_seen": 62191032, + "step": 107195 + }, + { + "epoch": 15.966636878165028, + "grad_norm": 0.00043661563540808856, + "learning_rate": 5.943206921223421e-06, + "loss": 0.0597, + "num_input_tokens_seen": 62193944, + "step": 107200 + }, + { + "epoch": 15.967381590705987, + "grad_norm": 6.4243886299664155e-06, + "learning_rate": 5.9411038709270365e-06, + "loss": 0.0, + "num_input_tokens_seen": 62197048, + "step": 107205 + }, + { + "epoch": 15.968126303246947, + "grad_norm": 3.6670019198936643e-06, + "learning_rate": 5.939001142611336e-06, + "loss": 0.0, + "num_input_tokens_seen": 62199928, + "step": 107210 + }, + { + "epoch": 15.968871015787906, + "grad_norm": 0.0009068381041288376, + "learning_rate": 5.9368987363118415e-06, + "loss": 0.0, + "num_input_tokens_seen": 62202840, + "step": 107215 + }, + { + "epoch": 15.969615728328865, + "grad_norm": 0.00027064807363785803, + "learning_rate": 5.934796652064065e-06, + "loss": 0.0, + "num_input_tokens_seen": 62205720, + "step": 107220 + }, + { + "epoch": 15.970360440869824, + "grad_norm": 0.00039716807077638805, + "learning_rate": 5.932694889903523e-06, + "loss": 0.0, + "num_input_tokens_seen": 62208696, + "step": 107225 + }, + { + "epoch": 15.971105153410784, + "grad_norm": 3.7396471270767506e-06, + "learning_rate": 5.930593449865715e-06, + "loss": 0.0, + "num_input_tokens_seen": 62211544, + "step": 107230 + }, + { + "epoch": 15.971849865951743, + "grad_norm": 0.0016108985291793942, + "learning_rate": 5.928492331986155e-06, + "loss": 0.0, + "num_input_tokens_seen": 62214232, + "step": 107235 + }, + { + "epoch": 15.972594578492702, + "grad_norm": 0.0001960854569915682, + "learning_rate": 5.9263915363003294e-06, + "loss": 0.0, + "num_input_tokens_seen": 62217624, + "step": 107240 + }, + { + "epoch": 15.97333929103366, + "grad_norm": 7.787845788698178e-06, + "learning_rate": 5.924291062843737e-06, + "loss": 0.0, + "num_input_tokens_seen": 62220632, + "step": 107245 + }, + { + "epoch": 15.974084003574621, + "grad_norm": 5.572322152147535e-06, + "learning_rate": 5.922190911651857e-06, + "loss": 0.0, + "num_input_tokens_seen": 62223576, + "step": 107250 + }, + { + "epoch": 15.97482871611558, + "grad_norm": 8.419559162575752e-05, + "learning_rate": 5.920091082760174e-06, + "loss": 0.0, + "num_input_tokens_seen": 62226424, + "step": 107255 + }, + { + "epoch": 15.975573428656539, + "grad_norm": 3.063375473022461, + "learning_rate": 5.917991576204163e-06, + "loss": 0.0157, + "num_input_tokens_seen": 62229240, + "step": 107260 + }, + { + "epoch": 15.976318141197497, + "grad_norm": 0.00038325920468196273, + "learning_rate": 5.915892392019282e-06, + "loss": 0.0, + "num_input_tokens_seen": 62232344, + "step": 107265 + }, + { + "epoch": 15.977062853738456, + "grad_norm": 1.7913120245793834e-05, + "learning_rate": 5.913793530241011e-06, + "loss": 0.0016, + "num_input_tokens_seen": 62235736, + "step": 107270 + }, + { + "epoch": 15.977807566279417, + "grad_norm": 0.00021416816161945462, + "learning_rate": 5.91169499090479e-06, + "loss": 0.0, + "num_input_tokens_seen": 62238488, + "step": 107275 + }, + { + "epoch": 15.978552278820375, + "grad_norm": 1.7479866073699668e-05, + "learning_rate": 5.909596774046092e-06, + "loss": 0.0, + "num_input_tokens_seen": 62241368, + "step": 107280 + }, + { + "epoch": 15.979296991361334, + "grad_norm": 3.412188743823208e-05, + "learning_rate": 5.907498879700352e-06, + "loss": 0.0, + "num_input_tokens_seen": 62244280, + "step": 107285 + }, + { + "epoch": 15.980041703902295, + "grad_norm": 0.0007647209567949176, + "learning_rate": 5.905401307903013e-06, + "loss": 0.0, + "num_input_tokens_seen": 62246968, + "step": 107290 + }, + { + "epoch": 15.980786416443253, + "grad_norm": 2.625518798828125, + "learning_rate": 5.903304058689507e-06, + "loss": 0.0087, + "num_input_tokens_seen": 62249720, + "step": 107295 + }, + { + "epoch": 15.981531128984212, + "grad_norm": 1.877607974165585e-05, + "learning_rate": 5.901207132095276e-06, + "loss": 0.0, + "num_input_tokens_seen": 62252792, + "step": 107300 + }, + { + "epoch": 15.982275841525171, + "grad_norm": 0.00013172210310585797, + "learning_rate": 5.899110528155741e-06, + "loss": 0.0119, + "num_input_tokens_seen": 62255416, + "step": 107305 + }, + { + "epoch": 15.98302055406613, + "grad_norm": 0.009492629207670689, + "learning_rate": 5.897014246906312e-06, + "loss": 0.0, + "num_input_tokens_seen": 62258200, + "step": 107310 + }, + { + "epoch": 15.98376526660709, + "grad_norm": 0.0008919390966184437, + "learning_rate": 5.894918288382417e-06, + "loss": 0.0, + "num_input_tokens_seen": 62260920, + "step": 107315 + }, + { + "epoch": 15.984509979148049, + "grad_norm": 0.00015499598521273583, + "learning_rate": 5.8928226526194565e-06, + "loss": 0.0, + "num_input_tokens_seen": 62263960, + "step": 107320 + }, + { + "epoch": 15.985254691689008, + "grad_norm": 0.00010389798262622207, + "learning_rate": 5.890727339652843e-06, + "loss": 0.0, + "num_input_tokens_seen": 62266552, + "step": 107325 + }, + { + "epoch": 15.985999404229966, + "grad_norm": 0.0010054110316559672, + "learning_rate": 5.888632349517962e-06, + "loss": 0.0, + "num_input_tokens_seen": 62269624, + "step": 107330 + }, + { + "epoch": 15.986744116770927, + "grad_norm": 1.0420889339002315e-05, + "learning_rate": 5.886537682250221e-06, + "loss": 0.0, + "num_input_tokens_seen": 62272536, + "step": 107335 + }, + { + "epoch": 15.987488829311886, + "grad_norm": 0.0001082939124898985, + "learning_rate": 5.8844433378849986e-06, + "loss": 0.0, + "num_input_tokens_seen": 62275384, + "step": 107340 + }, + { + "epoch": 15.988233541852845, + "grad_norm": 0.00045185015187598765, + "learning_rate": 5.882349316457672e-06, + "loss": 0.0005, + "num_input_tokens_seen": 62278136, + "step": 107345 + }, + { + "epoch": 15.988978254393803, + "grad_norm": 0.00013462037895806134, + "learning_rate": 5.88025561800363e-06, + "loss": 0.0, + "num_input_tokens_seen": 62280792, + "step": 107350 + }, + { + "epoch": 15.989722966934764, + "grad_norm": 0.0001559978409204632, + "learning_rate": 5.878162242558239e-06, + "loss": 0.0, + "num_input_tokens_seen": 62283512, + "step": 107355 + }, + { + "epoch": 15.990467679475723, + "grad_norm": 0.009169988334178925, + "learning_rate": 5.87606919015686e-06, + "loss": 0.0, + "num_input_tokens_seen": 62286360, + "step": 107360 + }, + { + "epoch": 15.991212392016681, + "grad_norm": 2.027430309681222e-05, + "learning_rate": 5.873976460834848e-06, + "loss": 0.0, + "num_input_tokens_seen": 62289144, + "step": 107365 + }, + { + "epoch": 15.99195710455764, + "grad_norm": 1.779348349373322e-05, + "learning_rate": 5.871884054627571e-06, + "loss": 0.0, + "num_input_tokens_seen": 62291736, + "step": 107370 + }, + { + "epoch": 15.9927018170986, + "grad_norm": 0.00015708536375313997, + "learning_rate": 5.869791971570368e-06, + "loss": 0.0, + "num_input_tokens_seen": 62294904, + "step": 107375 + }, + { + "epoch": 15.99344652963956, + "grad_norm": 2.1011310309404507e-05, + "learning_rate": 5.867700211698593e-06, + "loss": 0.0, + "num_input_tokens_seen": 62297784, + "step": 107380 + }, + { + "epoch": 15.994191242180518, + "grad_norm": 0.0021780345123261213, + "learning_rate": 5.8656087750475765e-06, + "loss": 0.0107, + "num_input_tokens_seen": 62300504, + "step": 107385 + }, + { + "epoch": 15.994935954721477, + "grad_norm": 1.3456257875077426e-05, + "learning_rate": 5.863517661652645e-06, + "loss": 0.0, + "num_input_tokens_seen": 62303768, + "step": 107390 + }, + { + "epoch": 15.995680667262437, + "grad_norm": 2.7127896828460507e-05, + "learning_rate": 5.861426871549142e-06, + "loss": 0.0, + "num_input_tokens_seen": 62307032, + "step": 107395 + }, + { + "epoch": 15.996425379803396, + "grad_norm": 0.00023457789211533964, + "learning_rate": 5.859336404772372e-06, + "loss": 0.0, + "num_input_tokens_seen": 62309816, + "step": 107400 + }, + { + "epoch": 15.997170092344355, + "grad_norm": 7.298195851035416e-05, + "learning_rate": 5.857246261357666e-06, + "loss": 0.0, + "num_input_tokens_seen": 62312888, + "step": 107405 + }, + { + "epoch": 15.997914804885314, + "grad_norm": 0.00013302759907674044, + "learning_rate": 5.855156441340331e-06, + "loss": 0.0, + "num_input_tokens_seen": 62316056, + "step": 107410 + }, + { + "epoch": 15.998659517426274, + "grad_norm": 0.0018517477437853813, + "learning_rate": 5.853066944755667e-06, + "loss": 0.0, + "num_input_tokens_seen": 62318936, + "step": 107415 + }, + { + "epoch": 15.999404229967233, + "grad_norm": 5.82588545512408e-05, + "learning_rate": 5.8509777716389715e-06, + "loss": 0.0084, + "num_input_tokens_seen": 62322040, + "step": 107420 + }, + { + "epoch": 16.0, + "eval_loss": 3.1546268463134766, + "eval_runtime": 51.236, + "eval_samples_per_second": 58.24, + "eval_steps_per_second": 14.56, + "num_input_tokens_seen": 62323952, + "step": 107424 + }, + { + "epoch": 16.000148942508194, + "grad_norm": 0.000771706982050091, + "learning_rate": 5.848888922025553e-06, + "loss": 0.0, + "num_input_tokens_seen": 62324624, + "step": 107425 + }, + { + "epoch": 16.00089365504915, + "grad_norm": 0.00010065617243526503, + "learning_rate": 5.8468003959506915e-06, + "loss": 0.0, + "num_input_tokens_seen": 62327344, + "step": 107430 + }, + { + "epoch": 16.00163836759011, + "grad_norm": 2.4919569113990292e-05, + "learning_rate": 5.844712193449662e-06, + "loss": 0.0, + "num_input_tokens_seen": 62330128, + "step": 107435 + }, + { + "epoch": 16.002383080131068, + "grad_norm": 0.0020003793761134148, + "learning_rate": 5.842624314557757e-06, + "loss": 0.0, + "num_input_tokens_seen": 62332944, + "step": 107440 + }, + { + "epoch": 16.00312779267203, + "grad_norm": 3.011504077221616e-06, + "learning_rate": 5.840536759310239e-06, + "loss": 0.0, + "num_input_tokens_seen": 62335792, + "step": 107445 + }, + { + "epoch": 16.00387250521299, + "grad_norm": 0.0063158669508993626, + "learning_rate": 5.838449527742388e-06, + "loss": 0.0, + "num_input_tokens_seen": 62338736, + "step": 107450 + }, + { + "epoch": 16.004617217753946, + "grad_norm": 3.886815102305263e-05, + "learning_rate": 5.836362619889446e-06, + "loss": 0.0, + "num_input_tokens_seen": 62341616, + "step": 107455 + }, + { + "epoch": 16.005361930294907, + "grad_norm": 0.003931538667529821, + "learning_rate": 5.83427603578669e-06, + "loss": 0.0, + "num_input_tokens_seen": 62344368, + "step": 107460 + }, + { + "epoch": 16.006106642835864, + "grad_norm": 5.246542423265055e-05, + "learning_rate": 5.832189775469363e-06, + "loss": 0.0, + "num_input_tokens_seen": 62347440, + "step": 107465 + }, + { + "epoch": 16.006851355376824, + "grad_norm": 2.9398988772300072e-05, + "learning_rate": 5.8301038389727005e-06, + "loss": 0.0, + "num_input_tokens_seen": 62350736, + "step": 107470 + }, + { + "epoch": 16.007596067917785, + "grad_norm": 2.0795709133381024e-05, + "learning_rate": 5.8280182263319545e-06, + "loss": 0.0, + "num_input_tokens_seen": 62353680, + "step": 107475 + }, + { + "epoch": 16.00834078045874, + "grad_norm": 1.1333644579281099e-05, + "learning_rate": 5.825932937582357e-06, + "loss": 0.0, + "num_input_tokens_seen": 62356656, + "step": 107480 + }, + { + "epoch": 16.009085492999702, + "grad_norm": 0.0008328556432388723, + "learning_rate": 5.823847972759136e-06, + "loss": 0.0001, + "num_input_tokens_seen": 62359664, + "step": 107485 + }, + { + "epoch": 16.009830205540663, + "grad_norm": 0.00024045357713475823, + "learning_rate": 5.821763331897503e-06, + "loss": 0.0, + "num_input_tokens_seen": 62362512, + "step": 107490 + }, + { + "epoch": 16.01057491808162, + "grad_norm": 0.0001610247854841873, + "learning_rate": 5.819679015032697e-06, + "loss": 0.0, + "num_input_tokens_seen": 62365616, + "step": 107495 + }, + { + "epoch": 16.01131963062258, + "grad_norm": 53.95942687988281, + "learning_rate": 5.81759502219992e-06, + "loss": 0.1969, + "num_input_tokens_seen": 62368528, + "step": 107500 + }, + { + "epoch": 16.012064343163537, + "grad_norm": 2.6732282094599213e-06, + "learning_rate": 5.815511353434372e-06, + "loss": 0.0, + "num_input_tokens_seen": 62371504, + "step": 107505 + }, + { + "epoch": 16.012809055704498, + "grad_norm": 0.0009629964479245245, + "learning_rate": 5.813428008771266e-06, + "loss": 0.0, + "num_input_tokens_seen": 62374384, + "step": 107510 + }, + { + "epoch": 16.01355376824546, + "grad_norm": 9.135977052210364e-06, + "learning_rate": 5.811344988245787e-06, + "loss": 0.0, + "num_input_tokens_seen": 62377008, + "step": 107515 + }, + { + "epoch": 16.014298480786415, + "grad_norm": 4.702678415924311e-05, + "learning_rate": 5.809262291893141e-06, + "loss": 0.0, + "num_input_tokens_seen": 62380208, + "step": 107520 + }, + { + "epoch": 16.015043193327376, + "grad_norm": 8.73614990268834e-05, + "learning_rate": 5.807179919748496e-06, + "loss": 0.0, + "num_input_tokens_seen": 62382960, + "step": 107525 + }, + { + "epoch": 16.015787905868336, + "grad_norm": 0.00010026732343249023, + "learning_rate": 5.805097871847046e-06, + "loss": 0.0, + "num_input_tokens_seen": 62385648, + "step": 107530 + }, + { + "epoch": 16.016532618409293, + "grad_norm": 9.285534906666726e-05, + "learning_rate": 5.803016148223953e-06, + "loss": 0.0, + "num_input_tokens_seen": 62388496, + "step": 107535 + }, + { + "epoch": 16.017277330950254, + "grad_norm": 0.030785443261265755, + "learning_rate": 5.800934748914397e-06, + "loss": 0.0943, + "num_input_tokens_seen": 62391536, + "step": 107540 + }, + { + "epoch": 16.01802204349121, + "grad_norm": 0.0005910654435865581, + "learning_rate": 5.798853673953536e-06, + "loss": 0.0, + "num_input_tokens_seen": 62394320, + "step": 107545 + }, + { + "epoch": 16.01876675603217, + "grad_norm": 0.0005383475800044835, + "learning_rate": 5.796772923376526e-06, + "loss": 0.0, + "num_input_tokens_seen": 62397072, + "step": 107550 + }, + { + "epoch": 16.019511468573132, + "grad_norm": 0.00011376373004168272, + "learning_rate": 5.794692497218521e-06, + "loss": 0.2313, + "num_input_tokens_seen": 62400016, + "step": 107555 + }, + { + "epoch": 16.02025618111409, + "grad_norm": 0.000372516515199095, + "learning_rate": 5.79261239551466e-06, + "loss": 0.0, + "num_input_tokens_seen": 62402864, + "step": 107560 + }, + { + "epoch": 16.02100089365505, + "grad_norm": 0.0001414004509570077, + "learning_rate": 5.790532618300099e-06, + "loss": 0.0, + "num_input_tokens_seen": 62405968, + "step": 107565 + }, + { + "epoch": 16.02174560619601, + "grad_norm": 0.0006512126419693232, + "learning_rate": 5.788453165609955e-06, + "loss": 0.0, + "num_input_tokens_seen": 62408880, + "step": 107570 + }, + { + "epoch": 16.022490318736967, + "grad_norm": 1.2591825907293241e-05, + "learning_rate": 5.786374037479381e-06, + "loss": 0.0, + "num_input_tokens_seen": 62411952, + "step": 107575 + }, + { + "epoch": 16.023235031277927, + "grad_norm": 0.0001430023112334311, + "learning_rate": 5.784295233943488e-06, + "loss": 0.0, + "num_input_tokens_seen": 62414800, + "step": 107580 + }, + { + "epoch": 16.023979743818884, + "grad_norm": 0.00731581961736083, + "learning_rate": 5.7822167550373865e-06, + "loss": 0.0, + "num_input_tokens_seen": 62417616, + "step": 107585 + }, + { + "epoch": 16.024724456359845, + "grad_norm": 0.00012232227891217917, + "learning_rate": 5.780138600796212e-06, + "loss": 0.0079, + "num_input_tokens_seen": 62420496, + "step": 107590 + }, + { + "epoch": 16.025469168900806, + "grad_norm": 0.00014973917859606445, + "learning_rate": 5.778060771255053e-06, + "loss": 0.0, + "num_input_tokens_seen": 62423312, + "step": 107595 + }, + { + "epoch": 16.026213881441763, + "grad_norm": 0.000286496157059446, + "learning_rate": 5.775983266449029e-06, + "loss": 0.0, + "num_input_tokens_seen": 62426544, + "step": 107600 + }, + { + "epoch": 16.026958593982723, + "grad_norm": 6.429788754758192e-06, + "learning_rate": 5.773906086413222e-06, + "loss": 0.0, + "num_input_tokens_seen": 62429488, + "step": 107605 + }, + { + "epoch": 16.027703306523684, + "grad_norm": 0.00021653830481227487, + "learning_rate": 5.771829231182737e-06, + "loss": 0.0, + "num_input_tokens_seen": 62432272, + "step": 107610 + }, + { + "epoch": 16.02844801906464, + "grad_norm": 5.672957286151359e-06, + "learning_rate": 5.769752700792655e-06, + "loss": 0.0, + "num_input_tokens_seen": 62435216, + "step": 107615 + }, + { + "epoch": 16.0291927316056, + "grad_norm": 0.00015715059998910874, + "learning_rate": 5.767676495278057e-06, + "loss": 0.0, + "num_input_tokens_seen": 62438032, + "step": 107620 + }, + { + "epoch": 16.029937444146558, + "grad_norm": 0.0002603515749797225, + "learning_rate": 5.765600614674019e-06, + "loss": 0.0, + "num_input_tokens_seen": 62440848, + "step": 107625 + }, + { + "epoch": 16.03068215668752, + "grad_norm": 0.0010203932179138064, + "learning_rate": 5.763525059015601e-06, + "loss": 0.0, + "num_input_tokens_seen": 62443952, + "step": 107630 + }, + { + "epoch": 16.03142686922848, + "grad_norm": 5.826857432111865e-06, + "learning_rate": 5.761449828337881e-06, + "loss": 0.0, + "num_input_tokens_seen": 62446960, + "step": 107635 + }, + { + "epoch": 16.032171581769436, + "grad_norm": 0.0002921857812907547, + "learning_rate": 5.759374922675908e-06, + "loss": 0.0, + "num_input_tokens_seen": 62449584, + "step": 107640 + }, + { + "epoch": 16.032916294310397, + "grad_norm": 0.00017363516963087022, + "learning_rate": 5.757300342064748e-06, + "loss": 0.0, + "num_input_tokens_seen": 62452432, + "step": 107645 + }, + { + "epoch": 16.033661006851354, + "grad_norm": 0.003150119911879301, + "learning_rate": 5.755226086539433e-06, + "loss": 0.0, + "num_input_tokens_seen": 62455664, + "step": 107650 + }, + { + "epoch": 16.034405719392314, + "grad_norm": 0.010357450693845749, + "learning_rate": 5.753152156135022e-06, + "loss": 0.0, + "num_input_tokens_seen": 62458640, + "step": 107655 + }, + { + "epoch": 16.035150431933275, + "grad_norm": 0.0002333179145352915, + "learning_rate": 5.751078550886543e-06, + "loss": 0.0, + "num_input_tokens_seen": 62461680, + "step": 107660 + }, + { + "epoch": 16.03589514447423, + "grad_norm": 0.00026755890576168895, + "learning_rate": 5.749005270829022e-06, + "loss": 0.0, + "num_input_tokens_seen": 62464816, + "step": 107665 + }, + { + "epoch": 16.036639857015192, + "grad_norm": 0.0006039303261786699, + "learning_rate": 5.746932315997497e-06, + "loss": 0.0, + "num_input_tokens_seen": 62467376, + "step": 107670 + }, + { + "epoch": 16.037384569556153, + "grad_norm": 7.516824553022161e-06, + "learning_rate": 5.744859686426976e-06, + "loss": 0.0, + "num_input_tokens_seen": 62470288, + "step": 107675 + }, + { + "epoch": 16.03812928209711, + "grad_norm": 1.2306738426559605e-05, + "learning_rate": 5.742787382152489e-06, + "loss": 0.0, + "num_input_tokens_seen": 62473520, + "step": 107680 + }, + { + "epoch": 16.03887399463807, + "grad_norm": 7.154505510698073e-06, + "learning_rate": 5.740715403209035e-06, + "loss": 0.0, + "num_input_tokens_seen": 62476560, + "step": 107685 + }, + { + "epoch": 16.039618707179027, + "grad_norm": 3.6873698263661936e-05, + "learning_rate": 5.738643749631623e-06, + "loss": 0.0, + "num_input_tokens_seen": 62479600, + "step": 107690 + }, + { + "epoch": 16.040363419719988, + "grad_norm": 0.0002686776570044458, + "learning_rate": 5.736572421455239e-06, + "loss": 0.0, + "num_input_tokens_seen": 62482416, + "step": 107695 + }, + { + "epoch": 16.04110813226095, + "grad_norm": 0.0001525879488326609, + "learning_rate": 5.734501418714891e-06, + "loss": 0.0, + "num_input_tokens_seen": 62485360, + "step": 107700 + }, + { + "epoch": 16.041852844801905, + "grad_norm": 0.00012284936383366585, + "learning_rate": 5.732430741445563e-06, + "loss": 0.0001, + "num_input_tokens_seen": 62488208, + "step": 107705 + }, + { + "epoch": 16.042597557342866, + "grad_norm": 5.6709621276240796e-05, + "learning_rate": 5.730360389682227e-06, + "loss": 0.0, + "num_input_tokens_seen": 62491184, + "step": 107710 + }, + { + "epoch": 16.043342269883826, + "grad_norm": 3.458695573499426e-05, + "learning_rate": 5.728290363459876e-06, + "loss": 0.0, + "num_input_tokens_seen": 62493776, + "step": 107715 + }, + { + "epoch": 16.044086982424783, + "grad_norm": 2.162390956073068e-05, + "learning_rate": 5.726220662813464e-06, + "loss": 0.0, + "num_input_tokens_seen": 62496624, + "step": 107720 + }, + { + "epoch": 16.044831694965744, + "grad_norm": 4.8833262553671375e-05, + "learning_rate": 5.72415128777797e-06, + "loss": 0.0001, + "num_input_tokens_seen": 62499376, + "step": 107725 + }, + { + "epoch": 16.0455764075067, + "grad_norm": 8.484912541462108e-06, + "learning_rate": 5.722082238388346e-06, + "loss": 0.0, + "num_input_tokens_seen": 62502320, + "step": 107730 + }, + { + "epoch": 16.04632112004766, + "grad_norm": 6.341856988001382e-06, + "learning_rate": 5.720013514679553e-06, + "loss": 0.0, + "num_input_tokens_seen": 62505456, + "step": 107735 + }, + { + "epoch": 16.047065832588622, + "grad_norm": 0.002993097295984626, + "learning_rate": 5.71794511668654e-06, + "loss": 0.0, + "num_input_tokens_seen": 62508528, + "step": 107740 + }, + { + "epoch": 16.04781054512958, + "grad_norm": 1.160868941951776e-05, + "learning_rate": 5.7158770444442425e-06, + "loss": 0.0, + "num_input_tokens_seen": 62511440, + "step": 107745 + }, + { + "epoch": 16.04855525767054, + "grad_norm": 2.14169795071939e-05, + "learning_rate": 5.713809297987599e-06, + "loss": 0.0, + "num_input_tokens_seen": 62514224, + "step": 107750 + }, + { + "epoch": 16.0492999702115, + "grad_norm": 5.3674924856750295e-05, + "learning_rate": 5.7117418773515535e-06, + "loss": 0.0, + "num_input_tokens_seen": 62517104, + "step": 107755 + }, + { + "epoch": 16.050044682752457, + "grad_norm": 0.02107328735291958, + "learning_rate": 5.709674782571023e-06, + "loss": 0.0, + "num_input_tokens_seen": 62520016, + "step": 107760 + }, + { + "epoch": 16.050789395293418, + "grad_norm": 0.004861581139266491, + "learning_rate": 5.707608013680923e-06, + "loss": 0.0, + "num_input_tokens_seen": 62522864, + "step": 107765 + }, + { + "epoch": 16.051534107834375, + "grad_norm": 2.5439843739150092e-05, + "learning_rate": 5.705541570716189e-06, + "loss": 0.0, + "num_input_tokens_seen": 62525968, + "step": 107770 + }, + { + "epoch": 16.052278820375335, + "grad_norm": 0.00038001654320396483, + "learning_rate": 5.70347545371171e-06, + "loss": 0.0, + "num_input_tokens_seen": 62528848, + "step": 107775 + }, + { + "epoch": 16.053023532916296, + "grad_norm": 5.843912367708981e-05, + "learning_rate": 5.701409662702409e-06, + "loss": 0.0002, + "num_input_tokens_seen": 62531824, + "step": 107780 + }, + { + "epoch": 16.053768245457253, + "grad_norm": 4.496652036323212e-05, + "learning_rate": 5.699344197723178e-06, + "loss": 0.0, + "num_input_tokens_seen": 62534672, + "step": 107785 + }, + { + "epoch": 16.054512957998213, + "grad_norm": 0.00013748292985837907, + "learning_rate": 5.697279058808902e-06, + "loss": 0.0, + "num_input_tokens_seen": 62537520, + "step": 107790 + }, + { + "epoch": 16.055257670539174, + "grad_norm": 0.002715202746912837, + "learning_rate": 5.6952142459944845e-06, + "loss": 0.0, + "num_input_tokens_seen": 62540272, + "step": 107795 + }, + { + "epoch": 16.05600238308013, + "grad_norm": 0.0011391524458304048, + "learning_rate": 5.693149759314798e-06, + "loss": 0.0007, + "num_input_tokens_seen": 62543056, + "step": 107800 + }, + { + "epoch": 16.05674709562109, + "grad_norm": 2.804723771987483e-05, + "learning_rate": 5.691085598804727e-06, + "loss": 0.0, + "num_input_tokens_seen": 62545968, + "step": 107805 + }, + { + "epoch": 16.057491808162048, + "grad_norm": 0.0003788807662203908, + "learning_rate": 5.689021764499142e-06, + "loss": 0.0, + "num_input_tokens_seen": 62548528, + "step": 107810 + }, + { + "epoch": 16.05823652070301, + "grad_norm": 0.015407836996018887, + "learning_rate": 5.6869582564329085e-06, + "loss": 0.0, + "num_input_tokens_seen": 62551312, + "step": 107815 + }, + { + "epoch": 16.05898123324397, + "grad_norm": 7.790823292452842e-05, + "learning_rate": 5.684895074640884e-06, + "loss": 0.0, + "num_input_tokens_seen": 62554448, + "step": 107820 + }, + { + "epoch": 16.059725945784926, + "grad_norm": 4.459934280021116e-05, + "learning_rate": 5.682832219157922e-06, + "loss": 0.0003, + "num_input_tokens_seen": 62557424, + "step": 107825 + }, + { + "epoch": 16.060470658325887, + "grad_norm": 0.16174893081188202, + "learning_rate": 5.68076969001888e-06, + "loss": 0.0001, + "num_input_tokens_seen": 62560720, + "step": 107830 + }, + { + "epoch": 16.061215370866844, + "grad_norm": 0.0001280895376112312, + "learning_rate": 5.678707487258594e-06, + "loss": 0.0, + "num_input_tokens_seen": 62563952, + "step": 107835 + }, + { + "epoch": 16.061960083407804, + "grad_norm": 6.5106228248623665e-06, + "learning_rate": 5.676645610911916e-06, + "loss": 0.0, + "num_input_tokens_seen": 62567152, + "step": 107840 + }, + { + "epoch": 16.062704795948765, + "grad_norm": 1.6516680261702277e-05, + "learning_rate": 5.674584061013663e-06, + "loss": 0.0, + "num_input_tokens_seen": 62570000, + "step": 107845 + }, + { + "epoch": 16.06344950848972, + "grad_norm": 0.00016254613001365215, + "learning_rate": 5.672522837598676e-06, + "loss": 0.0, + "num_input_tokens_seen": 62572848, + "step": 107850 + }, + { + "epoch": 16.064194221030682, + "grad_norm": 0.0002658934681676328, + "learning_rate": 5.670461940701768e-06, + "loss": 0.0, + "num_input_tokens_seen": 62575664, + "step": 107855 + }, + { + "epoch": 16.064938933571643, + "grad_norm": 5.983500159345567e-06, + "learning_rate": 5.668401370357765e-06, + "loss": 0.0, + "num_input_tokens_seen": 62578608, + "step": 107860 + }, + { + "epoch": 16.0656836461126, + "grad_norm": 3.198977356078103e-05, + "learning_rate": 5.666341126601474e-06, + "loss": 0.0, + "num_input_tokens_seen": 62581424, + "step": 107865 + }, + { + "epoch": 16.06642835865356, + "grad_norm": 0.0005227028159424663, + "learning_rate": 5.664281209467692e-06, + "loss": 0.0, + "num_input_tokens_seen": 62584240, + "step": 107870 + }, + { + "epoch": 16.067173071194517, + "grad_norm": 9.840016900852788e-06, + "learning_rate": 5.662221618991234e-06, + "loss": 0.0, + "num_input_tokens_seen": 62587216, + "step": 107875 + }, + { + "epoch": 16.067917783735478, + "grad_norm": 3.921075403923169e-05, + "learning_rate": 5.660162355206888e-06, + "loss": 0.0001, + "num_input_tokens_seen": 62590160, + "step": 107880 + }, + { + "epoch": 16.06866249627644, + "grad_norm": 8.67434027895797e-06, + "learning_rate": 5.658103418149443e-06, + "loss": 0.0, + "num_input_tokens_seen": 62592848, + "step": 107885 + }, + { + "epoch": 16.069407208817395, + "grad_norm": 8.277953747892752e-05, + "learning_rate": 5.656044807853675e-06, + "loss": 0.0, + "num_input_tokens_seen": 62595728, + "step": 107890 + }, + { + "epoch": 16.070151921358356, + "grad_norm": 5.260078251012601e-05, + "learning_rate": 5.653986524354377e-06, + "loss": 0.0, + "num_input_tokens_seen": 62598512, + "step": 107895 + }, + { + "epoch": 16.070896633899316, + "grad_norm": 0.0007323191384784877, + "learning_rate": 5.651928567686307e-06, + "loss": 0.002, + "num_input_tokens_seen": 62601232, + "step": 107900 + }, + { + "epoch": 16.071641346440273, + "grad_norm": 5.094725111121079e-06, + "learning_rate": 5.649870937884247e-06, + "loss": 0.0, + "num_input_tokens_seen": 62604080, + "step": 107905 + }, + { + "epoch": 16.072386058981234, + "grad_norm": 2.2398378860088997e-05, + "learning_rate": 5.647813634982952e-06, + "loss": 0.0, + "num_input_tokens_seen": 62606992, + "step": 107910 + }, + { + "epoch": 16.07313077152219, + "grad_norm": 4.589179297909141e-06, + "learning_rate": 5.6457566590171675e-06, + "loss": 0.0, + "num_input_tokens_seen": 62609616, + "step": 107915 + }, + { + "epoch": 16.07387548406315, + "grad_norm": 1.2133633390476461e-05, + "learning_rate": 5.64370001002166e-06, + "loss": 0.0, + "num_input_tokens_seen": 62612368, + "step": 107920 + }, + { + "epoch": 16.074620196604112, + "grad_norm": 7.835312317183707e-06, + "learning_rate": 5.641643688031162e-06, + "loss": 0.0, + "num_input_tokens_seen": 62615184, + "step": 107925 + }, + { + "epoch": 16.07536490914507, + "grad_norm": 0.00028969947015866637, + "learning_rate": 5.639587693080428e-06, + "loss": 0.0, + "num_input_tokens_seen": 62618192, + "step": 107930 + }, + { + "epoch": 16.07610962168603, + "grad_norm": 0.0001943073730217293, + "learning_rate": 5.637532025204173e-06, + "loss": 0.0, + "num_input_tokens_seen": 62621168, + "step": 107935 + }, + { + "epoch": 16.07685433422699, + "grad_norm": 6.372146799549228e-06, + "learning_rate": 5.635476684437144e-06, + "loss": 0.0, + "num_input_tokens_seen": 62623856, + "step": 107940 + }, + { + "epoch": 16.077599046767947, + "grad_norm": 4.341918156569591e-06, + "learning_rate": 5.633421670814054e-06, + "loss": 0.0, + "num_input_tokens_seen": 62626864, + "step": 107945 + }, + { + "epoch": 16.078343759308908, + "grad_norm": 2.8992901206947863e-05, + "learning_rate": 5.631366984369624e-06, + "loss": 0.016, + "num_input_tokens_seen": 62629840, + "step": 107950 + }, + { + "epoch": 16.079088471849865, + "grad_norm": 1.4906333490216639e-05, + "learning_rate": 5.629312625138561e-06, + "loss": 0.0, + "num_input_tokens_seen": 62632848, + "step": 107955 + }, + { + "epoch": 16.079833184390825, + "grad_norm": 4.852224265050609e-06, + "learning_rate": 5.627258593155568e-06, + "loss": 0.0, + "num_input_tokens_seen": 62636080, + "step": 107960 + }, + { + "epoch": 16.080577896931786, + "grad_norm": 0.00043124216608703136, + "learning_rate": 5.625204888455357e-06, + "loss": 0.0, + "num_input_tokens_seen": 62639056, + "step": 107965 + }, + { + "epoch": 16.081322609472743, + "grad_norm": 0.00016239300020970404, + "learning_rate": 5.623151511072613e-06, + "loss": 0.0, + "num_input_tokens_seen": 62641840, + "step": 107970 + }, + { + "epoch": 16.082067322013703, + "grad_norm": 0.0006411616923287511, + "learning_rate": 5.6210984610420345e-06, + "loss": 0.0, + "num_input_tokens_seen": 62644720, + "step": 107975 + }, + { + "epoch": 16.082812034554664, + "grad_norm": 2.2149721189634874e-05, + "learning_rate": 5.619045738398299e-06, + "loss": 0.0, + "num_input_tokens_seen": 62647856, + "step": 107980 + }, + { + "epoch": 16.08355674709562, + "grad_norm": 0.00020082340051885694, + "learning_rate": 5.616993343176091e-06, + "loss": 0.0001, + "num_input_tokens_seen": 62650736, + "step": 107985 + }, + { + "epoch": 16.08430145963658, + "grad_norm": 8.049592725001276e-05, + "learning_rate": 5.614941275410082e-06, + "loss": 0.0, + "num_input_tokens_seen": 62653872, + "step": 107990 + }, + { + "epoch": 16.085046172177538, + "grad_norm": 3.835295865428634e-05, + "learning_rate": 5.61288953513493e-06, + "loss": 0.0, + "num_input_tokens_seen": 62656912, + "step": 107995 + }, + { + "epoch": 16.0857908847185, + "grad_norm": 2.1057421690784395e-05, + "learning_rate": 5.610838122385312e-06, + "loss": 0.0, + "num_input_tokens_seen": 62659824, + "step": 108000 + }, + { + "epoch": 16.08653559725946, + "grad_norm": 4.9028789362637326e-05, + "learning_rate": 5.608787037195873e-06, + "loss": 0.0, + "num_input_tokens_seen": 62662352, + "step": 108005 + }, + { + "epoch": 16.087280309800416, + "grad_norm": 1.7548234609421343e-05, + "learning_rate": 5.606736279601274e-06, + "loss": 0.0, + "num_input_tokens_seen": 62665136, + "step": 108010 + }, + { + "epoch": 16.088025022341377, + "grad_norm": 1.3877943274565041e-05, + "learning_rate": 5.6046858496361545e-06, + "loss": 0.0, + "num_input_tokens_seen": 62668112, + "step": 108015 + }, + { + "epoch": 16.088769734882334, + "grad_norm": 0.00015632028225809336, + "learning_rate": 5.602635747335155e-06, + "loss": 0.0, + "num_input_tokens_seen": 62670736, + "step": 108020 + }, + { + "epoch": 16.089514447423294, + "grad_norm": 2.6839968995773233e-05, + "learning_rate": 5.600585972732911e-06, + "loss": 0.0, + "num_input_tokens_seen": 62673584, + "step": 108025 + }, + { + "epoch": 16.090259159964255, + "grad_norm": 0.00017968873726204038, + "learning_rate": 5.598536525864042e-06, + "loss": 0.0, + "num_input_tokens_seen": 62676432, + "step": 108030 + }, + { + "epoch": 16.091003872505212, + "grad_norm": 0.011508760042488575, + "learning_rate": 5.59648740676319e-06, + "loss": 0.0, + "num_input_tokens_seen": 62679312, + "step": 108035 + }, + { + "epoch": 16.091748585046172, + "grad_norm": 4.906874892185442e-05, + "learning_rate": 5.594438615464953e-06, + "loss": 0.0, + "num_input_tokens_seen": 62682352, + "step": 108040 + }, + { + "epoch": 16.092493297587133, + "grad_norm": 0.0006284167757257819, + "learning_rate": 5.59239015200396e-06, + "loss": 0.0, + "num_input_tokens_seen": 62685424, + "step": 108045 + }, + { + "epoch": 16.09323801012809, + "grad_norm": 0.000911862007342279, + "learning_rate": 5.590342016414801e-06, + "loss": 0.0, + "num_input_tokens_seen": 62688432, + "step": 108050 + }, + { + "epoch": 16.09398272266905, + "grad_norm": 0.013305106200277805, + "learning_rate": 5.588294208732098e-06, + "loss": 0.0, + "num_input_tokens_seen": 62691568, + "step": 108055 + }, + { + "epoch": 16.094727435210007, + "grad_norm": 6.4322189246013295e-06, + "learning_rate": 5.586246728990424e-06, + "loss": 0.0, + "num_input_tokens_seen": 62694288, + "step": 108060 + }, + { + "epoch": 16.095472147750968, + "grad_norm": 1.0399026905361097e-05, + "learning_rate": 5.584199577224389e-06, + "loss": 0.0, + "num_input_tokens_seen": 62696944, + "step": 108065 + }, + { + "epoch": 16.09621686029193, + "grad_norm": 1.4255280802899506e-05, + "learning_rate": 5.58215275346857e-06, + "loss": 0.0, + "num_input_tokens_seen": 62699792, + "step": 108070 + }, + { + "epoch": 16.096961572832885, + "grad_norm": 0.0001283145829802379, + "learning_rate": 5.580106257757542e-06, + "loss": 0.0, + "num_input_tokens_seen": 62702704, + "step": 108075 + }, + { + "epoch": 16.097706285373846, + "grad_norm": 7.721006841165945e-05, + "learning_rate": 5.5780600901258774e-06, + "loss": 0.0, + "num_input_tokens_seen": 62705520, + "step": 108080 + }, + { + "epoch": 16.098450997914806, + "grad_norm": 0.0002464821736793965, + "learning_rate": 5.576014250608152e-06, + "loss": 0.0616, + "num_input_tokens_seen": 62708400, + "step": 108085 + }, + { + "epoch": 16.099195710455763, + "grad_norm": 0.004516345914453268, + "learning_rate": 5.573968739238927e-06, + "loss": 0.0, + "num_input_tokens_seen": 62711376, + "step": 108090 + }, + { + "epoch": 16.099940422996724, + "grad_norm": 0.31148403882980347, + "learning_rate": 5.571923556052749e-06, + "loss": 0.0011, + "num_input_tokens_seen": 62714384, + "step": 108095 + }, + { + "epoch": 16.10068513553768, + "grad_norm": 0.14877372980117798, + "learning_rate": 5.569878701084183e-06, + "loss": 0.0003, + "num_input_tokens_seen": 62717008, + "step": 108100 + }, + { + "epoch": 16.10142984807864, + "grad_norm": 3.60292979166843e-05, + "learning_rate": 5.567834174367767e-06, + "loss": 0.0, + "num_input_tokens_seen": 62720112, + "step": 108105 + }, + { + "epoch": 16.102174560619602, + "grad_norm": 3.2965247100946726e-06, + "learning_rate": 5.565789975938038e-06, + "loss": 0.0, + "num_input_tokens_seen": 62723152, + "step": 108110 + }, + { + "epoch": 16.10291927316056, + "grad_norm": 0.00011510089098010212, + "learning_rate": 5.56374610582954e-06, + "loss": 0.0, + "num_input_tokens_seen": 62726128, + "step": 108115 + }, + { + "epoch": 16.10366398570152, + "grad_norm": 0.0009205689420923591, + "learning_rate": 5.561702564076793e-06, + "loss": 0.0, + "num_input_tokens_seen": 62729072, + "step": 108120 + }, + { + "epoch": 16.10440869824248, + "grad_norm": 0.0005994322127662599, + "learning_rate": 5.5596593507143304e-06, + "loss": 0.0, + "num_input_tokens_seen": 62731632, + "step": 108125 + }, + { + "epoch": 16.105153410783437, + "grad_norm": 4.520616494119167e-05, + "learning_rate": 5.557616465776658e-06, + "loss": 0.0, + "num_input_tokens_seen": 62734384, + "step": 108130 + }, + { + "epoch": 16.105898123324398, + "grad_norm": 9.343570127384737e-05, + "learning_rate": 5.5555739092983e-06, + "loss": 0.0, + "num_input_tokens_seen": 62737264, + "step": 108135 + }, + { + "epoch": 16.106642835865355, + "grad_norm": 0.00010175842908211052, + "learning_rate": 5.553531681313762e-06, + "loss": 0.0, + "num_input_tokens_seen": 62740016, + "step": 108140 + }, + { + "epoch": 16.107387548406315, + "grad_norm": 6.015510734869167e-05, + "learning_rate": 5.5514897818575415e-06, + "loss": 0.0, + "num_input_tokens_seen": 62742928, + "step": 108145 + }, + { + "epoch": 16.108132260947276, + "grad_norm": 5.396161213866435e-06, + "learning_rate": 5.549448210964131e-06, + "loss": 0.0, + "num_input_tokens_seen": 62745648, + "step": 108150 + }, + { + "epoch": 16.108876973488233, + "grad_norm": 3.803854724537814e-06, + "learning_rate": 5.5474069686680205e-06, + "loss": 0.0, + "num_input_tokens_seen": 62748848, + "step": 108155 + }, + { + "epoch": 16.109621686029193, + "grad_norm": 8.531688945367932e-05, + "learning_rate": 5.545366055003706e-06, + "loss": 0.0, + "num_input_tokens_seen": 62751760, + "step": 108160 + }, + { + "epoch": 16.11036639857015, + "grad_norm": 3.733084213308757e-06, + "learning_rate": 5.54332547000565e-06, + "loss": 0.0, + "num_input_tokens_seen": 62754928, + "step": 108165 + }, + { + "epoch": 16.11111111111111, + "grad_norm": 0.01754833199083805, + "learning_rate": 5.541285213708342e-06, + "loss": 0.0, + "num_input_tokens_seen": 62757744, + "step": 108170 + }, + { + "epoch": 16.11185582365207, + "grad_norm": 2.763870725175366e-06, + "learning_rate": 5.539245286146238e-06, + "loss": 0.0, + "num_input_tokens_seen": 62760752, + "step": 108175 + }, + { + "epoch": 16.11260053619303, + "grad_norm": 6.5181807258340996e-06, + "learning_rate": 5.537205687353813e-06, + "loss": 0.0, + "num_input_tokens_seen": 62763824, + "step": 108180 + }, + { + "epoch": 16.11334524873399, + "grad_norm": 1.8238864868180826e-05, + "learning_rate": 5.535166417365517e-06, + "loss": 0.0, + "num_input_tokens_seen": 62766448, + "step": 108185 + }, + { + "epoch": 16.11408996127495, + "grad_norm": 2.1595149519271217e-05, + "learning_rate": 5.533127476215791e-06, + "loss": 0.0, + "num_input_tokens_seen": 62769328, + "step": 108190 + }, + { + "epoch": 16.114834673815906, + "grad_norm": 6.528235098812729e-05, + "learning_rate": 5.531088863939101e-06, + "loss": 0.0, + "num_input_tokens_seen": 62772144, + "step": 108195 + }, + { + "epoch": 16.115579386356867, + "grad_norm": 0.1943584531545639, + "learning_rate": 5.529050580569869e-06, + "loss": 0.0002, + "num_input_tokens_seen": 62774992, + "step": 108200 + }, + { + "epoch": 16.116324098897824, + "grad_norm": 0.00010779897274915129, + "learning_rate": 5.527012626142547e-06, + "loss": 0.0, + "num_input_tokens_seen": 62777904, + "step": 108205 + }, + { + "epoch": 16.117068811438784, + "grad_norm": 9.818642865866423e-05, + "learning_rate": 5.524975000691554e-06, + "loss": 0.0, + "num_input_tokens_seen": 62780720, + "step": 108210 + }, + { + "epoch": 16.117813523979745, + "grad_norm": 1.89738639164716e-05, + "learning_rate": 5.522937704251316e-06, + "loss": 0.0, + "num_input_tokens_seen": 62783536, + "step": 108215 + }, + { + "epoch": 16.118558236520702, + "grad_norm": 1.2489221262512729e-05, + "learning_rate": 5.520900736856241e-06, + "loss": 0.0061, + "num_input_tokens_seen": 62786384, + "step": 108220 + }, + { + "epoch": 16.119302949061662, + "grad_norm": 3.7972604332026094e-05, + "learning_rate": 5.5188640985407575e-06, + "loss": 0.0, + "num_input_tokens_seen": 62789136, + "step": 108225 + }, + { + "epoch": 16.120047661602623, + "grad_norm": 4.400965917739086e-05, + "learning_rate": 5.516827789339266e-06, + "loss": 0.0, + "num_input_tokens_seen": 62792112, + "step": 108230 + }, + { + "epoch": 16.12079237414358, + "grad_norm": 7.39610732125584e-06, + "learning_rate": 5.51479180928616e-06, + "loss": 0.0307, + "num_input_tokens_seen": 62795152, + "step": 108235 + }, + { + "epoch": 16.12153708668454, + "grad_norm": 3.653008388937451e-05, + "learning_rate": 5.5127561584158495e-06, + "loss": 0.0, + "num_input_tokens_seen": 62798160, + "step": 108240 + }, + { + "epoch": 16.122281799225497, + "grad_norm": 2.726312231970951e-06, + "learning_rate": 5.510720836762712e-06, + "loss": 0.0, + "num_input_tokens_seen": 62801008, + "step": 108245 + }, + { + "epoch": 16.123026511766458, + "grad_norm": 0.0011482967529445887, + "learning_rate": 5.508685844361142e-06, + "loss": 0.0, + "num_input_tokens_seen": 62804080, + "step": 108250 + }, + { + "epoch": 16.12377122430742, + "grad_norm": 7.591134362883167e-06, + "learning_rate": 5.506651181245509e-06, + "loss": 0.0, + "num_input_tokens_seen": 62807120, + "step": 108255 + }, + { + "epoch": 16.124515936848375, + "grad_norm": 5.609005256701494e-06, + "learning_rate": 5.5046168474502e-06, + "loss": 0.0, + "num_input_tokens_seen": 62810352, + "step": 108260 + }, + { + "epoch": 16.125260649389336, + "grad_norm": 4.41349538959912e-06, + "learning_rate": 5.502582843009577e-06, + "loss": 0.0, + "num_input_tokens_seen": 62813232, + "step": 108265 + }, + { + "epoch": 16.126005361930297, + "grad_norm": 0.00011419133079471067, + "learning_rate": 5.500549167957989e-06, + "loss": 0.0, + "num_input_tokens_seen": 62816112, + "step": 108270 + }, + { + "epoch": 16.126750074471254, + "grad_norm": 8.704409992787987e-06, + "learning_rate": 5.498515822329814e-06, + "loss": 0.0, + "num_input_tokens_seen": 62819056, + "step": 108275 + }, + { + "epoch": 16.127494787012214, + "grad_norm": 0.002162587596103549, + "learning_rate": 5.496482806159395e-06, + "loss": 0.0, + "num_input_tokens_seen": 62821808, + "step": 108280 + }, + { + "epoch": 16.12823949955317, + "grad_norm": 0.00015505767078138888, + "learning_rate": 5.494450119481073e-06, + "loss": 0.0, + "num_input_tokens_seen": 62824592, + "step": 108285 + }, + { + "epoch": 16.12898421209413, + "grad_norm": 0.006083234678953886, + "learning_rate": 5.492417762329188e-06, + "loss": 0.0, + "num_input_tokens_seen": 62827376, + "step": 108290 + }, + { + "epoch": 16.129728924635092, + "grad_norm": 6.203366410773015e-06, + "learning_rate": 5.490385734738082e-06, + "loss": 0.0, + "num_input_tokens_seen": 62830256, + "step": 108295 + }, + { + "epoch": 16.13047363717605, + "grad_norm": 5.917901489738142e-06, + "learning_rate": 5.4883540367420775e-06, + "loss": 0.0, + "num_input_tokens_seen": 62833008, + "step": 108300 + }, + { + "epoch": 16.13121834971701, + "grad_norm": 2.5859018933260813e-05, + "learning_rate": 5.486322668375504e-06, + "loss": 0.0, + "num_input_tokens_seen": 62835824, + "step": 108305 + }, + { + "epoch": 16.13196306225797, + "grad_norm": 0.4027590751647949, + "learning_rate": 5.484291629672677e-06, + "loss": 0.0001, + "num_input_tokens_seen": 62839024, + "step": 108310 + }, + { + "epoch": 16.132707774798927, + "grad_norm": 0.0006264076218940318, + "learning_rate": 5.482260920667903e-06, + "loss": 0.0, + "num_input_tokens_seen": 62842160, + "step": 108315 + }, + { + "epoch": 16.133452487339888, + "grad_norm": 0.0012258775532245636, + "learning_rate": 5.480230541395501e-06, + "loss": 0.0, + "num_input_tokens_seen": 62845040, + "step": 108320 + }, + { + "epoch": 16.134197199880845, + "grad_norm": 8.409449037571903e-06, + "learning_rate": 5.478200491889754e-06, + "loss": 0.0, + "num_input_tokens_seen": 62847824, + "step": 108325 + }, + { + "epoch": 16.134941912421805, + "grad_norm": 4.9506284995004535e-05, + "learning_rate": 5.47617077218498e-06, + "loss": 0.0, + "num_input_tokens_seen": 62850736, + "step": 108330 + }, + { + "epoch": 16.135686624962766, + "grad_norm": 0.00397720979526639, + "learning_rate": 5.474141382315448e-06, + "loss": 0.0, + "num_input_tokens_seen": 62853744, + "step": 108335 + }, + { + "epoch": 16.136431337503723, + "grad_norm": 4.641271516447887e-05, + "learning_rate": 5.472112322315459e-06, + "loss": 0.0, + "num_input_tokens_seen": 62856464, + "step": 108340 + }, + { + "epoch": 16.137176050044683, + "grad_norm": 0.0008767091203480959, + "learning_rate": 5.4700835922192885e-06, + "loss": 0.0, + "num_input_tokens_seen": 62859344, + "step": 108345 + }, + { + "epoch": 16.13792076258564, + "grad_norm": 2.450990359648131e-05, + "learning_rate": 5.468055192061203e-06, + "loss": 0.0, + "num_input_tokens_seen": 62862672, + "step": 108350 + }, + { + "epoch": 16.1386654751266, + "grad_norm": 0.00021588554955087602, + "learning_rate": 5.466027121875475e-06, + "loss": 0.0338, + "num_input_tokens_seen": 62865808, + "step": 108355 + }, + { + "epoch": 16.13941018766756, + "grad_norm": 0.00329815736040473, + "learning_rate": 5.463999381696358e-06, + "loss": 0.0, + "num_input_tokens_seen": 62868656, + "step": 108360 + }, + { + "epoch": 16.14015490020852, + "grad_norm": 5.536314347409643e-06, + "learning_rate": 5.4619719715581215e-06, + "loss": 0.0, + "num_input_tokens_seen": 62871664, + "step": 108365 + }, + { + "epoch": 16.14089961274948, + "grad_norm": 2.3140858047554502e-06, + "learning_rate": 5.4599448914950055e-06, + "loss": 0.0, + "num_input_tokens_seen": 62874704, + "step": 108370 + }, + { + "epoch": 16.14164432529044, + "grad_norm": 3.739876774488948e-05, + "learning_rate": 5.457918141541268e-06, + "loss": 0.4281, + "num_input_tokens_seen": 62877808, + "step": 108375 + }, + { + "epoch": 16.142389037831396, + "grad_norm": 4.3802301661344245e-05, + "learning_rate": 5.455891721731135e-06, + "loss": 0.0, + "num_input_tokens_seen": 62880656, + "step": 108380 + }, + { + "epoch": 16.143133750372357, + "grad_norm": 1.8255848772241734e-05, + "learning_rate": 5.453865632098853e-06, + "loss": 0.0, + "num_input_tokens_seen": 62883664, + "step": 108385 + }, + { + "epoch": 16.143878462913314, + "grad_norm": 2.7042447982239537e-05, + "learning_rate": 5.451839872678646e-06, + "loss": 0.0, + "num_input_tokens_seen": 62886608, + "step": 108390 + }, + { + "epoch": 16.144623175454274, + "grad_norm": 0.0109030120074749, + "learning_rate": 5.449814443504731e-06, + "loss": 0.0, + "num_input_tokens_seen": 62889488, + "step": 108395 + }, + { + "epoch": 16.145367887995235, + "grad_norm": 2.331891118956264e-05, + "learning_rate": 5.447789344611337e-06, + "loss": 0.0, + "num_input_tokens_seen": 62892464, + "step": 108400 + }, + { + "epoch": 16.146112600536192, + "grad_norm": 2.368536479480099e-05, + "learning_rate": 5.445764576032672e-06, + "loss": 0.0, + "num_input_tokens_seen": 62895408, + "step": 108405 + }, + { + "epoch": 16.146857313077152, + "grad_norm": 2.549021246522898e-06, + "learning_rate": 5.44374013780293e-06, + "loss": 0.0, + "num_input_tokens_seen": 62898512, + "step": 108410 + }, + { + "epoch": 16.147602025618113, + "grad_norm": 5.2008712373208255e-05, + "learning_rate": 5.441716029956331e-06, + "loss": 0.0, + "num_input_tokens_seen": 62902256, + "step": 108415 + }, + { + "epoch": 16.14834673815907, + "grad_norm": 2.1686948457499966e-05, + "learning_rate": 5.439692252527062e-06, + "loss": 0.0, + "num_input_tokens_seen": 62904944, + "step": 108420 + }, + { + "epoch": 16.14909145070003, + "grad_norm": 9.79972755885683e-05, + "learning_rate": 5.437668805549312e-06, + "loss": 0.0006, + "num_input_tokens_seen": 62907856, + "step": 108425 + }, + { + "epoch": 16.149836163240987, + "grad_norm": 1.1515102414705325e-05, + "learning_rate": 5.435645689057256e-06, + "loss": 0.0, + "num_input_tokens_seen": 62910736, + "step": 108430 + }, + { + "epoch": 16.150580875781948, + "grad_norm": 3.948770245187916e-05, + "learning_rate": 5.433622903085092e-06, + "loss": 0.0, + "num_input_tokens_seen": 62913648, + "step": 108435 + }, + { + "epoch": 16.15132558832291, + "grad_norm": 0.008609065786004066, + "learning_rate": 5.4316004476669735e-06, + "loss": 0.0388, + "num_input_tokens_seen": 62916784, + "step": 108440 + }, + { + "epoch": 16.152070300863866, + "grad_norm": 2.40569806919666e-05, + "learning_rate": 5.429578322837084e-06, + "loss": 0.0, + "num_input_tokens_seen": 62919632, + "step": 108445 + }, + { + "epoch": 16.152815013404826, + "grad_norm": 2.5371746232849546e-05, + "learning_rate": 5.4275565286295735e-06, + "loss": 0.0, + "num_input_tokens_seen": 62922896, + "step": 108450 + }, + { + "epoch": 16.153559725945787, + "grad_norm": 3.0663315556012094e-05, + "learning_rate": 5.425535065078608e-06, + "loss": 0.0, + "num_input_tokens_seen": 62925936, + "step": 108455 + }, + { + "epoch": 16.154304438486744, + "grad_norm": 9.152978600468487e-05, + "learning_rate": 5.423513932218327e-06, + "loss": 0.0, + "num_input_tokens_seen": 62928784, + "step": 108460 + }, + { + "epoch": 16.155049151027704, + "grad_norm": 4.389236892166082e-06, + "learning_rate": 5.421493130082889e-06, + "loss": 0.0, + "num_input_tokens_seen": 62931792, + "step": 108465 + }, + { + "epoch": 16.15579386356866, + "grad_norm": 0.008949297480285168, + "learning_rate": 5.419472658706423e-06, + "loss": 0.0, + "num_input_tokens_seen": 62934768, + "step": 108470 + }, + { + "epoch": 16.15653857610962, + "grad_norm": 3.312991339043947e-06, + "learning_rate": 5.417452518123067e-06, + "loss": 0.0, + "num_input_tokens_seen": 62937456, + "step": 108475 + }, + { + "epoch": 16.157283288650582, + "grad_norm": 8.070538024185225e-05, + "learning_rate": 5.415432708366949e-06, + "loss": 0.0, + "num_input_tokens_seen": 62940464, + "step": 108480 + }, + { + "epoch": 16.15802800119154, + "grad_norm": 2.903950417021406e-06, + "learning_rate": 5.413413229472184e-06, + "loss": 0.0, + "num_input_tokens_seen": 62943376, + "step": 108485 + }, + { + "epoch": 16.1587727137325, + "grad_norm": 1.2211293324071448e-05, + "learning_rate": 5.411394081472901e-06, + "loss": 0.0052, + "num_input_tokens_seen": 62946160, + "step": 108490 + }, + { + "epoch": 16.15951742627346, + "grad_norm": 3.0874005460646003e-05, + "learning_rate": 5.409375264403199e-06, + "loss": 0.0, + "num_input_tokens_seen": 62948944, + "step": 108495 + }, + { + "epoch": 16.160262138814417, + "grad_norm": 5.693640559911728e-06, + "learning_rate": 5.407356778297198e-06, + "loss": 0.0, + "num_input_tokens_seen": 62951824, + "step": 108500 + }, + { + "epoch": 16.161006851355378, + "grad_norm": 2.2756159523851238e-05, + "learning_rate": 5.4053386231889855e-06, + "loss": 0.0, + "num_input_tokens_seen": 62954800, + "step": 108505 + }, + { + "epoch": 16.161751563896335, + "grad_norm": 2.740594936767593e-05, + "learning_rate": 5.403320799112666e-06, + "loss": 0.0, + "num_input_tokens_seen": 62957456, + "step": 108510 + }, + { + "epoch": 16.162496276437295, + "grad_norm": 8.361876825802028e-05, + "learning_rate": 5.401303306102326e-06, + "loss": 0.0, + "num_input_tokens_seen": 62960272, + "step": 108515 + }, + { + "epoch": 16.163240988978256, + "grad_norm": 1.9462346244836226e-05, + "learning_rate": 5.3992861441920425e-06, + "loss": 0.0, + "num_input_tokens_seen": 62963024, + "step": 108520 + }, + { + "epoch": 16.163985701519213, + "grad_norm": 0.018184058368206024, + "learning_rate": 5.397269313415903e-06, + "loss": 0.0, + "num_input_tokens_seen": 62966128, + "step": 108525 + }, + { + "epoch": 16.164730414060173, + "grad_norm": 3.8721253076801077e-05, + "learning_rate": 5.395252813807969e-06, + "loss": 0.0, + "num_input_tokens_seen": 62969008, + "step": 108530 + }, + { + "epoch": 16.16547512660113, + "grad_norm": 0.0006454480462707579, + "learning_rate": 5.39323664540232e-06, + "loss": 0.0, + "num_input_tokens_seen": 62971888, + "step": 108535 + }, + { + "epoch": 16.16621983914209, + "grad_norm": 3.176085738232359e-05, + "learning_rate": 5.391220808233008e-06, + "loss": 0.0, + "num_input_tokens_seen": 62974736, + "step": 108540 + }, + { + "epoch": 16.16696455168305, + "grad_norm": 1.3145569027983584e-05, + "learning_rate": 5.3892053023340935e-06, + "loss": 0.0, + "num_input_tokens_seen": 62977360, + "step": 108545 + }, + { + "epoch": 16.16770926422401, + "grad_norm": 5.409954610513523e-06, + "learning_rate": 5.387190127739625e-06, + "loss": 0.0, + "num_input_tokens_seen": 62980016, + "step": 108550 + }, + { + "epoch": 16.16845397676497, + "grad_norm": 0.00029141438426449895, + "learning_rate": 5.3851752844836374e-06, + "loss": 0.0, + "num_input_tokens_seen": 62982960, + "step": 108555 + }, + { + "epoch": 16.16919868930593, + "grad_norm": 4.311341399443336e-05, + "learning_rate": 5.383160772600185e-06, + "loss": 0.0, + "num_input_tokens_seen": 62985904, + "step": 108560 + }, + { + "epoch": 16.169943401846886, + "grad_norm": 3.5072214814135805e-05, + "learning_rate": 5.381146592123287e-06, + "loss": 0.0, + "num_input_tokens_seen": 62988816, + "step": 108565 + }, + { + "epoch": 16.170688114387847, + "grad_norm": 2.9893300961703062e-05, + "learning_rate": 5.379132743086984e-06, + "loss": 0.0, + "num_input_tokens_seen": 62991888, + "step": 108570 + }, + { + "epoch": 16.171432826928804, + "grad_norm": 0.00026690331287682056, + "learning_rate": 5.377119225525284e-06, + "loss": 0.0, + "num_input_tokens_seen": 62994608, + "step": 108575 + }, + { + "epoch": 16.172177539469764, + "grad_norm": 0.0008783260709606111, + "learning_rate": 5.375106039472219e-06, + "loss": 0.0, + "num_input_tokens_seen": 62997392, + "step": 108580 + }, + { + "epoch": 16.172922252010725, + "grad_norm": 3.12718793793465e-06, + "learning_rate": 5.373093184961783e-06, + "loss": 0.0, + "num_input_tokens_seen": 63000432, + "step": 108585 + }, + { + "epoch": 16.173666964551682, + "grad_norm": 0.00024808035232126713, + "learning_rate": 5.371080662028e-06, + "loss": 0.0025, + "num_input_tokens_seen": 63003280, + "step": 108590 + }, + { + "epoch": 16.174411677092642, + "grad_norm": 5.532536761165829e-06, + "learning_rate": 5.369068470704855e-06, + "loss": 0.0, + "num_input_tokens_seen": 63005904, + "step": 108595 + }, + { + "epoch": 16.175156389633603, + "grad_norm": 0.00012306739517953247, + "learning_rate": 5.367056611026341e-06, + "loss": 0.0, + "num_input_tokens_seen": 63008912, + "step": 108600 + }, + { + "epoch": 16.17590110217456, + "grad_norm": 3.780423867283389e-06, + "learning_rate": 5.36504508302646e-06, + "loss": 0.0, + "num_input_tokens_seen": 63011760, + "step": 108605 + }, + { + "epoch": 16.17664581471552, + "grad_norm": 5.974221039650729e-06, + "learning_rate": 5.363033886739186e-06, + "loss": 0.0072, + "num_input_tokens_seen": 63014512, + "step": 108610 + }, + { + "epoch": 16.177390527256478, + "grad_norm": 2.0790974303963594e-05, + "learning_rate": 5.361023022198494e-06, + "loss": 0.0, + "num_input_tokens_seen": 63017264, + "step": 108615 + }, + { + "epoch": 16.178135239797438, + "grad_norm": 1.6873931599548087e-05, + "learning_rate": 5.359012489438353e-06, + "loss": 0.0, + "num_input_tokens_seen": 63020144, + "step": 108620 + }, + { + "epoch": 16.1788799523384, + "grad_norm": 2.0039110495417845e-06, + "learning_rate": 5.357002288492741e-06, + "loss": 0.0, + "num_input_tokens_seen": 63023184, + "step": 108625 + }, + { + "epoch": 16.179624664879356, + "grad_norm": 6.969437436055159e-06, + "learning_rate": 5.35499241939561e-06, + "loss": 0.0, + "num_input_tokens_seen": 63025936, + "step": 108630 + }, + { + "epoch": 16.180369377420316, + "grad_norm": 0.0008874900522641838, + "learning_rate": 5.3529828821809065e-06, + "loss": 0.0, + "num_input_tokens_seen": 63028848, + "step": 108635 + }, + { + "epoch": 16.181114089961277, + "grad_norm": 0.0002611593226902187, + "learning_rate": 5.350973676882601e-06, + "loss": 0.0, + "num_input_tokens_seen": 63031984, + "step": 108640 + }, + { + "epoch": 16.181858802502234, + "grad_norm": 0.0006044476758688688, + "learning_rate": 5.3489648035346144e-06, + "loss": 0.0, + "num_input_tokens_seen": 63034768, + "step": 108645 + }, + { + "epoch": 16.182603515043194, + "grad_norm": 6.093724095990183e-06, + "learning_rate": 5.346956262170902e-06, + "loss": 0.0, + "num_input_tokens_seen": 63037616, + "step": 108650 + }, + { + "epoch": 16.18334822758415, + "grad_norm": 5.0170920076197945e-06, + "learning_rate": 5.3449480528253825e-06, + "loss": 0.0, + "num_input_tokens_seen": 63040336, + "step": 108655 + }, + { + "epoch": 16.18409294012511, + "grad_norm": 1.3246045455161948e-05, + "learning_rate": 5.342940175531999e-06, + "loss": 0.0, + "num_input_tokens_seen": 63043280, + "step": 108660 + }, + { + "epoch": 16.184837652666072, + "grad_norm": 7.0193063947954215e-06, + "learning_rate": 5.3409326303246524e-06, + "loss": 0.0, + "num_input_tokens_seen": 63045936, + "step": 108665 + }, + { + "epoch": 16.18558236520703, + "grad_norm": 4.099993020645343e-05, + "learning_rate": 5.338925417237275e-06, + "loss": 0.0031, + "num_input_tokens_seen": 63048784, + "step": 108670 + }, + { + "epoch": 16.18632707774799, + "grad_norm": 5.435768616735004e-05, + "learning_rate": 5.336918536303773e-06, + "loss": 0.0, + "num_input_tokens_seen": 63051760, + "step": 108675 + }, + { + "epoch": 16.187071790288947, + "grad_norm": 7.49677637941204e-05, + "learning_rate": 5.334911987558045e-06, + "loss": 0.0, + "num_input_tokens_seen": 63054384, + "step": 108680 + }, + { + "epoch": 16.187816502829907, + "grad_norm": 5.0024496886180714e-05, + "learning_rate": 5.332905771033994e-06, + "loss": 0.0, + "num_input_tokens_seen": 63057040, + "step": 108685 + }, + { + "epoch": 16.188561215370868, + "grad_norm": 1.3407448022917379e-05, + "learning_rate": 5.330899886765503e-06, + "loss": 0.0, + "num_input_tokens_seen": 63059728, + "step": 108690 + }, + { + "epoch": 16.189305927911825, + "grad_norm": 1.5050873116706498e-05, + "learning_rate": 5.328894334786474e-06, + "loss": 0.0, + "num_input_tokens_seen": 63062544, + "step": 108695 + }, + { + "epoch": 16.190050640452785, + "grad_norm": 0.00010944580571958795, + "learning_rate": 5.326889115130779e-06, + "loss": 0.0, + "num_input_tokens_seen": 63065712, + "step": 108700 + }, + { + "epoch": 16.190795352993746, + "grad_norm": 9.39419351198012e-06, + "learning_rate": 5.324884227832302e-06, + "loss": 0.0, + "num_input_tokens_seen": 63068336, + "step": 108705 + }, + { + "epoch": 16.191540065534703, + "grad_norm": 5.8987436204915866e-05, + "learning_rate": 5.322879672924908e-06, + "loss": 0.0, + "num_input_tokens_seen": 63071376, + "step": 108710 + }, + { + "epoch": 16.192284778075663, + "grad_norm": 6.135831790743396e-05, + "learning_rate": 5.3208754504424585e-06, + "loss": 0.0, + "num_input_tokens_seen": 63074160, + "step": 108715 + }, + { + "epoch": 16.19302949061662, + "grad_norm": 6.45414957034518e-06, + "learning_rate": 5.318871560418822e-06, + "loss": 0.0, + "num_input_tokens_seen": 63076880, + "step": 108720 + }, + { + "epoch": 16.19377420315758, + "grad_norm": 8.308341057272628e-05, + "learning_rate": 5.316868002887843e-06, + "loss": 0.0, + "num_input_tokens_seen": 63079600, + "step": 108725 + }, + { + "epoch": 16.19451891569854, + "grad_norm": 1.4124545486993156e-05, + "learning_rate": 5.314864777883377e-06, + "loss": 0.0, + "num_input_tokens_seen": 63082544, + "step": 108730 + }, + { + "epoch": 16.1952636282395, + "grad_norm": 7.248318070196547e-06, + "learning_rate": 5.3128618854392655e-06, + "loss": 0.0002, + "num_input_tokens_seen": 63085840, + "step": 108735 + }, + { + "epoch": 16.19600834078046, + "grad_norm": 3.2246264254354173e-06, + "learning_rate": 5.3108593255893376e-06, + "loss": 0.0, + "num_input_tokens_seen": 63088400, + "step": 108740 + }, + { + "epoch": 16.19675305332142, + "grad_norm": 0.00012585462536662817, + "learning_rate": 5.308857098367437e-06, + "loss": 0.0, + "num_input_tokens_seen": 63091120, + "step": 108745 + }, + { + "epoch": 16.197497765862376, + "grad_norm": 1.9954442905145697e-05, + "learning_rate": 5.306855203807382e-06, + "loss": 0.0, + "num_input_tokens_seen": 63094096, + "step": 108750 + }, + { + "epoch": 16.198242478403337, + "grad_norm": 3.3299008919129847e-06, + "learning_rate": 5.304853641942995e-06, + "loss": 0.0, + "num_input_tokens_seen": 63096976, + "step": 108755 + }, + { + "epoch": 16.198987190944294, + "grad_norm": 2.9185401217546314e-05, + "learning_rate": 5.302852412808079e-06, + "loss": 0.0, + "num_input_tokens_seen": 63099568, + "step": 108760 + }, + { + "epoch": 16.199731903485254, + "grad_norm": 6.807584668422351e-06, + "learning_rate": 5.3008515164364585e-06, + "loss": 0.0, + "num_input_tokens_seen": 63102480, + "step": 108765 + }, + { + "epoch": 16.200476616026215, + "grad_norm": 1.2102657819923479e-05, + "learning_rate": 5.298850952861925e-06, + "loss": 0.0, + "num_input_tokens_seen": 63105392, + "step": 108770 + }, + { + "epoch": 16.201221328567172, + "grad_norm": 5.239772235654527e-06, + "learning_rate": 5.296850722118288e-06, + "loss": 0.0, + "num_input_tokens_seen": 63108240, + "step": 108775 + }, + { + "epoch": 16.201966041108133, + "grad_norm": 2.8038607524649706e-06, + "learning_rate": 5.294850824239325e-06, + "loss": 0.0, + "num_input_tokens_seen": 63111344, + "step": 108780 + }, + { + "epoch": 16.202710753649093, + "grad_norm": 3.3943062589969486e-05, + "learning_rate": 5.292851259258838e-06, + "loss": 0.0, + "num_input_tokens_seen": 63114384, + "step": 108785 + }, + { + "epoch": 16.20345546619005, + "grad_norm": 0.08924758434295654, + "learning_rate": 5.2908520272106e-06, + "loss": 0.0002, + "num_input_tokens_seen": 63117296, + "step": 108790 + }, + { + "epoch": 16.20420017873101, + "grad_norm": 0.00014598281995858997, + "learning_rate": 5.288853128128377e-06, + "loss": 0.0, + "num_input_tokens_seen": 63120016, + "step": 108795 + }, + { + "epoch": 16.204944891271968, + "grad_norm": 0.00015002839791122824, + "learning_rate": 5.2868545620459535e-06, + "loss": 0.0, + "num_input_tokens_seen": 63122800, + "step": 108800 + }, + { + "epoch": 16.205689603812928, + "grad_norm": 3.981570989708416e-05, + "learning_rate": 5.284856328997087e-06, + "loss": 0.0, + "num_input_tokens_seen": 63125552, + "step": 108805 + }, + { + "epoch": 16.20643431635389, + "grad_norm": 5.366186087485403e-06, + "learning_rate": 5.282858429015536e-06, + "loss": 0.0, + "num_input_tokens_seen": 63128400, + "step": 108810 + }, + { + "epoch": 16.207179028894846, + "grad_norm": 3.794147460212116e-06, + "learning_rate": 5.280860862135045e-06, + "loss": 0.0, + "num_input_tokens_seen": 63131376, + "step": 108815 + }, + { + "epoch": 16.207923741435806, + "grad_norm": 8.98280632100068e-06, + "learning_rate": 5.278863628389377e-06, + "loss": 0.0, + "num_input_tokens_seen": 63134320, + "step": 108820 + }, + { + "epoch": 16.208668453976767, + "grad_norm": 0.00045122395385988057, + "learning_rate": 5.276866727812255e-06, + "loss": 0.0, + "num_input_tokens_seen": 63137584, + "step": 108825 + }, + { + "epoch": 16.209413166517724, + "grad_norm": 9.862091246759519e-06, + "learning_rate": 5.274870160437431e-06, + "loss": 0.0, + "num_input_tokens_seen": 63140272, + "step": 108830 + }, + { + "epoch": 16.210157879058684, + "grad_norm": 0.015139954164624214, + "learning_rate": 5.272873926298627e-06, + "loss": 0.0, + "num_input_tokens_seen": 63143376, + "step": 108835 + }, + { + "epoch": 16.21090259159964, + "grad_norm": 1.456500558560947e-05, + "learning_rate": 5.270878025429565e-06, + "loss": 0.0, + "num_input_tokens_seen": 63145936, + "step": 108840 + }, + { + "epoch": 16.2116473041406, + "grad_norm": 4.7221801651176065e-06, + "learning_rate": 5.268882457863972e-06, + "loss": 0.0, + "num_input_tokens_seen": 63148912, + "step": 108845 + }, + { + "epoch": 16.212392016681562, + "grad_norm": 7.241396815516055e-06, + "learning_rate": 5.266887223635547e-06, + "loss": 0.0, + "num_input_tokens_seen": 63151600, + "step": 108850 + }, + { + "epoch": 16.21313672922252, + "grad_norm": 3.22615915138158e-06, + "learning_rate": 5.264892322778014e-06, + "loss": 0.0, + "num_input_tokens_seen": 63154512, + "step": 108855 + }, + { + "epoch": 16.21388144176348, + "grad_norm": 1.0570368431217503e-05, + "learning_rate": 5.262897755325064e-06, + "loss": 0.0, + "num_input_tokens_seen": 63157328, + "step": 108860 + }, + { + "epoch": 16.214626154304437, + "grad_norm": 8.345911737706047e-06, + "learning_rate": 5.260903521310401e-06, + "loss": 0.0, + "num_input_tokens_seen": 63160592, + "step": 108865 + }, + { + "epoch": 16.215370866845397, + "grad_norm": 2.6754958525998518e-05, + "learning_rate": 5.25890962076771e-06, + "loss": 0.0, + "num_input_tokens_seen": 63163696, + "step": 108870 + }, + { + "epoch": 16.216115579386358, + "grad_norm": 1.3083842532068957e-05, + "learning_rate": 5.256916053730679e-06, + "loss": 0.0, + "num_input_tokens_seen": 63166576, + "step": 108875 + }, + { + "epoch": 16.216860291927315, + "grad_norm": 1.0876166925299913e-05, + "learning_rate": 5.254922820232983e-06, + "loss": 0.0, + "num_input_tokens_seen": 63169392, + "step": 108880 + }, + { + "epoch": 16.217605004468275, + "grad_norm": 2.7021224013878964e-05, + "learning_rate": 5.2529299203082914e-06, + "loss": 0.0, + "num_input_tokens_seen": 63172176, + "step": 108885 + }, + { + "epoch": 16.218349717009236, + "grad_norm": 5.383250754675828e-05, + "learning_rate": 5.250937353990288e-06, + "loss": 0.0, + "num_input_tokens_seen": 63175120, + "step": 108890 + }, + { + "epoch": 16.219094429550193, + "grad_norm": 6.872232006571721e-06, + "learning_rate": 5.248945121312618e-06, + "loss": 0.0, + "num_input_tokens_seen": 63178032, + "step": 108895 + }, + { + "epoch": 16.219839142091153, + "grad_norm": 1.731753218336962e-05, + "learning_rate": 5.246953222308953e-06, + "loss": 0.0, + "num_input_tokens_seen": 63181072, + "step": 108900 + }, + { + "epoch": 16.22058385463211, + "grad_norm": 2.5637070848461008e-06, + "learning_rate": 5.244961657012928e-06, + "loss": 0.0, + "num_input_tokens_seen": 63183728, + "step": 108905 + }, + { + "epoch": 16.22132856717307, + "grad_norm": 1.4999373888713308e-05, + "learning_rate": 5.242970425458208e-06, + "loss": 0.0, + "num_input_tokens_seen": 63187024, + "step": 108910 + }, + { + "epoch": 16.22207327971403, + "grad_norm": 0.00048612733371555805, + "learning_rate": 5.240979527678422e-06, + "loss": 0.0, + "num_input_tokens_seen": 63190000, + "step": 108915 + }, + { + "epoch": 16.22281799225499, + "grad_norm": 6.28250782028772e-05, + "learning_rate": 5.238988963707195e-06, + "loss": 0.1563, + "num_input_tokens_seen": 63192944, + "step": 108920 + }, + { + "epoch": 16.22356270479595, + "grad_norm": 0.000391588342608884, + "learning_rate": 5.236998733578175e-06, + "loss": 0.0, + "num_input_tokens_seen": 63195760, + "step": 108925 + }, + { + "epoch": 16.22430741733691, + "grad_norm": 1.0865016520256177e-05, + "learning_rate": 5.235008837324967e-06, + "loss": 0.0, + "num_input_tokens_seen": 63198864, + "step": 108930 + }, + { + "epoch": 16.225052129877866, + "grad_norm": 0.0017072462942451239, + "learning_rate": 5.233019274981205e-06, + "loss": 0.0, + "num_input_tokens_seen": 63201680, + "step": 108935 + }, + { + "epoch": 16.225796842418827, + "grad_norm": 7.988755533006042e-05, + "learning_rate": 5.23103004658049e-06, + "loss": 0.0, + "num_input_tokens_seen": 63204560, + "step": 108940 + }, + { + "epoch": 16.226541554959784, + "grad_norm": 3.9041096897562966e-06, + "learning_rate": 5.2290411521564305e-06, + "loss": 0.0, + "num_input_tokens_seen": 63207216, + "step": 108945 + }, + { + "epoch": 16.227286267500745, + "grad_norm": 7.084612298058346e-05, + "learning_rate": 5.227052591742626e-06, + "loss": 0.0, + "num_input_tokens_seen": 63210288, + "step": 108950 + }, + { + "epoch": 16.228030980041705, + "grad_norm": 4.781485586136114e-06, + "learning_rate": 5.225064365372667e-06, + "loss": 0.0, + "num_input_tokens_seen": 63213168, + "step": 108955 + }, + { + "epoch": 16.228775692582662, + "grad_norm": 0.0005214344128035009, + "learning_rate": 5.223076473080152e-06, + "loss": 0.0, + "num_input_tokens_seen": 63216080, + "step": 108960 + }, + { + "epoch": 16.229520405123623, + "grad_norm": 6.772230699425563e-05, + "learning_rate": 5.221088914898653e-06, + "loss": 0.0, + "num_input_tokens_seen": 63218960, + "step": 108965 + }, + { + "epoch": 16.230265117664583, + "grad_norm": 2.7616082661552355e-05, + "learning_rate": 5.219101690861763e-06, + "loss": 0.0, + "num_input_tokens_seen": 63222160, + "step": 108970 + }, + { + "epoch": 16.23100983020554, + "grad_norm": 1.0774522706924472e-05, + "learning_rate": 5.217114801003037e-06, + "loss": 0.0, + "num_input_tokens_seen": 63224912, + "step": 108975 + }, + { + "epoch": 16.2317545427465, + "grad_norm": 2.3669466827414e-05, + "learning_rate": 5.215128245356057e-06, + "loss": 0.0, + "num_input_tokens_seen": 63227920, + "step": 108980 + }, + { + "epoch": 16.232499255287458, + "grad_norm": 5.019479431211948e-06, + "learning_rate": 5.2131420239543704e-06, + "loss": 0.0, + "num_input_tokens_seen": 63230768, + "step": 108985 + }, + { + "epoch": 16.233243967828418, + "grad_norm": 0.0036207998637109995, + "learning_rate": 5.211156136831546e-06, + "loss": 0.0, + "num_input_tokens_seen": 63233648, + "step": 108990 + }, + { + "epoch": 16.23398868036938, + "grad_norm": 3.279904558439739e-05, + "learning_rate": 5.209170584021125e-06, + "loss": 0.0, + "num_input_tokens_seen": 63236496, + "step": 108995 + }, + { + "epoch": 16.234733392910336, + "grad_norm": 3.331456537125632e-05, + "learning_rate": 5.207185365556646e-06, + "loss": 0.0, + "num_input_tokens_seen": 63239280, + "step": 109000 + }, + { + "epoch": 16.235478105451296, + "grad_norm": 4.714168426289689e-06, + "learning_rate": 5.205200481471662e-06, + "loss": 0.0, + "num_input_tokens_seen": 63241840, + "step": 109005 + }, + { + "epoch": 16.236222817992257, + "grad_norm": 5.303461875882931e-05, + "learning_rate": 5.2032159317996955e-06, + "loss": 0.0, + "num_input_tokens_seen": 63244848, + "step": 109010 + }, + { + "epoch": 16.236967530533214, + "grad_norm": 1.944964651556802e-06, + "learning_rate": 5.201231716574276e-06, + "loss": 0.0, + "num_input_tokens_seen": 63247696, + "step": 109015 + }, + { + "epoch": 16.237712243074174, + "grad_norm": 1.5133760825847276e-05, + "learning_rate": 5.199247835828916e-06, + "loss": 0.0, + "num_input_tokens_seen": 63250704, + "step": 109020 + }, + { + "epoch": 16.23845695561513, + "grad_norm": 2.2376698325388134e-06, + "learning_rate": 5.197264289597148e-06, + "loss": 0.0, + "num_input_tokens_seen": 63253456, + "step": 109025 + }, + { + "epoch": 16.239201668156092, + "grad_norm": 0.0002525192394386977, + "learning_rate": 5.195281077912473e-06, + "loss": 0.0, + "num_input_tokens_seen": 63256336, + "step": 109030 + }, + { + "epoch": 16.239946380697052, + "grad_norm": 6.41360311419703e-05, + "learning_rate": 5.193298200808389e-06, + "loss": 0.0, + "num_input_tokens_seen": 63259312, + "step": 109035 + }, + { + "epoch": 16.24069109323801, + "grad_norm": 4.902798082184745e-06, + "learning_rate": 5.191315658318408e-06, + "loss": 0.0, + "num_input_tokens_seen": 63262256, + "step": 109040 + }, + { + "epoch": 16.24143580577897, + "grad_norm": 2.7704887543222867e-05, + "learning_rate": 5.189333450476008e-06, + "loss": 0.0, + "num_input_tokens_seen": 63264976, + "step": 109045 + }, + { + "epoch": 16.242180518319927, + "grad_norm": 7.800637831678614e-06, + "learning_rate": 5.187351577314692e-06, + "loss": 0.0, + "num_input_tokens_seen": 63267888, + "step": 109050 + }, + { + "epoch": 16.242925230860887, + "grad_norm": 1.1428166544646956e-05, + "learning_rate": 5.185370038867929e-06, + "loss": 0.0, + "num_input_tokens_seen": 63270832, + "step": 109055 + }, + { + "epoch": 16.243669943401848, + "grad_norm": 5.9463742218213156e-05, + "learning_rate": 5.183388835169206e-06, + "loss": 0.0, + "num_input_tokens_seen": 63273488, + "step": 109060 + }, + { + "epoch": 16.244414655942805, + "grad_norm": 0.00011715728760464117, + "learning_rate": 5.181407966251986e-06, + "loss": 0.0, + "num_input_tokens_seen": 63276368, + "step": 109065 + }, + { + "epoch": 16.245159368483765, + "grad_norm": 7.410971193166915e-06, + "learning_rate": 5.179427432149733e-06, + "loss": 0.0, + "num_input_tokens_seen": 63279504, + "step": 109070 + }, + { + "epoch": 16.245904081024726, + "grad_norm": 4.391140009829542e-06, + "learning_rate": 5.177447232895913e-06, + "loss": 0.0, + "num_input_tokens_seen": 63282352, + "step": 109075 + }, + { + "epoch": 16.246648793565683, + "grad_norm": 4.621469543053536e-06, + "learning_rate": 5.1754673685239755e-06, + "loss": 0.0001, + "num_input_tokens_seen": 63285072, + "step": 109080 + }, + { + "epoch": 16.247393506106643, + "grad_norm": 9.56478834268637e-06, + "learning_rate": 5.173487839067371e-06, + "loss": 0.0, + "num_input_tokens_seen": 63287760, + "step": 109085 + }, + { + "epoch": 16.2481382186476, + "grad_norm": 2.0596844478859566e-05, + "learning_rate": 5.171508644559528e-06, + "loss": 0.0, + "num_input_tokens_seen": 63290896, + "step": 109090 + }, + { + "epoch": 16.24888293118856, + "grad_norm": 2.8778415526176104e-06, + "learning_rate": 5.169529785033903e-06, + "loss": 0.0, + "num_input_tokens_seen": 63293648, + "step": 109095 + }, + { + "epoch": 16.24962764372952, + "grad_norm": 0.0002107897453242913, + "learning_rate": 5.167551260523909e-06, + "loss": 0.0, + "num_input_tokens_seen": 63296336, + "step": 109100 + }, + { + "epoch": 16.25037235627048, + "grad_norm": 0.0006201416254043579, + "learning_rate": 5.165573071062985e-06, + "loss": 0.0, + "num_input_tokens_seen": 63298864, + "step": 109105 + }, + { + "epoch": 16.25111706881144, + "grad_norm": 4.413937858771533e-06, + "learning_rate": 5.163595216684541e-06, + "loss": 0.0, + "num_input_tokens_seen": 63302032, + "step": 109110 + }, + { + "epoch": 16.2518617813524, + "grad_norm": 8.658808837935794e-06, + "learning_rate": 5.161617697422003e-06, + "loss": 0.0, + "num_input_tokens_seen": 63304784, + "step": 109115 + }, + { + "epoch": 16.252606493893357, + "grad_norm": 3.018003553734161e-05, + "learning_rate": 5.159640513308767e-06, + "loss": 0.0, + "num_input_tokens_seen": 63307728, + "step": 109120 + }, + { + "epoch": 16.253351206434317, + "grad_norm": 0.00011450373131083325, + "learning_rate": 5.1576636643782376e-06, + "loss": 0.0, + "num_input_tokens_seen": 63310448, + "step": 109125 + }, + { + "epoch": 16.254095918975274, + "grad_norm": 338.27276611328125, + "learning_rate": 5.155687150663815e-06, + "loss": 0.2969, + "num_input_tokens_seen": 63313232, + "step": 109130 + }, + { + "epoch": 16.254840631516235, + "grad_norm": 1.71275605680421e-05, + "learning_rate": 5.153710972198894e-06, + "loss": 0.0, + "num_input_tokens_seen": 63316272, + "step": 109135 + }, + { + "epoch": 16.255585344057195, + "grad_norm": 6.97183349984698e-05, + "learning_rate": 5.151735129016855e-06, + "loss": 0.0, + "num_input_tokens_seen": 63319280, + "step": 109140 + }, + { + "epoch": 16.256330056598152, + "grad_norm": 2.983974354719976e-06, + "learning_rate": 5.149759621151068e-06, + "loss": 0.0, + "num_input_tokens_seen": 63322256, + "step": 109145 + }, + { + "epoch": 16.257074769139113, + "grad_norm": 2.27761975111207e-05, + "learning_rate": 5.147784448634926e-06, + "loss": 0.0, + "num_input_tokens_seen": 63325360, + "step": 109150 + }, + { + "epoch": 16.257819481680073, + "grad_norm": 4.591194010572508e-05, + "learning_rate": 5.145809611501789e-06, + "loss": 0.0, + "num_input_tokens_seen": 63328304, + "step": 109155 + }, + { + "epoch": 16.25856419422103, + "grad_norm": 5.835492174810497e-06, + "learning_rate": 5.143835109785014e-06, + "loss": 0.0, + "num_input_tokens_seen": 63331088, + "step": 109160 + }, + { + "epoch": 16.25930890676199, + "grad_norm": 2.7782536562881432e-05, + "learning_rate": 5.1418609435179676e-06, + "loss": 0.0, + "num_input_tokens_seen": 63333872, + "step": 109165 + }, + { + "epoch": 16.260053619302948, + "grad_norm": 4.4792896005674265e-06, + "learning_rate": 5.139887112733993e-06, + "loss": 0.0, + "num_input_tokens_seen": 63336400, + "step": 109170 + }, + { + "epoch": 16.260798331843908, + "grad_norm": 0.0008342459332197905, + "learning_rate": 5.137913617466447e-06, + "loss": 0.0, + "num_input_tokens_seen": 63339184, + "step": 109175 + }, + { + "epoch": 16.26154304438487, + "grad_norm": 1.1222595276194625e-05, + "learning_rate": 5.1359404577486585e-06, + "loss": 0.0, + "num_input_tokens_seen": 63342032, + "step": 109180 + }, + { + "epoch": 16.262287756925826, + "grad_norm": 0.0030087660998106003, + "learning_rate": 5.13396763361397e-06, + "loss": 0.0, + "num_input_tokens_seen": 63345232, + "step": 109185 + }, + { + "epoch": 16.263032469466786, + "grad_norm": 2.7809996936412062e-06, + "learning_rate": 5.131995145095705e-06, + "loss": 0.0, + "num_input_tokens_seen": 63348016, + "step": 109190 + }, + { + "epoch": 16.263777182007743, + "grad_norm": 6.153729191282764e-05, + "learning_rate": 5.130022992227193e-06, + "loss": 0.0, + "num_input_tokens_seen": 63350672, + "step": 109195 + }, + { + "epoch": 16.264521894548704, + "grad_norm": 1.61997159011662e-05, + "learning_rate": 5.128051175041748e-06, + "loss": 0.0, + "num_input_tokens_seen": 63353584, + "step": 109200 + }, + { + "epoch": 16.265266607089664, + "grad_norm": 5.05466123286169e-05, + "learning_rate": 5.126079693572683e-06, + "loss": 0.0, + "num_input_tokens_seen": 63356720, + "step": 109205 + }, + { + "epoch": 16.26601131963062, + "grad_norm": 3.3547376006026752e-06, + "learning_rate": 5.124108547853301e-06, + "loss": 0.0, + "num_input_tokens_seen": 63359472, + "step": 109210 + }, + { + "epoch": 16.266756032171582, + "grad_norm": 3.595277121348772e-06, + "learning_rate": 5.122137737916896e-06, + "loss": 0.0, + "num_input_tokens_seen": 63362832, + "step": 109215 + }, + { + "epoch": 16.267500744712542, + "grad_norm": 1.0355478480050806e-05, + "learning_rate": 5.120167263796779e-06, + "loss": 0.0, + "num_input_tokens_seen": 63365712, + "step": 109220 + }, + { + "epoch": 16.2682454572535, + "grad_norm": 5.01473514304962e-05, + "learning_rate": 5.118197125526228e-06, + "loss": 0.0, + "num_input_tokens_seen": 63368560, + "step": 109225 + }, + { + "epoch": 16.26899016979446, + "grad_norm": 0.0002505450393073261, + "learning_rate": 5.116227323138531e-06, + "loss": 0.0, + "num_input_tokens_seen": 63371600, + "step": 109230 + }, + { + "epoch": 16.269734882335417, + "grad_norm": 0.00013631832553073764, + "learning_rate": 5.114257856666968e-06, + "loss": 0.0, + "num_input_tokens_seen": 63374320, + "step": 109235 + }, + { + "epoch": 16.270479594876377, + "grad_norm": 3.441110720814322e-06, + "learning_rate": 5.112288726144798e-06, + "loss": 0.0, + "num_input_tokens_seen": 63377392, + "step": 109240 + }, + { + "epoch": 16.271224307417338, + "grad_norm": 7.092782652762253e-06, + "learning_rate": 5.110319931605306e-06, + "loss": 0.0, + "num_input_tokens_seen": 63380464, + "step": 109245 + }, + { + "epoch": 16.271969019958295, + "grad_norm": 0.00010697559628169984, + "learning_rate": 5.1083514730817375e-06, + "loss": 0.0, + "num_input_tokens_seen": 63383696, + "step": 109250 + }, + { + "epoch": 16.272713732499255, + "grad_norm": 1.1564950909814797e-05, + "learning_rate": 5.106383350607358e-06, + "loss": 0.0, + "num_input_tokens_seen": 63386384, + "step": 109255 + }, + { + "epoch": 16.273458445040216, + "grad_norm": 1.8843504221877083e-05, + "learning_rate": 5.104415564215409e-06, + "loss": 0.0, + "num_input_tokens_seen": 63389104, + "step": 109260 + }, + { + "epoch": 16.274203157581173, + "grad_norm": 5.347292608348653e-05, + "learning_rate": 5.102448113939143e-06, + "loss": 0.0, + "num_input_tokens_seen": 63391856, + "step": 109265 + }, + { + "epoch": 16.274947870122134, + "grad_norm": 0.004786031786352396, + "learning_rate": 5.100480999811794e-06, + "loss": 0.0, + "num_input_tokens_seen": 63394544, + "step": 109270 + }, + { + "epoch": 16.27569258266309, + "grad_norm": 4.917203477816656e-05, + "learning_rate": 5.09851422186659e-06, + "loss": 0.0, + "num_input_tokens_seen": 63397808, + "step": 109275 + }, + { + "epoch": 16.27643729520405, + "grad_norm": 2.5982349143305328e-06, + "learning_rate": 5.096547780136765e-06, + "loss": 0.0, + "num_input_tokens_seen": 63401040, + "step": 109280 + }, + { + "epoch": 16.27718200774501, + "grad_norm": 9.355443580716383e-06, + "learning_rate": 5.0945816746555295e-06, + "loss": 0.0, + "num_input_tokens_seen": 63403888, + "step": 109285 + }, + { + "epoch": 16.27792672028597, + "grad_norm": 3.589318657759577e-05, + "learning_rate": 5.092615905456111e-06, + "loss": 0.0, + "num_input_tokens_seen": 63406608, + "step": 109290 + }, + { + "epoch": 16.27867143282693, + "grad_norm": 1.8332828403799795e-05, + "learning_rate": 5.090650472571709e-06, + "loss": 0.0, + "num_input_tokens_seen": 63409840, + "step": 109295 + }, + { + "epoch": 16.27941614536789, + "grad_norm": 4.8078432882903144e-05, + "learning_rate": 5.088685376035538e-06, + "loss": 0.0, + "num_input_tokens_seen": 63412528, + "step": 109300 + }, + { + "epoch": 16.280160857908847, + "grad_norm": 0.00012882384180556983, + "learning_rate": 5.086720615880783e-06, + "loss": 0.0, + "num_input_tokens_seen": 63415376, + "step": 109305 + }, + { + "epoch": 16.280905570449807, + "grad_norm": 1.0088310773426201e-05, + "learning_rate": 5.084756192140652e-06, + "loss": 0.0373, + "num_input_tokens_seen": 63418096, + "step": 109310 + }, + { + "epoch": 16.281650282990764, + "grad_norm": 2.0373308871057816e-05, + "learning_rate": 5.082792104848325e-06, + "loss": 0.0, + "num_input_tokens_seen": 63421328, + "step": 109315 + }, + { + "epoch": 16.282394995531725, + "grad_norm": 2.3563939066661987e-06, + "learning_rate": 5.080828354036974e-06, + "loss": 0.0, + "num_input_tokens_seen": 63424464, + "step": 109320 + }, + { + "epoch": 16.283139708072685, + "grad_norm": 4.132772119191941e-06, + "learning_rate": 5.078864939739789e-06, + "loss": 0.0, + "num_input_tokens_seen": 63427184, + "step": 109325 + }, + { + "epoch": 16.283884420613642, + "grad_norm": 0.00014042569091543555, + "learning_rate": 5.076901861989927e-06, + "loss": 0.0, + "num_input_tokens_seen": 63430128, + "step": 109330 + }, + { + "epoch": 16.284629133154603, + "grad_norm": 3.228620334994048e-05, + "learning_rate": 5.074939120820568e-06, + "loss": 0.0, + "num_input_tokens_seen": 63433008, + "step": 109335 + }, + { + "epoch": 16.285373845695563, + "grad_norm": 4.184476347290911e-05, + "learning_rate": 5.072976716264863e-06, + "loss": 0.0, + "num_input_tokens_seen": 63435888, + "step": 109340 + }, + { + "epoch": 16.28611855823652, + "grad_norm": 3.1838187624089187e-06, + "learning_rate": 5.07101464835596e-06, + "loss": 0.0, + "num_input_tokens_seen": 63438864, + "step": 109345 + }, + { + "epoch": 16.28686327077748, + "grad_norm": 4.013576472061686e-06, + "learning_rate": 5.069052917127004e-06, + "loss": 0.0, + "num_input_tokens_seen": 63442064, + "step": 109350 + }, + { + "epoch": 16.287607983318438, + "grad_norm": 1.4095805454417132e-05, + "learning_rate": 5.06709152261115e-06, + "loss": 0.031, + "num_input_tokens_seen": 63445264, + "step": 109355 + }, + { + "epoch": 16.2883526958594, + "grad_norm": 0.0025459269527345896, + "learning_rate": 5.065130464841525e-06, + "loss": 0.0, + "num_input_tokens_seen": 63448208, + "step": 109360 + }, + { + "epoch": 16.28909740840036, + "grad_norm": 4.177210030320566e-06, + "learning_rate": 5.063169743851251e-06, + "loss": 0.0, + "num_input_tokens_seen": 63451152, + "step": 109365 + }, + { + "epoch": 16.289842120941316, + "grad_norm": 4.332160187914269e-06, + "learning_rate": 5.061209359673471e-06, + "loss": 0.0, + "num_input_tokens_seen": 63454160, + "step": 109370 + }, + { + "epoch": 16.290586833482276, + "grad_norm": 2.3520879040006548e-05, + "learning_rate": 5.059249312341286e-06, + "loss": 0.0158, + "num_input_tokens_seen": 63456944, + "step": 109375 + }, + { + "epoch": 16.291331546023233, + "grad_norm": 4.294719929021085e-06, + "learning_rate": 5.057289601887824e-06, + "loss": 0.0, + "num_input_tokens_seen": 63459632, + "step": 109380 + }, + { + "epoch": 16.292076258564194, + "grad_norm": 0.00047654641093686223, + "learning_rate": 5.055330228346178e-06, + "loss": 0.0, + "num_input_tokens_seen": 63462608, + "step": 109385 + }, + { + "epoch": 16.292820971105154, + "grad_norm": 2.3342085114563815e-05, + "learning_rate": 5.053371191749465e-06, + "loss": 0.0, + "num_input_tokens_seen": 63465712, + "step": 109390 + }, + { + "epoch": 16.29356568364611, + "grad_norm": 0.0007953591994009912, + "learning_rate": 5.051412492130772e-06, + "loss": 0.0, + "num_input_tokens_seen": 63468432, + "step": 109395 + }, + { + "epoch": 16.294310396187072, + "grad_norm": 5.309313564794138e-05, + "learning_rate": 5.049454129523185e-06, + "loss": 0.0, + "num_input_tokens_seen": 63471248, + "step": 109400 + }, + { + "epoch": 16.295055108728032, + "grad_norm": 1.3323310668056365e-05, + "learning_rate": 5.047496103959798e-06, + "loss": 0.0, + "num_input_tokens_seen": 63473776, + "step": 109405 + }, + { + "epoch": 16.29579982126899, + "grad_norm": 3.384469891898334e-06, + "learning_rate": 5.045538415473686e-06, + "loss": 0.0, + "num_input_tokens_seen": 63476592, + "step": 109410 + }, + { + "epoch": 16.29654453380995, + "grad_norm": 3.893213943229057e-06, + "learning_rate": 5.0435810640979215e-06, + "loss": 0.0, + "num_input_tokens_seen": 63479472, + "step": 109415 + }, + { + "epoch": 16.297289246350907, + "grad_norm": 9.41991493164096e-06, + "learning_rate": 5.041624049865567e-06, + "loss": 0.0, + "num_input_tokens_seen": 63482576, + "step": 109420 + }, + { + "epoch": 16.298033958891867, + "grad_norm": 1.1514167454151902e-05, + "learning_rate": 5.039667372809695e-06, + "loss": 0.0, + "num_input_tokens_seen": 63485392, + "step": 109425 + }, + { + "epoch": 16.298778671432828, + "grad_norm": 1.0929617019428406e-05, + "learning_rate": 5.0377110329633495e-06, + "loss": 0.1239, + "num_input_tokens_seen": 63488304, + "step": 109430 + }, + { + "epoch": 16.299523383973785, + "grad_norm": 7.365187684627017e-06, + "learning_rate": 5.035755030359593e-06, + "loss": 0.0, + "num_input_tokens_seen": 63491312, + "step": 109435 + }, + { + "epoch": 16.300268096514746, + "grad_norm": 1.0141696293430869e-05, + "learning_rate": 5.0337993650314665e-06, + "loss": 0.0, + "num_input_tokens_seen": 63494256, + "step": 109440 + }, + { + "epoch": 16.301012809055706, + "grad_norm": 0.00011659455776680261, + "learning_rate": 5.0318440370119985e-06, + "loss": 0.0, + "num_input_tokens_seen": 63497168, + "step": 109445 + }, + { + "epoch": 16.301757521596663, + "grad_norm": 40.3378791809082, + "learning_rate": 5.029889046334238e-06, + "loss": 0.0329, + "num_input_tokens_seen": 63500048, + "step": 109450 + }, + { + "epoch": 16.302502234137624, + "grad_norm": 0.00047754624392837286, + "learning_rate": 5.0279343930312e-06, + "loss": 0.0, + "num_input_tokens_seen": 63503216, + "step": 109455 + }, + { + "epoch": 16.30324694667858, + "grad_norm": 1.5657091353205033e-05, + "learning_rate": 5.025980077135917e-06, + "loss": 0.0, + "num_input_tokens_seen": 63506256, + "step": 109460 + }, + { + "epoch": 16.30399165921954, + "grad_norm": 9.729803423397243e-05, + "learning_rate": 5.0240260986814e-06, + "loss": 0.0, + "num_input_tokens_seen": 63509136, + "step": 109465 + }, + { + "epoch": 16.3047363717605, + "grad_norm": 2.6436944608576596e-05, + "learning_rate": 5.022072457700658e-06, + "loss": 0.0, + "num_input_tokens_seen": 63511952, + "step": 109470 + }, + { + "epoch": 16.30548108430146, + "grad_norm": 6.417661643354222e-05, + "learning_rate": 5.020119154226699e-06, + "loss": 0.0, + "num_input_tokens_seen": 63515024, + "step": 109475 + }, + { + "epoch": 16.30622579684242, + "grad_norm": 5.997741027385928e-06, + "learning_rate": 5.018166188292514e-06, + "loss": 0.0, + "num_input_tokens_seen": 63518000, + "step": 109480 + }, + { + "epoch": 16.30697050938338, + "grad_norm": 0.00010774387919809669, + "learning_rate": 5.016213559931107e-06, + "loss": 0.0, + "num_input_tokens_seen": 63520880, + "step": 109485 + }, + { + "epoch": 16.307715221924337, + "grad_norm": 9.296036296291277e-06, + "learning_rate": 5.014261269175457e-06, + "loss": 0.0, + "num_input_tokens_seen": 63524048, + "step": 109490 + }, + { + "epoch": 16.308459934465297, + "grad_norm": 3.1861707157077035e-06, + "learning_rate": 5.012309316058555e-06, + "loss": 0.0, + "num_input_tokens_seen": 63526736, + "step": 109495 + }, + { + "epoch": 16.309204647006254, + "grad_norm": 6.290483270277036e-06, + "learning_rate": 5.0103577006133685e-06, + "loss": 0.0, + "num_input_tokens_seen": 63529680, + "step": 109500 + }, + { + "epoch": 16.309949359547215, + "grad_norm": 6.825675882282667e-06, + "learning_rate": 5.008406422872878e-06, + "loss": 0.0, + "num_input_tokens_seen": 63532752, + "step": 109505 + }, + { + "epoch": 16.310694072088175, + "grad_norm": 0.0003472013049758971, + "learning_rate": 5.0064554828700345e-06, + "loss": 0.0, + "num_input_tokens_seen": 63535280, + "step": 109510 + }, + { + "epoch": 16.311438784629132, + "grad_norm": 0.00026219233404845, + "learning_rate": 5.004504880637812e-06, + "loss": 0.0, + "num_input_tokens_seen": 63538704, + "step": 109515 + }, + { + "epoch": 16.312183497170093, + "grad_norm": 1.5267987251281738, + "learning_rate": 5.002554616209157e-06, + "loss": 0.0168, + "num_input_tokens_seen": 63541424, + "step": 109520 + }, + { + "epoch": 16.312928209711053, + "grad_norm": 5.51435150555335e-05, + "learning_rate": 5.000604689617011e-06, + "loss": 0.0, + "num_input_tokens_seen": 63544304, + "step": 109525 + }, + { + "epoch": 16.31367292225201, + "grad_norm": 0.00016820593737065792, + "learning_rate": 4.998655100894328e-06, + "loss": 0.0, + "num_input_tokens_seen": 63547280, + "step": 109530 + }, + { + "epoch": 16.31441763479297, + "grad_norm": 8.287313903565519e-06, + "learning_rate": 4.996705850074041e-06, + "loss": 0.0, + "num_input_tokens_seen": 63549776, + "step": 109535 + }, + { + "epoch": 16.315162347333928, + "grad_norm": 1.0627933079376817e-05, + "learning_rate": 4.994756937189076e-06, + "loss": 0.0, + "num_input_tokens_seen": 63553104, + "step": 109540 + }, + { + "epoch": 16.31590705987489, + "grad_norm": 1.1053512025682721e-05, + "learning_rate": 4.992808362272353e-06, + "loss": 0.0, + "num_input_tokens_seen": 63556336, + "step": 109545 + }, + { + "epoch": 16.31665177241585, + "grad_norm": 6.63766149955336e-06, + "learning_rate": 4.990860125356806e-06, + "loss": 0.0, + "num_input_tokens_seen": 63559152, + "step": 109550 + }, + { + "epoch": 16.317396484956806, + "grad_norm": 3.731825199793093e-05, + "learning_rate": 4.988912226475342e-06, + "loss": 0.0674, + "num_input_tokens_seen": 63561680, + "step": 109555 + }, + { + "epoch": 16.318141197497766, + "grad_norm": 4.505162451096112e-06, + "learning_rate": 4.986964665660859e-06, + "loss": 0.0, + "num_input_tokens_seen": 63564368, + "step": 109560 + }, + { + "epoch": 16.318885910038723, + "grad_norm": 2.4320213469763985e-06, + "learning_rate": 4.985017442946274e-06, + "loss": 0.0, + "num_input_tokens_seen": 63567312, + "step": 109565 + }, + { + "epoch": 16.319630622579684, + "grad_norm": 2.3658240024815314e-06, + "learning_rate": 4.983070558364472e-06, + "loss": 0.0, + "num_input_tokens_seen": 63569904, + "step": 109570 + }, + { + "epoch": 16.320375335120644, + "grad_norm": 0.01869240030646324, + "learning_rate": 4.981124011948355e-06, + "loss": 0.0, + "num_input_tokens_seen": 63572688, + "step": 109575 + }, + { + "epoch": 16.3211200476616, + "grad_norm": 5.7064271459239535e-06, + "learning_rate": 4.979177803730794e-06, + "loss": 0.0, + "num_input_tokens_seen": 63575536, + "step": 109580 + }, + { + "epoch": 16.321864760202562, + "grad_norm": 0.00010020407353295013, + "learning_rate": 4.9772319337446835e-06, + "loss": 0.0, + "num_input_tokens_seen": 63578192, + "step": 109585 + }, + { + "epoch": 16.322609472743522, + "grad_norm": 3.03753399848938, + "learning_rate": 4.975286402022883e-06, + "loss": 0.0074, + "num_input_tokens_seen": 63581200, + "step": 109590 + }, + { + "epoch": 16.32335418528448, + "grad_norm": 4.6011704398551956e-05, + "learning_rate": 4.973341208598273e-06, + "loss": 0.0, + "num_input_tokens_seen": 63584048, + "step": 109595 + }, + { + "epoch": 16.32409889782544, + "grad_norm": 0.00016802742902655154, + "learning_rate": 4.971396353503707e-06, + "loss": 0.0004, + "num_input_tokens_seen": 63587152, + "step": 109600 + }, + { + "epoch": 16.324843610366397, + "grad_norm": 4.352501855464652e-05, + "learning_rate": 4.969451836772046e-06, + "loss": 0.0, + "num_input_tokens_seen": 63589744, + "step": 109605 + }, + { + "epoch": 16.325588322907358, + "grad_norm": 7.989824553078506e-06, + "learning_rate": 4.9675076584361355e-06, + "loss": 0.0, + "num_input_tokens_seen": 63592528, + "step": 109610 + }, + { + "epoch": 16.326333035448318, + "grad_norm": 1.5417281247209758e-05, + "learning_rate": 4.965563818528818e-06, + "loss": 0.0, + "num_input_tokens_seen": 63595088, + "step": 109615 + }, + { + "epoch": 16.327077747989275, + "grad_norm": 1.5193380932032596e-05, + "learning_rate": 4.9636203170829424e-06, + "loss": 0.0, + "num_input_tokens_seen": 63598064, + "step": 109620 + }, + { + "epoch": 16.327822460530236, + "grad_norm": 5.837910975969862e-06, + "learning_rate": 4.9616771541313335e-06, + "loss": 0.0, + "num_input_tokens_seen": 63601040, + "step": 109625 + }, + { + "epoch": 16.328567173071196, + "grad_norm": 3.072580511798151e-05, + "learning_rate": 4.9597343297068274e-06, + "loss": 0.0, + "num_input_tokens_seen": 63603696, + "step": 109630 + }, + { + "epoch": 16.329311885612153, + "grad_norm": 1.500360758655006e-05, + "learning_rate": 4.957791843842244e-06, + "loss": 0.0, + "num_input_tokens_seen": 63606352, + "step": 109635 + }, + { + "epoch": 16.330056598153114, + "grad_norm": 0.0011294063879176974, + "learning_rate": 4.955849696570392e-06, + "loss": 0.0, + "num_input_tokens_seen": 63609168, + "step": 109640 + }, + { + "epoch": 16.33080131069407, + "grad_norm": 0.00015649093256797642, + "learning_rate": 4.953907887924089e-06, + "loss": 0.0, + "num_input_tokens_seen": 63611984, + "step": 109645 + }, + { + "epoch": 16.33154602323503, + "grad_norm": 2.188604776165448e-05, + "learning_rate": 4.9519664179361355e-06, + "loss": 0.0, + "num_input_tokens_seen": 63614864, + "step": 109650 + }, + { + "epoch": 16.33229073577599, + "grad_norm": 1.846557279350236e-05, + "learning_rate": 4.95002528663934e-06, + "loss": 0.0, + "num_input_tokens_seen": 63617904, + "step": 109655 + }, + { + "epoch": 16.33303544831695, + "grad_norm": 0.0015441708965227008, + "learning_rate": 4.948084494066482e-06, + "loss": 0.0852, + "num_input_tokens_seen": 63620880, + "step": 109660 + }, + { + "epoch": 16.33378016085791, + "grad_norm": 0.0005011851317249238, + "learning_rate": 4.946144040250361e-06, + "loss": 0.0, + "num_input_tokens_seen": 63623728, + "step": 109665 + }, + { + "epoch": 16.33452487339887, + "grad_norm": 3.358517642482184e-05, + "learning_rate": 4.944203925223759e-06, + "loss": 0.0, + "num_input_tokens_seen": 63626736, + "step": 109670 + }, + { + "epoch": 16.335269585939827, + "grad_norm": 2.1088482753839344e-05, + "learning_rate": 4.942264149019446e-06, + "loss": 0.0, + "num_input_tokens_seen": 63629712, + "step": 109675 + }, + { + "epoch": 16.336014298480787, + "grad_norm": 1.064351454260759e-05, + "learning_rate": 4.940324711670194e-06, + "loss": 0.0, + "num_input_tokens_seen": 63632528, + "step": 109680 + }, + { + "epoch": 16.336759011021744, + "grad_norm": 2.417957375655533e-06, + "learning_rate": 4.93838561320876e-06, + "loss": 0.0, + "num_input_tokens_seen": 63635376, + "step": 109685 + }, + { + "epoch": 16.337503723562705, + "grad_norm": 3.4755659726215526e-05, + "learning_rate": 4.93644685366792e-06, + "loss": 0.0, + "num_input_tokens_seen": 63637968, + "step": 109690 + }, + { + "epoch": 16.338248436103665, + "grad_norm": 0.00016437767772004008, + "learning_rate": 4.934508433080412e-06, + "loss": 0.0, + "num_input_tokens_seen": 63640816, + "step": 109695 + }, + { + "epoch": 16.338993148644622, + "grad_norm": 3.1579531878378475e-06, + "learning_rate": 4.932570351478996e-06, + "loss": 0.0, + "num_input_tokens_seen": 63643408, + "step": 109700 + }, + { + "epoch": 16.339737861185583, + "grad_norm": 0.005031523294746876, + "learning_rate": 4.930632608896402e-06, + "loss": 0.0, + "num_input_tokens_seen": 63646416, + "step": 109705 + }, + { + "epoch": 16.34048257372654, + "grad_norm": 1.38209288707003e-05, + "learning_rate": 4.92869520536538e-06, + "loss": 0.0, + "num_input_tokens_seen": 63649136, + "step": 109710 + }, + { + "epoch": 16.3412272862675, + "grad_norm": 6.184043286339147e-06, + "learning_rate": 4.926758140918647e-06, + "loss": 0.0029, + "num_input_tokens_seen": 63652432, + "step": 109715 + }, + { + "epoch": 16.34197199880846, + "grad_norm": 7.68430527386954e-06, + "learning_rate": 4.924821415588937e-06, + "loss": 0.0025, + "num_input_tokens_seen": 63655248, + "step": 109720 + }, + { + "epoch": 16.342716711349418, + "grad_norm": 3.11933399643749e-05, + "learning_rate": 4.922885029408969e-06, + "loss": 0.0, + "num_input_tokens_seen": 63658448, + "step": 109725 + }, + { + "epoch": 16.34346142389038, + "grad_norm": 0.0017777710454538465, + "learning_rate": 4.920948982411444e-06, + "loss": 0.0, + "num_input_tokens_seen": 63661104, + "step": 109730 + }, + { + "epoch": 16.34420613643134, + "grad_norm": 0.010571091435849667, + "learning_rate": 4.919013274629087e-06, + "loss": 0.0, + "num_input_tokens_seen": 63664112, + "step": 109735 + }, + { + "epoch": 16.344950848972296, + "grad_norm": 0.0007749473443254828, + "learning_rate": 4.9170779060945916e-06, + "loss": 0.0001, + "num_input_tokens_seen": 63666736, + "step": 109740 + }, + { + "epoch": 16.345695561513256, + "grad_norm": 4.345101388025796e-06, + "learning_rate": 4.915142876840653e-06, + "loss": 0.0001, + "num_input_tokens_seen": 63669296, + "step": 109745 + }, + { + "epoch": 16.346440274054213, + "grad_norm": 2.743851382547291e-06, + "learning_rate": 4.9132081868999535e-06, + "loss": 0.0, + "num_input_tokens_seen": 63672528, + "step": 109750 + }, + { + "epoch": 16.347184986595174, + "grad_norm": 3.971301794081228e-06, + "learning_rate": 4.911273836305194e-06, + "loss": 0.0, + "num_input_tokens_seen": 63675376, + "step": 109755 + }, + { + "epoch": 16.347929699136134, + "grad_norm": 3.867754458042327e-06, + "learning_rate": 4.909339825089049e-06, + "loss": 0.0, + "num_input_tokens_seen": 63677904, + "step": 109760 + }, + { + "epoch": 16.34867441167709, + "grad_norm": 2.4835737349349074e-05, + "learning_rate": 4.9074061532841774e-06, + "loss": 0.0, + "num_input_tokens_seen": 63680784, + "step": 109765 + }, + { + "epoch": 16.349419124218052, + "grad_norm": 5.507024616235867e-05, + "learning_rate": 4.905472820923265e-06, + "loss": 0.0, + "num_input_tokens_seen": 63683568, + "step": 109770 + }, + { + "epoch": 16.350163836759013, + "grad_norm": 0.00016524968668818474, + "learning_rate": 4.903539828038961e-06, + "loss": 0.0, + "num_input_tokens_seen": 63686448, + "step": 109775 + }, + { + "epoch": 16.35090854929997, + "grad_norm": 0.013121972791850567, + "learning_rate": 4.901607174663933e-06, + "loss": 0.0, + "num_input_tokens_seen": 63689264, + "step": 109780 + }, + { + "epoch": 16.35165326184093, + "grad_norm": 0.00013284718443173915, + "learning_rate": 4.899674860830819e-06, + "loss": 0.0, + "num_input_tokens_seen": 63692208, + "step": 109785 + }, + { + "epoch": 16.352397974381887, + "grad_norm": 1.7661488527664915e-05, + "learning_rate": 4.897742886572274e-06, + "loss": 0.0, + "num_input_tokens_seen": 63695024, + "step": 109790 + }, + { + "epoch": 16.353142686922848, + "grad_norm": 8.262132723757531e-06, + "learning_rate": 4.8958112519209315e-06, + "loss": 0.0, + "num_input_tokens_seen": 63698128, + "step": 109795 + }, + { + "epoch": 16.353887399463808, + "grad_norm": 4.766801430378109e-05, + "learning_rate": 4.8938799569094275e-06, + "loss": 0.0, + "num_input_tokens_seen": 63701008, + "step": 109800 + }, + { + "epoch": 16.354632112004765, + "grad_norm": 6.688809662591666e-05, + "learning_rate": 4.891949001570384e-06, + "loss": 0.0, + "num_input_tokens_seen": 63703888, + "step": 109805 + }, + { + "epoch": 16.355376824545726, + "grad_norm": 6.064326953492127e-06, + "learning_rate": 4.890018385936421e-06, + "loss": 0.0, + "num_input_tokens_seen": 63706672, + "step": 109810 + }, + { + "epoch": 16.356121537086686, + "grad_norm": 1.180286653834628e-05, + "learning_rate": 4.888088110040162e-06, + "loss": 0.0, + "num_input_tokens_seen": 63709456, + "step": 109815 + }, + { + "epoch": 16.356866249627643, + "grad_norm": 0.0023928515147417784, + "learning_rate": 4.88615817391421e-06, + "loss": 0.0, + "num_input_tokens_seen": 63712336, + "step": 109820 + }, + { + "epoch": 16.357610962168604, + "grad_norm": 2.2306373921310296e-06, + "learning_rate": 4.884228577591177e-06, + "loss": 0.0, + "num_input_tokens_seen": 63715216, + "step": 109825 + }, + { + "epoch": 16.35835567470956, + "grad_norm": 7.015599749138346e-06, + "learning_rate": 4.882299321103653e-06, + "loss": 0.0, + "num_input_tokens_seen": 63718032, + "step": 109830 + }, + { + "epoch": 16.35910038725052, + "grad_norm": 8.013677870621905e-06, + "learning_rate": 4.880370404484242e-06, + "loss": 0.0, + "num_input_tokens_seen": 63721072, + "step": 109835 + }, + { + "epoch": 16.35984509979148, + "grad_norm": 3.903114702552557e-06, + "learning_rate": 4.87844182776552e-06, + "loss": 0.0, + "num_input_tokens_seen": 63724048, + "step": 109840 + }, + { + "epoch": 16.36058981233244, + "grad_norm": 0.0002848423318937421, + "learning_rate": 4.87651359098007e-06, + "loss": 0.0, + "num_input_tokens_seen": 63726640, + "step": 109845 + }, + { + "epoch": 16.3613345248734, + "grad_norm": 3.414246020838618e-05, + "learning_rate": 4.874585694160477e-06, + "loss": 0.0, + "num_input_tokens_seen": 63729456, + "step": 109850 + }, + { + "epoch": 16.36207923741436, + "grad_norm": 6.174901955091627e-06, + "learning_rate": 4.872658137339295e-06, + "loss": 0.0, + "num_input_tokens_seen": 63732560, + "step": 109855 + }, + { + "epoch": 16.362823949955317, + "grad_norm": 8.092054486041889e-06, + "learning_rate": 4.870730920549108e-06, + "loss": 0.0, + "num_input_tokens_seen": 63735376, + "step": 109860 + }, + { + "epoch": 16.363568662496277, + "grad_norm": 3.3932919905055314e-05, + "learning_rate": 4.868804043822458e-06, + "loss": 0.0, + "num_input_tokens_seen": 63738640, + "step": 109865 + }, + { + "epoch": 16.364313375037234, + "grad_norm": 5.3017615755379666e-06, + "learning_rate": 4.866877507191908e-06, + "loss": 0.0, + "num_input_tokens_seen": 63741456, + "step": 109870 + }, + { + "epoch": 16.365058087578195, + "grad_norm": 2.5725389605213422e-06, + "learning_rate": 4.864951310689991e-06, + "loss": 0.0, + "num_input_tokens_seen": 63744432, + "step": 109875 + }, + { + "epoch": 16.365802800119155, + "grad_norm": 3.401071808184497e-05, + "learning_rate": 4.863025454349266e-06, + "loss": 0.0, + "num_input_tokens_seen": 63747440, + "step": 109880 + }, + { + "epoch": 16.366547512660112, + "grad_norm": 7.937069312902167e-05, + "learning_rate": 4.861099938202257e-06, + "loss": 0.0, + "num_input_tokens_seen": 63750320, + "step": 109885 + }, + { + "epoch": 16.367292225201073, + "grad_norm": 6.999990091571817e-06, + "learning_rate": 4.859174762281493e-06, + "loss": 0.0, + "num_input_tokens_seen": 63753136, + "step": 109890 + }, + { + "epoch": 16.36803693774203, + "grad_norm": 9.908453648677096e-05, + "learning_rate": 4.857249926619506e-06, + "loss": 0.0, + "num_input_tokens_seen": 63756016, + "step": 109895 + }, + { + "epoch": 16.36878165028299, + "grad_norm": 8.77659931575181e-06, + "learning_rate": 4.855325431248803e-06, + "loss": 0.0, + "num_input_tokens_seen": 63758800, + "step": 109900 + }, + { + "epoch": 16.36952636282395, + "grad_norm": 1.0358667168475222e-05, + "learning_rate": 4.853401276201908e-06, + "loss": 0.0, + "num_input_tokens_seen": 63761680, + "step": 109905 + }, + { + "epoch": 16.370271075364908, + "grad_norm": 5.6622202464495786e-06, + "learning_rate": 4.851477461511317e-06, + "loss": 0.0, + "num_input_tokens_seen": 63764400, + "step": 109910 + }, + { + "epoch": 16.37101578790587, + "grad_norm": 3.6666449432232184e-06, + "learning_rate": 4.84955398720954e-06, + "loss": 0.0, + "num_input_tokens_seen": 63767248, + "step": 109915 + }, + { + "epoch": 16.37176050044683, + "grad_norm": 0.0003383636649232358, + "learning_rate": 4.8476308533290714e-06, + "loss": 0.0, + "num_input_tokens_seen": 63770000, + "step": 109920 + }, + { + "epoch": 16.372505212987786, + "grad_norm": 4.645656645152485e-06, + "learning_rate": 4.8457080599023905e-06, + "loss": 0.0, + "num_input_tokens_seen": 63772816, + "step": 109925 + }, + { + "epoch": 16.373249925528746, + "grad_norm": 4.718584386864677e-05, + "learning_rate": 4.843785606961995e-06, + "loss": 0.0, + "num_input_tokens_seen": 63775856, + "step": 109930 + }, + { + "epoch": 16.373994638069703, + "grad_norm": 3.6403018839337165e-06, + "learning_rate": 4.8418634945403555e-06, + "loss": 0.0, + "num_input_tokens_seen": 63778832, + "step": 109935 + }, + { + "epoch": 16.374739350610664, + "grad_norm": 4.684261148213409e-05, + "learning_rate": 4.839941722669944e-06, + "loss": 0.0, + "num_input_tokens_seen": 63781840, + "step": 109940 + }, + { + "epoch": 16.375484063151625, + "grad_norm": 1.7068645320250653e-05, + "learning_rate": 4.8380202913832215e-06, + "loss": 0.0, + "num_input_tokens_seen": 63784592, + "step": 109945 + }, + { + "epoch": 16.37622877569258, + "grad_norm": 3.91358616980142e-06, + "learning_rate": 4.83609920071266e-06, + "loss": 0.0, + "num_input_tokens_seen": 63787568, + "step": 109950 + }, + { + "epoch": 16.376973488233542, + "grad_norm": 1.4122249922365882e-05, + "learning_rate": 4.834178450690704e-06, + "loss": 0.0, + "num_input_tokens_seen": 63790352, + "step": 109955 + }, + { + "epoch": 16.377718200774503, + "grad_norm": 4.305413767724531e-06, + "learning_rate": 4.832258041349813e-06, + "loss": 0.0, + "num_input_tokens_seen": 63793520, + "step": 109960 + }, + { + "epoch": 16.37846291331546, + "grad_norm": 0.0008364116656593978, + "learning_rate": 4.830337972722424e-06, + "loss": 0.0, + "num_input_tokens_seen": 63796368, + "step": 109965 + }, + { + "epoch": 16.37920762585642, + "grad_norm": 0.00013123609824106097, + "learning_rate": 4.828418244840968e-06, + "loss": 0.0, + "num_input_tokens_seen": 63799280, + "step": 109970 + }, + { + "epoch": 16.379952338397377, + "grad_norm": 4.745001660921844e-06, + "learning_rate": 4.8264988577378934e-06, + "loss": 0.0, + "num_input_tokens_seen": 63802096, + "step": 109975 + }, + { + "epoch": 16.380697050938338, + "grad_norm": 8.352675649803132e-06, + "learning_rate": 4.824579811445609e-06, + "loss": 0.0, + "num_input_tokens_seen": 63804688, + "step": 109980 + }, + { + "epoch": 16.381441763479298, + "grad_norm": 8.83399770827964e-05, + "learning_rate": 4.822661105996551e-06, + "loss": 0.0, + "num_input_tokens_seen": 63807664, + "step": 109985 + }, + { + "epoch": 16.382186476020255, + "grad_norm": 3.743198612937704e-05, + "learning_rate": 4.82074274142312e-06, + "loss": 0.0, + "num_input_tokens_seen": 63810704, + "step": 109990 + }, + { + "epoch": 16.382931188561216, + "grad_norm": 4.021713721158449e-06, + "learning_rate": 4.818824717757736e-06, + "loss": 0.0, + "num_input_tokens_seen": 63813392, + "step": 109995 + }, + { + "epoch": 16.383675901102176, + "grad_norm": 6.0820406361017376e-05, + "learning_rate": 4.816907035032797e-06, + "loss": 0.2532, + "num_input_tokens_seen": 63816496, + "step": 110000 + }, + { + "epoch": 16.384420613643133, + "grad_norm": 0.0026012787129729986, + "learning_rate": 4.814989693280703e-06, + "loss": 0.0, + "num_input_tokens_seen": 63819344, + "step": 110005 + }, + { + "epoch": 16.385165326184094, + "grad_norm": 0.00012531509855762124, + "learning_rate": 4.81307269253384e-06, + "loss": 0.0, + "num_input_tokens_seen": 63822128, + "step": 110010 + }, + { + "epoch": 16.38591003872505, + "grad_norm": 9.16478456929326e-05, + "learning_rate": 4.811156032824593e-06, + "loss": 0.0, + "num_input_tokens_seen": 63825168, + "step": 110015 + }, + { + "epoch": 16.38665475126601, + "grad_norm": 9.591155503585469e-06, + "learning_rate": 4.8092397141853515e-06, + "loss": 0.0, + "num_input_tokens_seen": 63828080, + "step": 110020 + }, + { + "epoch": 16.38739946380697, + "grad_norm": 0.0002878454979509115, + "learning_rate": 4.807323736648475e-06, + "loss": 0.0, + "num_input_tokens_seen": 63830928, + "step": 110025 + }, + { + "epoch": 16.38814417634793, + "grad_norm": 4.8984697059495375e-05, + "learning_rate": 4.80540810024635e-06, + "loss": 0.0, + "num_input_tokens_seen": 63833872, + "step": 110030 + }, + { + "epoch": 16.38888888888889, + "grad_norm": 1.2064157090208028e-05, + "learning_rate": 4.8034928050113256e-06, + "loss": 0.0, + "num_input_tokens_seen": 63836688, + "step": 110035 + }, + { + "epoch": 16.38963360142985, + "grad_norm": 3.5113371268380433e-06, + "learning_rate": 4.8015778509757665e-06, + "loss": 0.0, + "num_input_tokens_seen": 63839376, + "step": 110040 + }, + { + "epoch": 16.390378313970807, + "grad_norm": 0.030457304790616035, + "learning_rate": 4.799663238172022e-06, + "loss": 0.0, + "num_input_tokens_seen": 63842160, + "step": 110045 + }, + { + "epoch": 16.391123026511767, + "grad_norm": 4.338319740782026e-06, + "learning_rate": 4.7977489666324285e-06, + "loss": 0.0, + "num_input_tokens_seen": 63844976, + "step": 110050 + }, + { + "epoch": 16.391867739052724, + "grad_norm": 0.000707295082975179, + "learning_rate": 4.7958350363893424e-06, + "loss": 0.0, + "num_input_tokens_seen": 63847760, + "step": 110055 + }, + { + "epoch": 16.392612451593685, + "grad_norm": 1.9562210582080297e-05, + "learning_rate": 4.793921447475083e-06, + "loss": 0.0097, + "num_input_tokens_seen": 63850704, + "step": 110060 + }, + { + "epoch": 16.393357164134645, + "grad_norm": 6.110693357186392e-05, + "learning_rate": 4.7920081999219875e-06, + "loss": 0.0, + "num_input_tokens_seen": 63854096, + "step": 110065 + }, + { + "epoch": 16.394101876675602, + "grad_norm": 3.0308090117614483e-06, + "learning_rate": 4.790095293762379e-06, + "loss": 0.0, + "num_input_tokens_seen": 63856848, + "step": 110070 + }, + { + "epoch": 16.394846589216563, + "grad_norm": 7.407119028357556e-06, + "learning_rate": 4.788182729028565e-06, + "loss": 0.0, + "num_input_tokens_seen": 63859984, + "step": 110075 + }, + { + "epoch": 16.39559130175752, + "grad_norm": 2.401611482127919e-06, + "learning_rate": 4.786270505752866e-06, + "loss": 0.0, + "num_input_tokens_seen": 63862768, + "step": 110080 + }, + { + "epoch": 16.39633601429848, + "grad_norm": 7.447817188221961e-05, + "learning_rate": 4.784358623967572e-06, + "loss": 0.0, + "num_input_tokens_seen": 63865712, + "step": 110085 + }, + { + "epoch": 16.39708072683944, + "grad_norm": 7.1779331847210415e-06, + "learning_rate": 4.782447083705002e-06, + "loss": 0.0, + "num_input_tokens_seen": 63868688, + "step": 110090 + }, + { + "epoch": 16.397825439380398, + "grad_norm": 0.0002540929417591542, + "learning_rate": 4.780535884997433e-06, + "loss": 0.0, + "num_input_tokens_seen": 63871952, + "step": 110095 + }, + { + "epoch": 16.39857015192136, + "grad_norm": 0.0006537502049468458, + "learning_rate": 4.7786250278771675e-06, + "loss": 0.0, + "num_input_tokens_seen": 63874608, + "step": 110100 + }, + { + "epoch": 16.39931486446232, + "grad_norm": 6.280086381593719e-06, + "learning_rate": 4.776714512376474e-06, + "loss": 0.0, + "num_input_tokens_seen": 63877616, + "step": 110105 + }, + { + "epoch": 16.400059577003276, + "grad_norm": 1.6794001567177474e-05, + "learning_rate": 4.774804338527639e-06, + "loss": 0.0, + "num_input_tokens_seen": 63880560, + "step": 110110 + }, + { + "epoch": 16.400804289544237, + "grad_norm": 9.477786079514772e-06, + "learning_rate": 4.772894506362924e-06, + "loss": 0.0, + "num_input_tokens_seen": 63883504, + "step": 110115 + }, + { + "epoch": 16.401549002085194, + "grad_norm": 1.2056569175911136e-05, + "learning_rate": 4.770985015914603e-06, + "loss": 0.0, + "num_input_tokens_seen": 63886352, + "step": 110120 + }, + { + "epoch": 16.402293714626154, + "grad_norm": 0.00018597726011648774, + "learning_rate": 4.769075867214931e-06, + "loss": 0.0, + "num_input_tokens_seen": 63889264, + "step": 110125 + }, + { + "epoch": 16.403038427167115, + "grad_norm": 6.938510523468722e-06, + "learning_rate": 4.767167060296163e-06, + "loss": 0.0, + "num_input_tokens_seen": 63891824, + "step": 110130 + }, + { + "epoch": 16.40378313970807, + "grad_norm": 2.2736701794201508e-05, + "learning_rate": 4.7652585951905415e-06, + "loss": 0.0, + "num_input_tokens_seen": 63894608, + "step": 110135 + }, + { + "epoch": 16.404527852249032, + "grad_norm": 2.2237469238461927e-05, + "learning_rate": 4.763350471930303e-06, + "loss": 0.0, + "num_input_tokens_seen": 63897552, + "step": 110140 + }, + { + "epoch": 16.405272564789993, + "grad_norm": 3.2623370316287037e-06, + "learning_rate": 4.761442690547699e-06, + "loss": 0.0, + "num_input_tokens_seen": 63900400, + "step": 110145 + }, + { + "epoch": 16.40601727733095, + "grad_norm": 1.1028393601009157e-05, + "learning_rate": 4.759535251074942e-06, + "loss": 0.0, + "num_input_tokens_seen": 63903248, + "step": 110150 + }, + { + "epoch": 16.40676198987191, + "grad_norm": 7.312445814022794e-05, + "learning_rate": 4.7576281535442745e-06, + "loss": 0.0004, + "num_input_tokens_seen": 63906320, + "step": 110155 + }, + { + "epoch": 16.407506702412867, + "grad_norm": 0.00015094137052074075, + "learning_rate": 4.755721397987906e-06, + "loss": 0.0, + "num_input_tokens_seen": 63909360, + "step": 110160 + }, + { + "epoch": 16.408251414953828, + "grad_norm": 3.708132862811908e-05, + "learning_rate": 4.753814984438043e-06, + "loss": 0.0, + "num_input_tokens_seen": 63912112, + "step": 110165 + }, + { + "epoch": 16.408996127494788, + "grad_norm": 4.757827991852537e-05, + "learning_rate": 4.7519089129269026e-06, + "loss": 0.0, + "num_input_tokens_seen": 63915088, + "step": 110170 + }, + { + "epoch": 16.409740840035745, + "grad_norm": 4.144244485360105e-06, + "learning_rate": 4.750003183486676e-06, + "loss": 0.0, + "num_input_tokens_seen": 63918000, + "step": 110175 + }, + { + "epoch": 16.410485552576706, + "grad_norm": 0.00148571259342134, + "learning_rate": 4.748097796149573e-06, + "loss": 0.0, + "num_input_tokens_seen": 63920816, + "step": 110180 + }, + { + "epoch": 16.411230265117666, + "grad_norm": 2.35859783970227e-06, + "learning_rate": 4.746192750947767e-06, + "loss": 0.0, + "num_input_tokens_seen": 63923664, + "step": 110185 + }, + { + "epoch": 16.411974977658623, + "grad_norm": 2.669133186340332, + "learning_rate": 4.744288047913456e-06, + "loss": 0.0063, + "num_input_tokens_seen": 63926704, + "step": 110190 + }, + { + "epoch": 16.412719690199584, + "grad_norm": 3.90071545552928e-06, + "learning_rate": 4.742383687078811e-06, + "loss": 0.0, + "num_input_tokens_seen": 63929392, + "step": 110195 + }, + { + "epoch": 16.41346440274054, + "grad_norm": 9.647307160776109e-05, + "learning_rate": 4.7404796684760055e-06, + "loss": 0.0, + "num_input_tokens_seen": 63932336, + "step": 110200 + }, + { + "epoch": 16.4142091152815, + "grad_norm": 6.986189873714466e-06, + "learning_rate": 4.738575992137203e-06, + "loss": 0.0, + "num_input_tokens_seen": 63935280, + "step": 110205 + }, + { + "epoch": 16.414953827822462, + "grad_norm": 9.983852578443475e-06, + "learning_rate": 4.736672658094562e-06, + "loss": 0.0, + "num_input_tokens_seen": 63938192, + "step": 110210 + }, + { + "epoch": 16.41569854036342, + "grad_norm": 0.00010867160017369315, + "learning_rate": 4.734769666380248e-06, + "loss": 0.0, + "num_input_tokens_seen": 63941168, + "step": 110215 + }, + { + "epoch": 16.41644325290438, + "grad_norm": 1.0421796105219983e-05, + "learning_rate": 4.732867017026396e-06, + "loss": 0.0, + "num_input_tokens_seen": 63944112, + "step": 110220 + }, + { + "epoch": 16.417187965445336, + "grad_norm": 1.2042699381709099e-05, + "learning_rate": 4.730964710065164e-06, + "loss": 0.0002, + "num_input_tokens_seen": 63946960, + "step": 110225 + }, + { + "epoch": 16.417932677986297, + "grad_norm": 1.1671627362375148e-05, + "learning_rate": 4.729062745528678e-06, + "loss": 0.0, + "num_input_tokens_seen": 63950192, + "step": 110230 + }, + { + "epoch": 16.418677390527257, + "grad_norm": 2.4844330255291425e-06, + "learning_rate": 4.727161123449078e-06, + "loss": 0.0, + "num_input_tokens_seen": 63953200, + "step": 110235 + }, + { + "epoch": 16.419422103068214, + "grad_norm": 5.486940608534496e-06, + "learning_rate": 4.72525984385849e-06, + "loss": 0.0, + "num_input_tokens_seen": 63956048, + "step": 110240 + }, + { + "epoch": 16.420166815609175, + "grad_norm": 0.0002010926982620731, + "learning_rate": 4.7233589067890215e-06, + "loss": 0.0, + "num_input_tokens_seen": 63959216, + "step": 110245 + }, + { + "epoch": 16.420911528150135, + "grad_norm": 2.4980505259009078e-05, + "learning_rate": 4.721458312272803e-06, + "loss": 0.0, + "num_input_tokens_seen": 63962224, + "step": 110250 + }, + { + "epoch": 16.421656240691092, + "grad_norm": 6.1853193074057344e-06, + "learning_rate": 4.719558060341931e-06, + "loss": 0.0, + "num_input_tokens_seen": 63965296, + "step": 110255 + }, + { + "epoch": 16.422400953232053, + "grad_norm": 0.00024030174245126545, + "learning_rate": 4.717658151028517e-06, + "loss": 0.0, + "num_input_tokens_seen": 63968592, + "step": 110260 + }, + { + "epoch": 16.42314566577301, + "grad_norm": 1.3983323697175365e-05, + "learning_rate": 4.715758584364657e-06, + "loss": 0.0, + "num_input_tokens_seen": 63971504, + "step": 110265 + }, + { + "epoch": 16.42389037831397, + "grad_norm": 131.6249237060547, + "learning_rate": 4.713859360382439e-06, + "loss": 0.0173, + "num_input_tokens_seen": 63974224, + "step": 110270 + }, + { + "epoch": 16.42463509085493, + "grad_norm": 1.728664392430801e-05, + "learning_rate": 4.7119604791139414e-06, + "loss": 0.0, + "num_input_tokens_seen": 63977392, + "step": 110275 + }, + { + "epoch": 16.425379803395888, + "grad_norm": 1.8375787476543337e-05, + "learning_rate": 4.7100619405912625e-06, + "loss": 0.0, + "num_input_tokens_seen": 63980368, + "step": 110280 + }, + { + "epoch": 16.42612451593685, + "grad_norm": 3.5618668334791437e-06, + "learning_rate": 4.708163744846461e-06, + "loss": 0.0, + "num_input_tokens_seen": 63983056, + "step": 110285 + }, + { + "epoch": 16.42686922847781, + "grad_norm": 2.1048899725428782e-06, + "learning_rate": 4.706265891911604e-06, + "loss": 0.0, + "num_input_tokens_seen": 63985840, + "step": 110290 + }, + { + "epoch": 16.427613941018766, + "grad_norm": 9.314316230302211e-06, + "learning_rate": 4.704368381818766e-06, + "loss": 0.0, + "num_input_tokens_seen": 63988816, + "step": 110295 + }, + { + "epoch": 16.428358653559727, + "grad_norm": 1.453892582503613e-05, + "learning_rate": 4.70247121459999e-06, + "loss": 0.0, + "num_input_tokens_seen": 63991568, + "step": 110300 + }, + { + "epoch": 16.429103366100684, + "grad_norm": 3.277121095379698e-06, + "learning_rate": 4.700574390287341e-06, + "loss": 0.0, + "num_input_tokens_seen": 63994352, + "step": 110305 + }, + { + "epoch": 16.429848078641644, + "grad_norm": 6.410439527826384e-05, + "learning_rate": 4.698677908912846e-06, + "loss": 0.0, + "num_input_tokens_seen": 63997296, + "step": 110310 + }, + { + "epoch": 16.430592791182605, + "grad_norm": 1.2418494407029357e-05, + "learning_rate": 4.696781770508566e-06, + "loss": 0.0, + "num_input_tokens_seen": 63999984, + "step": 110315 + }, + { + "epoch": 16.43133750372356, + "grad_norm": 8.035982318688184e-05, + "learning_rate": 4.694885975106511e-06, + "loss": 0.0, + "num_input_tokens_seen": 64002992, + "step": 110320 + }, + { + "epoch": 16.432082216264522, + "grad_norm": 3.283011210442055e-06, + "learning_rate": 4.6929905227387295e-06, + "loss": 0.0, + "num_input_tokens_seen": 64005840, + "step": 110325 + }, + { + "epoch": 16.432826928805483, + "grad_norm": 7.610574812133564e-06, + "learning_rate": 4.691095413437235e-06, + "loss": 0.0, + "num_input_tokens_seen": 64008688, + "step": 110330 + }, + { + "epoch": 16.43357164134644, + "grad_norm": 0.9236194491386414, + "learning_rate": 4.6892006472340405e-06, + "loss": 0.0005, + "num_input_tokens_seen": 64011472, + "step": 110335 + }, + { + "epoch": 16.4343163538874, + "grad_norm": 0.00011554967204574496, + "learning_rate": 4.687306224161159e-06, + "loss": 0.0, + "num_input_tokens_seen": 64014672, + "step": 110340 + }, + { + "epoch": 16.435061066428357, + "grad_norm": 5.214518751017749e-06, + "learning_rate": 4.685412144250586e-06, + "loss": 0.0, + "num_input_tokens_seen": 64017360, + "step": 110345 + }, + { + "epoch": 16.435805778969318, + "grad_norm": 3.681469706862117e-06, + "learning_rate": 4.683518407534338e-06, + "loss": 0.0, + "num_input_tokens_seen": 64020304, + "step": 110350 + }, + { + "epoch": 16.43655049151028, + "grad_norm": 4.056216312164906e-06, + "learning_rate": 4.6816250140443884e-06, + "loss": 0.0, + "num_input_tokens_seen": 64023504, + "step": 110355 + }, + { + "epoch": 16.437295204051235, + "grad_norm": 4.737563813250745e-06, + "learning_rate": 4.679731963812742e-06, + "loss": 0.0, + "num_input_tokens_seen": 64026544, + "step": 110360 + }, + { + "epoch": 16.438039916592196, + "grad_norm": 0.019357955083251, + "learning_rate": 4.6778392568713695e-06, + "loss": 0.0, + "num_input_tokens_seen": 64029232, + "step": 110365 + }, + { + "epoch": 16.438784629133156, + "grad_norm": 7.865840598242357e-05, + "learning_rate": 4.675946893252242e-06, + "loss": 0.0, + "num_input_tokens_seen": 64032432, + "step": 110370 + }, + { + "epoch": 16.439529341674113, + "grad_norm": 1.7212073544214945e-06, + "learning_rate": 4.674054872987344e-06, + "loss": 0.0, + "num_input_tokens_seen": 64035376, + "step": 110375 + }, + { + "epoch": 16.440274054215074, + "grad_norm": 0.0012978706508874893, + "learning_rate": 4.67216319610862e-06, + "loss": 0.0, + "num_input_tokens_seen": 64038384, + "step": 110380 + }, + { + "epoch": 16.44101876675603, + "grad_norm": 2.3904096451587975e-05, + "learning_rate": 4.670271862648049e-06, + "loss": 0.0, + "num_input_tokens_seen": 64041520, + "step": 110385 + }, + { + "epoch": 16.44176347929699, + "grad_norm": 0.0002720931370276958, + "learning_rate": 4.668380872637562e-06, + "loss": 0.0, + "num_input_tokens_seen": 64044080, + "step": 110390 + }, + { + "epoch": 16.442508191837952, + "grad_norm": 6.7305468292033765e-06, + "learning_rate": 4.666490226109127e-06, + "loss": 0.0, + "num_input_tokens_seen": 64046832, + "step": 110395 + }, + { + "epoch": 16.44325290437891, + "grad_norm": 6.496963032986969e-05, + "learning_rate": 4.66459992309467e-06, + "loss": 0.0, + "num_input_tokens_seen": 64049392, + "step": 110400 + }, + { + "epoch": 16.44399761691987, + "grad_norm": 2.1247880795272067e-05, + "learning_rate": 4.662709963626133e-06, + "loss": 0.0, + "num_input_tokens_seen": 64052144, + "step": 110405 + }, + { + "epoch": 16.44474232946083, + "grad_norm": 4.875723971053958e-05, + "learning_rate": 4.660820347735437e-06, + "loss": 0.1407, + "num_input_tokens_seen": 64055024, + "step": 110410 + }, + { + "epoch": 16.445487042001787, + "grad_norm": 1.006729326036293e-05, + "learning_rate": 4.658931075454507e-06, + "loss": 0.0, + "num_input_tokens_seen": 64057936, + "step": 110415 + }, + { + "epoch": 16.446231754542747, + "grad_norm": 0.026189446449279785, + "learning_rate": 4.657042146815266e-06, + "loss": 0.0, + "num_input_tokens_seen": 64060656, + "step": 110420 + }, + { + "epoch": 16.446976467083704, + "grad_norm": 2.691431473067496e-05, + "learning_rate": 4.655153561849618e-06, + "loss": 0.0, + "num_input_tokens_seen": 64063568, + "step": 110425 + }, + { + "epoch": 16.447721179624665, + "grad_norm": 0.00012425666500348598, + "learning_rate": 4.6532653205894786e-06, + "loss": 0.0, + "num_input_tokens_seen": 64066288, + "step": 110430 + }, + { + "epoch": 16.448465892165625, + "grad_norm": 4.532470029516844e-06, + "learning_rate": 4.651377423066736e-06, + "loss": 0.0004, + "num_input_tokens_seen": 64069392, + "step": 110435 + }, + { + "epoch": 16.449210604706582, + "grad_norm": 0.00036553165409713984, + "learning_rate": 4.649489869313295e-06, + "loss": 0.0, + "num_input_tokens_seen": 64072400, + "step": 110440 + }, + { + "epoch": 16.449955317247543, + "grad_norm": 2.1988258595229127e-05, + "learning_rate": 4.647602659361042e-06, + "loss": 0.0, + "num_input_tokens_seen": 64075440, + "step": 110445 + }, + { + "epoch": 16.4507000297885, + "grad_norm": 1.2273285392438993e-05, + "learning_rate": 4.645715793241848e-06, + "loss": 0.0, + "num_input_tokens_seen": 64078192, + "step": 110450 + }, + { + "epoch": 16.45144474232946, + "grad_norm": 3.033210077774129e-06, + "learning_rate": 4.6438292709876065e-06, + "loss": 0.0, + "num_input_tokens_seen": 64081264, + "step": 110455 + }, + { + "epoch": 16.45218945487042, + "grad_norm": 2.2731468561687507e-05, + "learning_rate": 4.64194309263018e-06, + "loss": 0.0, + "num_input_tokens_seen": 64084208, + "step": 110460 + }, + { + "epoch": 16.452934167411378, + "grad_norm": 3.6026265206601238e-06, + "learning_rate": 4.6400572582014325e-06, + "loss": 0.0, + "num_input_tokens_seen": 64086896, + "step": 110465 + }, + { + "epoch": 16.45367887995234, + "grad_norm": 0.0017096201190724969, + "learning_rate": 4.638171767733221e-06, + "loss": 0.0, + "num_input_tokens_seen": 64089936, + "step": 110470 + }, + { + "epoch": 16.4544235924933, + "grad_norm": 1.3895694792154245e-05, + "learning_rate": 4.636286621257407e-06, + "loss": 0.0148, + "num_input_tokens_seen": 64092848, + "step": 110475 + }, + { + "epoch": 16.455168305034256, + "grad_norm": 0.00017349908011965454, + "learning_rate": 4.634401818805828e-06, + "loss": 0.0, + "num_input_tokens_seen": 64096112, + "step": 110480 + }, + { + "epoch": 16.455913017575217, + "grad_norm": 0.0006268041906878352, + "learning_rate": 4.632517360410338e-06, + "loss": 0.0, + "num_input_tokens_seen": 64098928, + "step": 110485 + }, + { + "epoch": 16.456657730116174, + "grad_norm": 0.00010880573972826824, + "learning_rate": 4.630633246102767e-06, + "loss": 0.0, + "num_input_tokens_seen": 64101776, + "step": 110490 + }, + { + "epoch": 16.457402442657134, + "grad_norm": 3.1023103019833798e-06, + "learning_rate": 4.62874947591494e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64104784, + "step": 110495 + }, + { + "epoch": 16.458147155198095, + "grad_norm": 5.731010332965525e-06, + "learning_rate": 4.62686604987869e-06, + "loss": 0.0, + "num_input_tokens_seen": 64107920, + "step": 110500 + }, + { + "epoch": 16.45889186773905, + "grad_norm": 6.719802058796631e-06, + "learning_rate": 4.624982968025826e-06, + "loss": 0.0, + "num_input_tokens_seen": 64110864, + "step": 110505 + }, + { + "epoch": 16.459636580280012, + "grad_norm": 3.378377641638508e-06, + "learning_rate": 4.623100230388172e-06, + "loss": 0.0, + "num_input_tokens_seen": 64113584, + "step": 110510 + }, + { + "epoch": 16.460381292820973, + "grad_norm": 7.178343366831541e-06, + "learning_rate": 4.621217836997524e-06, + "loss": 0.0, + "num_input_tokens_seen": 64116624, + "step": 110515 + }, + { + "epoch": 16.46112600536193, + "grad_norm": 2.9030488803982735e-05, + "learning_rate": 4.619335787885695e-06, + "loss": 0.0, + "num_input_tokens_seen": 64119408, + "step": 110520 + }, + { + "epoch": 16.46187071790289, + "grad_norm": 6.9351985985122155e-06, + "learning_rate": 4.617454083084474e-06, + "loss": 0.0, + "num_input_tokens_seen": 64122256, + "step": 110525 + }, + { + "epoch": 16.462615430443847, + "grad_norm": 1.2560046343423892e-05, + "learning_rate": 4.615572722625649e-06, + "loss": 0.0, + "num_input_tokens_seen": 64124944, + "step": 110530 + }, + { + "epoch": 16.463360142984808, + "grad_norm": 4.300172804505564e-06, + "learning_rate": 4.6136917065410065e-06, + "loss": 0.0, + "num_input_tokens_seen": 64127824, + "step": 110535 + }, + { + "epoch": 16.46410485552577, + "grad_norm": 0.0001297134585911408, + "learning_rate": 4.611811034862318e-06, + "loss": 0.0, + "num_input_tokens_seen": 64130640, + "step": 110540 + }, + { + "epoch": 16.464849568066725, + "grad_norm": 8.743204671191052e-05, + "learning_rate": 4.609930707621366e-06, + "loss": 0.004, + "num_input_tokens_seen": 64133584, + "step": 110545 + }, + { + "epoch": 16.465594280607686, + "grad_norm": 0.00098686502315104, + "learning_rate": 4.608050724849902e-06, + "loss": 0.0, + "num_input_tokens_seen": 64136624, + "step": 110550 + }, + { + "epoch": 16.466338993148646, + "grad_norm": 0.0020431678276509047, + "learning_rate": 4.6061710865797055e-06, + "loss": 0.0, + "num_input_tokens_seen": 64139824, + "step": 110555 + }, + { + "epoch": 16.467083705689603, + "grad_norm": 1.0767233106889762e-05, + "learning_rate": 4.604291792842513e-06, + "loss": 0.0, + "num_input_tokens_seen": 64142608, + "step": 110560 + }, + { + "epoch": 16.467828418230564, + "grad_norm": 5.8449468269827776e-06, + "learning_rate": 4.602412843670087e-06, + "loss": 0.0, + "num_input_tokens_seen": 64145392, + "step": 110565 + }, + { + "epoch": 16.46857313077152, + "grad_norm": 4.237367829773575e-05, + "learning_rate": 4.600534239094165e-06, + "loss": 0.0, + "num_input_tokens_seen": 64148176, + "step": 110570 + }, + { + "epoch": 16.46931784331248, + "grad_norm": 0.0001988841249840334, + "learning_rate": 4.598655979146479e-06, + "loss": 0.0, + "num_input_tokens_seen": 64150992, + "step": 110575 + }, + { + "epoch": 16.470062555853442, + "grad_norm": 5.3747107813251205e-06, + "learning_rate": 4.59677806385877e-06, + "loss": 0.0, + "num_input_tokens_seen": 64153936, + "step": 110580 + }, + { + "epoch": 16.4708072683944, + "grad_norm": 7.66030734666856e-06, + "learning_rate": 4.5949004932627545e-06, + "loss": 0.0, + "num_input_tokens_seen": 64156848, + "step": 110585 + }, + { + "epoch": 16.47155198093536, + "grad_norm": 2.789383415802149e-06, + "learning_rate": 4.593023267390162e-06, + "loss": 0.0, + "num_input_tokens_seen": 64160016, + "step": 110590 + }, + { + "epoch": 16.472296693476316, + "grad_norm": 1.3691293133888394e-05, + "learning_rate": 4.591146386272699e-06, + "loss": 0.0, + "num_input_tokens_seen": 64162864, + "step": 110595 + }, + { + "epoch": 16.473041406017277, + "grad_norm": 0.000822798814624548, + "learning_rate": 4.5892698499420764e-06, + "loss": 0.0, + "num_input_tokens_seen": 64165840, + "step": 110600 + }, + { + "epoch": 16.473786118558237, + "grad_norm": 3.5913919873564737e-06, + "learning_rate": 4.5873936584299946e-06, + "loss": 0.0, + "num_input_tokens_seen": 64168784, + "step": 110605 + }, + { + "epoch": 16.474530831099194, + "grad_norm": 1.9255247025284916e-05, + "learning_rate": 4.5855178117681444e-06, + "loss": 0.0, + "num_input_tokens_seen": 64171728, + "step": 110610 + }, + { + "epoch": 16.475275543640155, + "grad_norm": 0.029754895716905594, + "learning_rate": 4.583642309988229e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64174672, + "step": 110615 + }, + { + "epoch": 16.476020256181116, + "grad_norm": 4.306328628445044e-05, + "learning_rate": 4.581767153121922e-06, + "loss": 0.0, + "num_input_tokens_seen": 64177648, + "step": 110620 + }, + { + "epoch": 16.476764968722073, + "grad_norm": 1.1961662494286429e-05, + "learning_rate": 4.579892341200911e-06, + "loss": 0.0, + "num_input_tokens_seen": 64180528, + "step": 110625 + }, + { + "epoch": 16.477509681263033, + "grad_norm": 0.00016723618318792433, + "learning_rate": 4.578017874256857e-06, + "loss": 0.0, + "num_input_tokens_seen": 64183312, + "step": 110630 + }, + { + "epoch": 16.47825439380399, + "grad_norm": 8.7448199337814e-05, + "learning_rate": 4.5761437523214435e-06, + "loss": 0.0, + "num_input_tokens_seen": 64186160, + "step": 110635 + }, + { + "epoch": 16.47899910634495, + "grad_norm": 1.6364248949685134e-05, + "learning_rate": 4.574269975426318e-06, + "loss": 0.0, + "num_input_tokens_seen": 64189008, + "step": 110640 + }, + { + "epoch": 16.47974381888591, + "grad_norm": 1.4186370208335575e-05, + "learning_rate": 4.572396543603147e-06, + "loss": 0.0, + "num_input_tokens_seen": 64191824, + "step": 110645 + }, + { + "epoch": 16.480488531426868, + "grad_norm": 4.3112345338158775e-06, + "learning_rate": 4.570523456883574e-06, + "loss": 0.0, + "num_input_tokens_seen": 64194736, + "step": 110650 + }, + { + "epoch": 16.48123324396783, + "grad_norm": 2.9544207791332155e-06, + "learning_rate": 4.568650715299236e-06, + "loss": 0.0, + "num_input_tokens_seen": 64197392, + "step": 110655 + }, + { + "epoch": 16.48197795650879, + "grad_norm": 9.105406206799671e-06, + "learning_rate": 4.566778318881787e-06, + "loss": 0.0, + "num_input_tokens_seen": 64200432, + "step": 110660 + }, + { + "epoch": 16.482722669049746, + "grad_norm": 0.0001787135552149266, + "learning_rate": 4.56490626766285e-06, + "loss": 0.0, + "num_input_tokens_seen": 64203568, + "step": 110665 + }, + { + "epoch": 16.483467381590707, + "grad_norm": 6.299303095147479e-06, + "learning_rate": 4.563034561674054e-06, + "loss": 0.0, + "num_input_tokens_seen": 64206960, + "step": 110670 + }, + { + "epoch": 16.484212094131664, + "grad_norm": 6.740695971529931e-05, + "learning_rate": 4.561163200947008e-06, + "loss": 0.0, + "num_input_tokens_seen": 64209936, + "step": 110675 + }, + { + "epoch": 16.484956806672624, + "grad_norm": 1.2354274076642469e-05, + "learning_rate": 4.559292185513347e-06, + "loss": 0.0, + "num_input_tokens_seen": 64212784, + "step": 110680 + }, + { + "epoch": 16.485701519213585, + "grad_norm": 0.0001838480238802731, + "learning_rate": 4.557421515404667e-06, + "loss": 0.0, + "num_input_tokens_seen": 64215568, + "step": 110685 + }, + { + "epoch": 16.48644623175454, + "grad_norm": 381.0152282714844, + "learning_rate": 4.555551190652568e-06, + "loss": 0.0189, + "num_input_tokens_seen": 64218448, + "step": 110690 + }, + { + "epoch": 16.487190944295502, + "grad_norm": 0.00026483647525310516, + "learning_rate": 4.55368121128866e-06, + "loss": 0.0, + "num_input_tokens_seen": 64221360, + "step": 110695 + }, + { + "epoch": 16.487935656836463, + "grad_norm": 3.3283163247688208e-06, + "learning_rate": 4.55181157734452e-06, + "loss": 0.0, + "num_input_tokens_seen": 64224208, + "step": 110700 + }, + { + "epoch": 16.48868036937742, + "grad_norm": 1.0267341167491395e-05, + "learning_rate": 4.549942288851747e-06, + "loss": 0.0, + "num_input_tokens_seen": 64226992, + "step": 110705 + }, + { + "epoch": 16.48942508191838, + "grad_norm": 1.9111125766357873e-06, + "learning_rate": 4.5480733458419074e-06, + "loss": 0.0645, + "num_input_tokens_seen": 64230032, + "step": 110710 + }, + { + "epoch": 16.490169794459337, + "grad_norm": 0.00011638436262728646, + "learning_rate": 4.5462047483465886e-06, + "loss": 0.0774, + "num_input_tokens_seen": 64233040, + "step": 110715 + }, + { + "epoch": 16.490914507000298, + "grad_norm": 0.00010277607361786067, + "learning_rate": 4.5443364963973475e-06, + "loss": 0.0, + "num_input_tokens_seen": 64236016, + "step": 110720 + }, + { + "epoch": 16.49165921954126, + "grad_norm": 5.023818630434107e-06, + "learning_rate": 4.542468590025756e-06, + "loss": 0.0, + "num_input_tokens_seen": 64238640, + "step": 110725 + }, + { + "epoch": 16.492403932082215, + "grad_norm": 0.00032268656650558114, + "learning_rate": 4.540601029263367e-06, + "loss": 0.0, + "num_input_tokens_seen": 64241744, + "step": 110730 + }, + { + "epoch": 16.493148644623176, + "grad_norm": 1.4748675312148407e-05, + "learning_rate": 4.538733814141729e-06, + "loss": 0.0, + "num_input_tokens_seen": 64244336, + "step": 110735 + }, + { + "epoch": 16.493893357164133, + "grad_norm": 1.7353157090838067e-05, + "learning_rate": 4.536866944692386e-06, + "loss": 0.1067, + "num_input_tokens_seen": 64247408, + "step": 110740 + }, + { + "epoch": 16.494638069705093, + "grad_norm": 532.5748291015625, + "learning_rate": 4.535000420946875e-06, + "loss": 0.04, + "num_input_tokens_seen": 64250448, + "step": 110745 + }, + { + "epoch": 16.495382782246054, + "grad_norm": 0.0004318737192079425, + "learning_rate": 4.533134242936735e-06, + "loss": 0.0, + "num_input_tokens_seen": 64253008, + "step": 110750 + }, + { + "epoch": 16.49612749478701, + "grad_norm": 3.31782530338387e-06, + "learning_rate": 4.531268410693488e-06, + "loss": 0.0, + "num_input_tokens_seen": 64255856, + "step": 110755 + }, + { + "epoch": 16.49687220732797, + "grad_norm": 0.047971051186323166, + "learning_rate": 4.52940292424866e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64258704, + "step": 110760 + }, + { + "epoch": 16.497616919868932, + "grad_norm": 5.869675078429282e-06, + "learning_rate": 4.527537783633764e-06, + "loss": 0.0, + "num_input_tokens_seen": 64262000, + "step": 110765 + }, + { + "epoch": 16.49836163240989, + "grad_norm": 8.188202627934515e-05, + "learning_rate": 4.525672988880308e-06, + "loss": 0.0, + "num_input_tokens_seen": 64264944, + "step": 110770 + }, + { + "epoch": 16.49910634495085, + "grad_norm": 0.00019102674559690058, + "learning_rate": 4.5238085400198e-06, + "loss": 0.0, + "num_input_tokens_seen": 64268080, + "step": 110775 + }, + { + "epoch": 16.499851057491806, + "grad_norm": 3.534670395310968e-05, + "learning_rate": 4.521944437083731e-06, + "loss": 0.0, + "num_input_tokens_seen": 64271216, + "step": 110780 + }, + { + "epoch": 16.500595770032767, + "grad_norm": 7.086600817274302e-06, + "learning_rate": 4.520080680103603e-06, + "loss": 0.0, + "num_input_tokens_seen": 64273904, + "step": 110785 + }, + { + "epoch": 16.501340482573728, + "grad_norm": 2.8259573809918948e-06, + "learning_rate": 4.5182172691108996e-06, + "loss": 0.0645, + "num_input_tokens_seen": 64276880, + "step": 110790 + }, + { + "epoch": 16.502085195114685, + "grad_norm": 8.042426088650245e-06, + "learning_rate": 4.5163542041370965e-06, + "loss": 0.0, + "num_input_tokens_seen": 64279472, + "step": 110795 + }, + { + "epoch": 16.502829907655645, + "grad_norm": 2.8118578484281898e-05, + "learning_rate": 4.514491485213665e-06, + "loss": 0.0, + "num_input_tokens_seen": 64282384, + "step": 110800 + }, + { + "epoch": 16.503574620196606, + "grad_norm": 7.898156582086813e-06, + "learning_rate": 4.512629112372085e-06, + "loss": 0.0, + "num_input_tokens_seen": 64285392, + "step": 110805 + }, + { + "epoch": 16.504319332737563, + "grad_norm": 2.8862883482361212e-05, + "learning_rate": 4.510767085643814e-06, + "loss": 0.0, + "num_input_tokens_seen": 64288368, + "step": 110810 + }, + { + "epoch": 16.505064045278523, + "grad_norm": 0.00029994972283020616, + "learning_rate": 4.508905405060301e-06, + "loss": 0.0, + "num_input_tokens_seen": 64291120, + "step": 110815 + }, + { + "epoch": 16.50580875781948, + "grad_norm": 1.0433779607410543e-05, + "learning_rate": 4.5070440706530135e-06, + "loss": 0.0, + "num_input_tokens_seen": 64293808, + "step": 110820 + }, + { + "epoch": 16.50655347036044, + "grad_norm": 2.8191543606226332e-05, + "learning_rate": 4.505183082453382e-06, + "loss": 0.0, + "num_input_tokens_seen": 64296720, + "step": 110825 + }, + { + "epoch": 16.5072981829014, + "grad_norm": 7.488141272915527e-05, + "learning_rate": 4.503322440492858e-06, + "loss": 0.0, + "num_input_tokens_seen": 64299856, + "step": 110830 + }, + { + "epoch": 16.508042895442358, + "grad_norm": 7.516278856201097e-05, + "learning_rate": 4.501462144802862e-06, + "loss": 0.0, + "num_input_tokens_seen": 64302640, + "step": 110835 + }, + { + "epoch": 16.50878760798332, + "grad_norm": 1.4678729712613858e-05, + "learning_rate": 4.4996021954148375e-06, + "loss": 0.0, + "num_input_tokens_seen": 64305296, + "step": 110840 + }, + { + "epoch": 16.50953232052428, + "grad_norm": 0.3109484612941742, + "learning_rate": 4.497742592360196e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64308368, + "step": 110845 + }, + { + "epoch": 16.510277033065236, + "grad_norm": 0.0001289523352170363, + "learning_rate": 4.495883335670351e-06, + "loss": 0.0, + "num_input_tokens_seen": 64311440, + "step": 110850 + }, + { + "epoch": 16.511021745606197, + "grad_norm": 3.744303103303537e-05, + "learning_rate": 4.494024425376722e-06, + "loss": 0.0, + "num_input_tokens_seen": 64314352, + "step": 110855 + }, + { + "epoch": 16.511766458147154, + "grad_norm": 1.2785124454239849e-05, + "learning_rate": 4.4921658615107106e-06, + "loss": 0.0, + "num_input_tokens_seen": 64317328, + "step": 110860 + }, + { + "epoch": 16.512511170688114, + "grad_norm": 1.2842725482187234e-05, + "learning_rate": 4.490307644103717e-06, + "loss": 0.0, + "num_input_tokens_seen": 64320560, + "step": 110865 + }, + { + "epoch": 16.513255883229075, + "grad_norm": 0.00033232453279197216, + "learning_rate": 4.48844977318712e-06, + "loss": 0.0, + "num_input_tokens_seen": 64323536, + "step": 110870 + }, + { + "epoch": 16.51400059577003, + "grad_norm": 0.00014119833940640092, + "learning_rate": 4.486592248792323e-06, + "loss": 0.0, + "num_input_tokens_seen": 64326192, + "step": 110875 + }, + { + "epoch": 16.514745308310992, + "grad_norm": 1.6764708561822772e-05, + "learning_rate": 4.484735070950696e-06, + "loss": 0.0, + "num_input_tokens_seen": 64329040, + "step": 110880 + }, + { + "epoch": 16.515490020851953, + "grad_norm": 0.0007244004518724978, + "learning_rate": 4.482878239693628e-06, + "loss": 0.0, + "num_input_tokens_seen": 64331984, + "step": 110885 + }, + { + "epoch": 16.51623473339291, + "grad_norm": 3.5917208151659e-05, + "learning_rate": 4.481021755052476e-06, + "loss": 0.0, + "num_input_tokens_seen": 64334960, + "step": 110890 + }, + { + "epoch": 16.51697944593387, + "grad_norm": 0.00044671850628219545, + "learning_rate": 4.479165617058603e-06, + "loss": 0.0, + "num_input_tokens_seen": 64338288, + "step": 110895 + }, + { + "epoch": 16.517724158474827, + "grad_norm": 4.8734409574535675e-06, + "learning_rate": 4.4773098257433754e-06, + "loss": 0.0, + "num_input_tokens_seen": 64341328, + "step": 110900 + }, + { + "epoch": 16.518468871015788, + "grad_norm": 2.5336617909488268e-05, + "learning_rate": 4.4754543811381335e-06, + "loss": 0.0, + "num_input_tokens_seen": 64344080, + "step": 110905 + }, + { + "epoch": 16.51921358355675, + "grad_norm": 3.7438945582835004e-05, + "learning_rate": 4.473599283274235e-06, + "loss": 0.0, + "num_input_tokens_seen": 64346896, + "step": 110910 + }, + { + "epoch": 16.519958296097705, + "grad_norm": 0.0002453472698107362, + "learning_rate": 4.471744532183012e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64349584, + "step": 110915 + }, + { + "epoch": 16.520703008638666, + "grad_norm": 3.4786678497766843e-06, + "learning_rate": 4.469890127895804e-06, + "loss": 0.0, + "num_input_tokens_seen": 64352336, + "step": 110920 + }, + { + "epoch": 16.521447721179626, + "grad_norm": 1.4111775271885563e-05, + "learning_rate": 4.468036070443938e-06, + "loss": 0.0, + "num_input_tokens_seen": 64355408, + "step": 110925 + }, + { + "epoch": 16.522192433720583, + "grad_norm": 7.812674448359758e-05, + "learning_rate": 4.466182359858734e-06, + "loss": 0.0081, + "num_input_tokens_seen": 64358512, + "step": 110930 + }, + { + "epoch": 16.522937146261544, + "grad_norm": 1.5408815670525655e-05, + "learning_rate": 4.4643289961715076e-06, + "loss": 0.0, + "num_input_tokens_seen": 64361648, + "step": 110935 + }, + { + "epoch": 16.5236818588025, + "grad_norm": 7.277752501977375e-06, + "learning_rate": 4.462475979413569e-06, + "loss": 0.0, + "num_input_tokens_seen": 64364464, + "step": 110940 + }, + { + "epoch": 16.52442657134346, + "grad_norm": 6.41965607428574e-06, + "learning_rate": 4.46062330961623e-06, + "loss": 0.0, + "num_input_tokens_seen": 64367024, + "step": 110945 + }, + { + "epoch": 16.525171283884422, + "grad_norm": 7.391790404653875e-06, + "learning_rate": 4.458770986810776e-06, + "loss": 0.0, + "num_input_tokens_seen": 64370096, + "step": 110950 + }, + { + "epoch": 16.52591599642538, + "grad_norm": 0.00019491169950924814, + "learning_rate": 4.456919011028518e-06, + "loss": 0.0, + "num_input_tokens_seen": 64373168, + "step": 110955 + }, + { + "epoch": 16.52666070896634, + "grad_norm": 1.3267569556774106e-05, + "learning_rate": 4.4550673823007284e-06, + "loss": 0.0, + "num_input_tokens_seen": 64375888, + "step": 110960 + }, + { + "epoch": 16.527405421507297, + "grad_norm": 2.7093792596133426e-06, + "learning_rate": 4.4532161006587e-06, + "loss": 0.0, + "num_input_tokens_seen": 64378896, + "step": 110965 + }, + { + "epoch": 16.528150134048257, + "grad_norm": 5.665786829922581e-06, + "learning_rate": 4.4513651661337e-06, + "loss": 0.0, + "num_input_tokens_seen": 64381520, + "step": 110970 + }, + { + "epoch": 16.528894846589218, + "grad_norm": 2.6361985874245875e-05, + "learning_rate": 4.449514578757e-06, + "loss": 0.0, + "num_input_tokens_seen": 64384240, + "step": 110975 + }, + { + "epoch": 16.529639559130175, + "grad_norm": 4.5033457354293205e-06, + "learning_rate": 4.447664338559867e-06, + "loss": 0.0, + "num_input_tokens_seen": 64387248, + "step": 110980 + }, + { + "epoch": 16.530384271671135, + "grad_norm": 0.0008752990397624671, + "learning_rate": 4.445814445573551e-06, + "loss": 0.0, + "num_input_tokens_seen": 64390000, + "step": 110985 + }, + { + "epoch": 16.531128984212096, + "grad_norm": 5.328966653905809e-05, + "learning_rate": 4.443964899829317e-06, + "loss": 0.0, + "num_input_tokens_seen": 64392848, + "step": 110990 + }, + { + "epoch": 16.531873696753053, + "grad_norm": 6.083751213736832e-05, + "learning_rate": 4.442115701358401e-06, + "loss": 0.0, + "num_input_tokens_seen": 64395696, + "step": 110995 + }, + { + "epoch": 16.532618409294013, + "grad_norm": 5.084177701064618e-06, + "learning_rate": 4.440266850192049e-06, + "loss": 0.0, + "num_input_tokens_seen": 64398608, + "step": 111000 + }, + { + "epoch": 16.53336312183497, + "grad_norm": 6.457891140598804e-05, + "learning_rate": 4.4384183463614865e-06, + "loss": 0.0451, + "num_input_tokens_seen": 64401520, + "step": 111005 + }, + { + "epoch": 16.53410783437593, + "grad_norm": 3.354449745529564e-06, + "learning_rate": 4.436570189897951e-06, + "loss": 0.0, + "num_input_tokens_seen": 64404528, + "step": 111010 + }, + { + "epoch": 16.53485254691689, + "grad_norm": 3.7189875001786277e-06, + "learning_rate": 4.434722380832665e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64407536, + "step": 111015 + }, + { + "epoch": 16.535597259457848, + "grad_norm": 1.2784465070581064e-05, + "learning_rate": 4.432874919196836e-06, + "loss": 0.0, + "num_input_tokens_seen": 64410512, + "step": 111020 + }, + { + "epoch": 16.53634197199881, + "grad_norm": 6.006508556311019e-06, + "learning_rate": 4.4310278050216895e-06, + "loss": 0.0, + "num_input_tokens_seen": 64413488, + "step": 111025 + }, + { + "epoch": 16.53708668453977, + "grad_norm": 1.0857350389414933e-05, + "learning_rate": 4.429181038338415e-06, + "loss": 0.0, + "num_input_tokens_seen": 64416272, + "step": 111030 + }, + { + "epoch": 16.537831397080726, + "grad_norm": 1.8957218344439752e-05, + "learning_rate": 4.427334619178225e-06, + "loss": 0.0, + "num_input_tokens_seen": 64418736, + "step": 111035 + }, + { + "epoch": 16.538576109621687, + "grad_norm": 4.642231942852959e-06, + "learning_rate": 4.425488547572304e-06, + "loss": 0.0, + "num_input_tokens_seen": 64421616, + "step": 111040 + }, + { + "epoch": 16.539320822162644, + "grad_norm": 6.765198122593574e-06, + "learning_rate": 4.4236428235518465e-06, + "loss": 0.0, + "num_input_tokens_seen": 64424528, + "step": 111045 + }, + { + "epoch": 16.540065534703604, + "grad_norm": 0.0006591276614926755, + "learning_rate": 4.421797447148032e-06, + "loss": 0.0, + "num_input_tokens_seen": 64427440, + "step": 111050 + }, + { + "epoch": 16.540810247244565, + "grad_norm": 1.9984678147011437e-05, + "learning_rate": 4.419952418392029e-06, + "loss": 0.0, + "num_input_tokens_seen": 64430480, + "step": 111055 + }, + { + "epoch": 16.541554959785522, + "grad_norm": 0.001343362731859088, + "learning_rate": 4.418107737315019e-06, + "loss": 0.0, + "num_input_tokens_seen": 64433296, + "step": 111060 + }, + { + "epoch": 16.542299672326482, + "grad_norm": 5.618018803943414e-06, + "learning_rate": 4.416263403948159e-06, + "loss": 0.0005, + "num_input_tokens_seen": 64436176, + "step": 111065 + }, + { + "epoch": 16.543044384867443, + "grad_norm": 6.463631052611163e-06, + "learning_rate": 4.41441941832261e-06, + "loss": 0.0, + "num_input_tokens_seen": 64439440, + "step": 111070 + }, + { + "epoch": 16.5437890974084, + "grad_norm": 2.2625355995842256e-05, + "learning_rate": 4.412575780469516e-06, + "loss": 0.0, + "num_input_tokens_seen": 64442320, + "step": 111075 + }, + { + "epoch": 16.54453380994936, + "grad_norm": 8.219029041356407e-06, + "learning_rate": 4.410732490420036e-06, + "loss": 0.0, + "num_input_tokens_seen": 64445456, + "step": 111080 + }, + { + "epoch": 16.545278522490317, + "grad_norm": 1.2738072655338328e-05, + "learning_rate": 4.4088895482053e-06, + "loss": 0.0, + "num_input_tokens_seen": 64448432, + "step": 111085 + }, + { + "epoch": 16.546023235031278, + "grad_norm": 4.8861102186492644e-06, + "learning_rate": 4.40704695385645e-06, + "loss": 0.0, + "num_input_tokens_seen": 64451024, + "step": 111090 + }, + { + "epoch": 16.54676794757224, + "grad_norm": 2.093735383823514e-05, + "learning_rate": 4.405204707404614e-06, + "loss": 0.0, + "num_input_tokens_seen": 64453936, + "step": 111095 + }, + { + "epoch": 16.547512660113195, + "grad_norm": 0.0001255395618500188, + "learning_rate": 4.403362808880909e-06, + "loss": 0.0, + "num_input_tokens_seen": 64456720, + "step": 111100 + }, + { + "epoch": 16.548257372654156, + "grad_norm": 9.675895125837997e-05, + "learning_rate": 4.40152125831646e-06, + "loss": 0.0, + "num_input_tokens_seen": 64459728, + "step": 111105 + }, + { + "epoch": 16.549002085195113, + "grad_norm": 6.4001367718447e-05, + "learning_rate": 4.3996800557423665e-06, + "loss": 0.0, + "num_input_tokens_seen": 64462480, + "step": 111110 + }, + { + "epoch": 16.549746797736073, + "grad_norm": 1.8948643628391437e-05, + "learning_rate": 4.397839201189749e-06, + "loss": 0.0, + "num_input_tokens_seen": 64465488, + "step": 111115 + }, + { + "epoch": 16.550491510277034, + "grad_norm": 0.0009526044595986605, + "learning_rate": 4.395998694689699e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64468464, + "step": 111120 + }, + { + "epoch": 16.55123622281799, + "grad_norm": 8.437295036856085e-06, + "learning_rate": 4.39415853627331e-06, + "loss": 0.0, + "num_input_tokens_seen": 64471088, + "step": 111125 + }, + { + "epoch": 16.55198093535895, + "grad_norm": 3.3580963645363227e-06, + "learning_rate": 4.3923187259716615e-06, + "loss": 0.0, + "num_input_tokens_seen": 64473808, + "step": 111130 + }, + { + "epoch": 16.552725647899912, + "grad_norm": 0.023366328328847885, + "learning_rate": 4.390479263815852e-06, + "loss": 0.0, + "num_input_tokens_seen": 64476560, + "step": 111135 + }, + { + "epoch": 16.55347036044087, + "grad_norm": 0.00027745222905650735, + "learning_rate": 4.388640149836948e-06, + "loss": 0.0, + "num_input_tokens_seen": 64479376, + "step": 111140 + }, + { + "epoch": 16.55421507298183, + "grad_norm": 4.1500945371808484e-05, + "learning_rate": 4.3868013840660135e-06, + "loss": 0.0, + "num_input_tokens_seen": 64482032, + "step": 111145 + }, + { + "epoch": 16.554959785522787, + "grad_norm": 6.606383522012038e-06, + "learning_rate": 4.3849629665341255e-06, + "loss": 0.0003, + "num_input_tokens_seen": 64484752, + "step": 111150 + }, + { + "epoch": 16.555704498063747, + "grad_norm": 1.2311693353694864e-05, + "learning_rate": 4.383124897272331e-06, + "loss": 0.0, + "num_input_tokens_seen": 64487664, + "step": 111155 + }, + { + "epoch": 16.556449210604708, + "grad_norm": 5.405125193647109e-06, + "learning_rate": 4.381287176311694e-06, + "loss": 0.0008, + "num_input_tokens_seen": 64490448, + "step": 111160 + }, + { + "epoch": 16.557193923145665, + "grad_norm": 2.749596933426801e-05, + "learning_rate": 4.379449803683247e-06, + "loss": 0.0, + "num_input_tokens_seen": 64493168, + "step": 111165 + }, + { + "epoch": 16.557938635686625, + "grad_norm": 0.00010274966189172119, + "learning_rate": 4.377612779418041e-06, + "loss": 0.0, + "num_input_tokens_seen": 64495856, + "step": 111170 + }, + { + "epoch": 16.558683348227586, + "grad_norm": 7.23757784726331e-06, + "learning_rate": 4.375776103547114e-06, + "loss": 0.4219, + "num_input_tokens_seen": 64498672, + "step": 111175 + }, + { + "epoch": 16.559428060768543, + "grad_norm": 4.227547833579592e-06, + "learning_rate": 4.373939776101476e-06, + "loss": 0.0, + "num_input_tokens_seen": 64501552, + "step": 111180 + }, + { + "epoch": 16.560172773309503, + "grad_norm": 1.4639182154496666e-05, + "learning_rate": 4.37210379711217e-06, + "loss": 0.0007, + "num_input_tokens_seen": 64505040, + "step": 111185 + }, + { + "epoch": 16.56091748585046, + "grad_norm": 4.182419161224971e-06, + "learning_rate": 4.370268166610206e-06, + "loss": 0.0, + "num_input_tokens_seen": 64507856, + "step": 111190 + }, + { + "epoch": 16.56166219839142, + "grad_norm": 0.00019026591326110065, + "learning_rate": 4.368432884626594e-06, + "loss": 0.3188, + "num_input_tokens_seen": 64510928, + "step": 111195 + }, + { + "epoch": 16.56240691093238, + "grad_norm": 3.466035195742734e-05, + "learning_rate": 4.366597951192333e-06, + "loss": 0.0, + "num_input_tokens_seen": 64513680, + "step": 111200 + }, + { + "epoch": 16.56315162347334, + "grad_norm": 0.00021036715770605952, + "learning_rate": 4.364763366338437e-06, + "loss": 0.0, + "num_input_tokens_seen": 64516528, + "step": 111205 + }, + { + "epoch": 16.5638963360143, + "grad_norm": 2.4962631869129837e-05, + "learning_rate": 4.362929130095888e-06, + "loss": 0.0, + "num_input_tokens_seen": 64519344, + "step": 111210 + }, + { + "epoch": 16.56464104855526, + "grad_norm": 4.52713265985949e-06, + "learning_rate": 4.361095242495672e-06, + "loss": 0.0, + "num_input_tokens_seen": 64522000, + "step": 111215 + }, + { + "epoch": 16.565385761096216, + "grad_norm": 5.869281085324474e-05, + "learning_rate": 4.359261703568781e-06, + "loss": 0.0, + "num_input_tokens_seen": 64524400, + "step": 111220 + }, + { + "epoch": 16.566130473637177, + "grad_norm": 0.00010713673691498116, + "learning_rate": 4.357428513346179e-06, + "loss": 0.0, + "num_input_tokens_seen": 64527184, + "step": 111225 + }, + { + "epoch": 16.566875186178134, + "grad_norm": 0.00012808042811229825, + "learning_rate": 4.35559567185885e-06, + "loss": 0.0, + "num_input_tokens_seen": 64529968, + "step": 111230 + }, + { + "epoch": 16.567619898719094, + "grad_norm": 2.675632458704058e-06, + "learning_rate": 4.353763179137743e-06, + "loss": 0.0, + "num_input_tokens_seen": 64532976, + "step": 111235 + }, + { + "epoch": 16.568364611260055, + "grad_norm": 2.456321999488864e-05, + "learning_rate": 4.351931035213827e-06, + "loss": 0.0, + "num_input_tokens_seen": 64535856, + "step": 111240 + }, + { + "epoch": 16.569109323801012, + "grad_norm": 1.033509943226818e-05, + "learning_rate": 4.350099240118047e-06, + "loss": 0.0, + "num_input_tokens_seen": 64538768, + "step": 111245 + }, + { + "epoch": 16.569854036341972, + "grad_norm": 0.00026345878723077476, + "learning_rate": 4.348267793881358e-06, + "loss": 0.0, + "num_input_tokens_seen": 64541616, + "step": 111250 + }, + { + "epoch": 16.57059874888293, + "grad_norm": 2.876838880183641e-05, + "learning_rate": 4.346436696534698e-06, + "loss": 0.0, + "num_input_tokens_seen": 64544496, + "step": 111255 + }, + { + "epoch": 16.57134346142389, + "grad_norm": 1.0712463335948996e-05, + "learning_rate": 4.344605948108993e-06, + "loss": 0.0, + "num_input_tokens_seen": 64547408, + "step": 111260 + }, + { + "epoch": 16.57208817396485, + "grad_norm": 6.045268673915416e-05, + "learning_rate": 4.342775548635181e-06, + "loss": 0.0, + "num_input_tokens_seen": 64550448, + "step": 111265 + }, + { + "epoch": 16.572832886505807, + "grad_norm": 7.85884476499632e-05, + "learning_rate": 4.340945498144175e-06, + "loss": 0.0, + "num_input_tokens_seen": 64553552, + "step": 111270 + }, + { + "epoch": 16.573577599046768, + "grad_norm": 0.00019135612819809467, + "learning_rate": 4.3391157966669036e-06, + "loss": 0.0, + "num_input_tokens_seen": 64556336, + "step": 111275 + }, + { + "epoch": 16.57432231158773, + "grad_norm": 8.992327821033541e-06, + "learning_rate": 4.337286444234265e-06, + "loss": 0.0, + "num_input_tokens_seen": 64559408, + "step": 111280 + }, + { + "epoch": 16.575067024128685, + "grad_norm": 0.00022821903985459358, + "learning_rate": 4.335457440877177e-06, + "loss": 0.0, + "num_input_tokens_seen": 64562096, + "step": 111285 + }, + { + "epoch": 16.575811736669646, + "grad_norm": 5.461706314235926e-05, + "learning_rate": 4.333628786626534e-06, + "loss": 0.0, + "num_input_tokens_seen": 64565264, + "step": 111290 + }, + { + "epoch": 16.576556449210603, + "grad_norm": 0.0001233089278684929, + "learning_rate": 4.331800481513223e-06, + "loss": 0.0, + "num_input_tokens_seen": 64568144, + "step": 111295 + }, + { + "epoch": 16.577301161751564, + "grad_norm": 0.00012517962022684515, + "learning_rate": 4.329972525568141e-06, + "loss": 0.0, + "num_input_tokens_seen": 64570672, + "step": 111300 + }, + { + "epoch": 16.578045874292524, + "grad_norm": 3.9460778680222575e-06, + "learning_rate": 4.3281449188221605e-06, + "loss": 0.0, + "num_input_tokens_seen": 64573232, + "step": 111305 + }, + { + "epoch": 16.57879058683348, + "grad_norm": 5.340303421020508, + "learning_rate": 4.326317661306168e-06, + "loss": 0.0221, + "num_input_tokens_seen": 64575984, + "step": 111310 + }, + { + "epoch": 16.57953529937444, + "grad_norm": 0.00015916280972305685, + "learning_rate": 4.324490753051019e-06, + "loss": 0.0, + "num_input_tokens_seen": 64578832, + "step": 111315 + }, + { + "epoch": 16.580280011915402, + "grad_norm": 3.188144546584226e-05, + "learning_rate": 4.322664194087591e-06, + "loss": 0.0, + "num_input_tokens_seen": 64581712, + "step": 111320 + }, + { + "epoch": 16.58102472445636, + "grad_norm": 0.00021811660553794354, + "learning_rate": 4.320837984446738e-06, + "loss": 0.0476, + "num_input_tokens_seen": 64584528, + "step": 111325 + }, + { + "epoch": 16.58176943699732, + "grad_norm": 7.4266836236347444e-06, + "learning_rate": 4.319012124159308e-06, + "loss": 0.0108, + "num_input_tokens_seen": 64587376, + "step": 111330 + }, + { + "epoch": 16.582514149538277, + "grad_norm": 0.00034673124901019037, + "learning_rate": 4.317186613256149e-06, + "loss": 0.0, + "num_input_tokens_seen": 64590160, + "step": 111335 + }, + { + "epoch": 16.583258862079237, + "grad_norm": 2.122195110132452e-05, + "learning_rate": 4.3153614517680965e-06, + "loss": 0.0, + "num_input_tokens_seen": 64593072, + "step": 111340 + }, + { + "epoch": 16.584003574620198, + "grad_norm": 0.0011893732007592916, + "learning_rate": 4.313536639725996e-06, + "loss": 0.0, + "num_input_tokens_seen": 64596016, + "step": 111345 + }, + { + "epoch": 16.584748287161155, + "grad_norm": 1.1131975952594075e-05, + "learning_rate": 4.311712177160662e-06, + "loss": 0.0, + "num_input_tokens_seen": 64598832, + "step": 111350 + }, + { + "epoch": 16.585492999702115, + "grad_norm": 7.51438710722141e-06, + "learning_rate": 4.30988806410293e-06, + "loss": 0.0, + "num_input_tokens_seen": 64601872, + "step": 111355 + }, + { + "epoch": 16.586237712243076, + "grad_norm": 5.230381702858722e-06, + "learning_rate": 4.308064300583603e-06, + "loss": 0.0, + "num_input_tokens_seen": 64604848, + "step": 111360 + }, + { + "epoch": 16.586982424784033, + "grad_norm": 0.00012999889440834522, + "learning_rate": 4.3062408866335085e-06, + "loss": 0.0007, + "num_input_tokens_seen": 64607792, + "step": 111365 + }, + { + "epoch": 16.587727137324993, + "grad_norm": 0.0002124673774233088, + "learning_rate": 4.30441782228344e-06, + "loss": 0.0, + "num_input_tokens_seen": 64610576, + "step": 111370 + }, + { + "epoch": 16.58847184986595, + "grad_norm": 4.265721145202406e-06, + "learning_rate": 4.302595107564192e-06, + "loss": 0.0, + "num_input_tokens_seen": 64613584, + "step": 111375 + }, + { + "epoch": 16.58921656240691, + "grad_norm": 2.890095674956683e-05, + "learning_rate": 4.300772742506571e-06, + "loss": 0.0, + "num_input_tokens_seen": 64616624, + "step": 111380 + }, + { + "epoch": 16.58996127494787, + "grad_norm": 8.718178833078127e-06, + "learning_rate": 4.2989507271413515e-06, + "loss": 0.0, + "num_input_tokens_seen": 64619760, + "step": 111385 + }, + { + "epoch": 16.59070598748883, + "grad_norm": 2.016025064222049e-05, + "learning_rate": 4.297129061499324e-06, + "loss": 0.0, + "num_input_tokens_seen": 64622896, + "step": 111390 + }, + { + "epoch": 16.59145070002979, + "grad_norm": 0.05401238426566124, + "learning_rate": 4.29530774561126e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64625616, + "step": 111395 + }, + { + "epoch": 16.59219541257075, + "grad_norm": 8.137004442687612e-06, + "learning_rate": 4.29348677950793e-06, + "loss": 0.0, + "num_input_tokens_seen": 64628528, + "step": 111400 + }, + { + "epoch": 16.592940125111706, + "grad_norm": 2.791060251183808e-05, + "learning_rate": 4.291666163220087e-06, + "loss": 0.0, + "num_input_tokens_seen": 64631376, + "step": 111405 + }, + { + "epoch": 16.593684837652667, + "grad_norm": 3.7656249332940206e-05, + "learning_rate": 4.289845896778505e-06, + "loss": 0.0, + "num_input_tokens_seen": 64633968, + "step": 111410 + }, + { + "epoch": 16.594429550193624, + "grad_norm": 3.7771809729747474e-05, + "learning_rate": 4.2880259802139276e-06, + "loss": 0.0, + "num_input_tokens_seen": 64636784, + "step": 111415 + }, + { + "epoch": 16.595174262734584, + "grad_norm": 3.8856933315400966e-06, + "learning_rate": 4.286206413557092e-06, + "loss": 0.0, + "num_input_tokens_seen": 64639760, + "step": 111420 + }, + { + "epoch": 16.595918975275545, + "grad_norm": 2.5246847599191824e-06, + "learning_rate": 4.284387196838755e-06, + "loss": 0.0, + "num_input_tokens_seen": 64642480, + "step": 111425 + }, + { + "epoch": 16.596663687816502, + "grad_norm": 6.59577053738758e-05, + "learning_rate": 4.282568330089637e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64645360, + "step": 111430 + }, + { + "epoch": 16.597408400357462, + "grad_norm": 6.635909812757745e-05, + "learning_rate": 4.280749813340473e-06, + "loss": 0.0, + "num_input_tokens_seen": 64647952, + "step": 111435 + }, + { + "epoch": 16.598153112898423, + "grad_norm": 4.81071365356911e-05, + "learning_rate": 4.278931646621981e-06, + "loss": 0.0, + "num_input_tokens_seen": 64651152, + "step": 111440 + }, + { + "epoch": 16.59889782543938, + "grad_norm": 3.371116690686904e-06, + "learning_rate": 4.2771138299648825e-06, + "loss": 0.0, + "num_input_tokens_seen": 64654224, + "step": 111445 + }, + { + "epoch": 16.59964253798034, + "grad_norm": 0.0002107552718371153, + "learning_rate": 4.275296363399883e-06, + "loss": 0.0, + "num_input_tokens_seen": 64657232, + "step": 111450 + }, + { + "epoch": 16.600387250521297, + "grad_norm": 1.732502096274402e-05, + "learning_rate": 4.27347924695769e-06, + "loss": 0.0, + "num_input_tokens_seen": 64660208, + "step": 111455 + }, + { + "epoch": 16.601131963062258, + "grad_norm": 0.00035637771361507475, + "learning_rate": 4.271662480668995e-06, + "loss": 0.0, + "num_input_tokens_seen": 64663312, + "step": 111460 + }, + { + "epoch": 16.60187667560322, + "grad_norm": 2.8988640679017408e-06, + "learning_rate": 4.269846064564498e-06, + "loss": 0.0, + "num_input_tokens_seen": 64665808, + "step": 111465 + }, + { + "epoch": 16.602621388144176, + "grad_norm": 2.5865048883133568e-05, + "learning_rate": 4.268029998674883e-06, + "loss": 0.0, + "num_input_tokens_seen": 64668752, + "step": 111470 + }, + { + "epoch": 16.603366100685136, + "grad_norm": 3.8847745599923655e-05, + "learning_rate": 4.266214283030825e-06, + "loss": 0.0, + "num_input_tokens_seen": 64671952, + "step": 111475 + }, + { + "epoch": 16.604110813226093, + "grad_norm": 2.551845318521373e-05, + "learning_rate": 4.2643989176630095e-06, + "loss": 0.0, + "num_input_tokens_seen": 64674832, + "step": 111480 + }, + { + "epoch": 16.604855525767054, + "grad_norm": 9.38711964408867e-05, + "learning_rate": 4.262583902602094e-06, + "loss": 0.0, + "num_input_tokens_seen": 64677616, + "step": 111485 + }, + { + "epoch": 16.605600238308014, + "grad_norm": 3.377209941390902e-05, + "learning_rate": 4.2607692378787535e-06, + "loss": 0.0, + "num_input_tokens_seen": 64680784, + "step": 111490 + }, + { + "epoch": 16.60634495084897, + "grad_norm": 0.0008080463157966733, + "learning_rate": 4.258954923523636e-06, + "loss": 0.0, + "num_input_tokens_seen": 64683888, + "step": 111495 + }, + { + "epoch": 16.60708966338993, + "grad_norm": 7.82179176894715e-06, + "learning_rate": 4.25714095956739e-06, + "loss": 0.0, + "num_input_tokens_seen": 64686704, + "step": 111500 + }, + { + "epoch": 16.607834375930892, + "grad_norm": 1.5944506230880506e-05, + "learning_rate": 4.255327346040672e-06, + "loss": 0.0, + "num_input_tokens_seen": 64690000, + "step": 111505 + }, + { + "epoch": 16.60857908847185, + "grad_norm": 4.009476924693445e-06, + "learning_rate": 4.253514082974108e-06, + "loss": 0.0, + "num_input_tokens_seen": 64692880, + "step": 111510 + }, + { + "epoch": 16.60932380101281, + "grad_norm": 2.3075775970937684e-05, + "learning_rate": 4.251701170398342e-06, + "loss": 0.0, + "num_input_tokens_seen": 64695856, + "step": 111515 + }, + { + "epoch": 16.610068513553767, + "grad_norm": 0.0028434796258807182, + "learning_rate": 4.2498886083439995e-06, + "loss": 0.0, + "num_input_tokens_seen": 64698736, + "step": 111520 + }, + { + "epoch": 16.610813226094727, + "grad_norm": 1.406038791174069e-05, + "learning_rate": 4.2480763968416996e-06, + "loss": 0.0, + "num_input_tokens_seen": 64701392, + "step": 111525 + }, + { + "epoch": 16.611557938635688, + "grad_norm": 0.8591538071632385, + "learning_rate": 4.246264535922051e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64704560, + "step": 111530 + }, + { + "epoch": 16.612302651176645, + "grad_norm": 5.660881470248569e-06, + "learning_rate": 4.244453025615674e-06, + "loss": 0.0, + "num_input_tokens_seen": 64707792, + "step": 111535 + }, + { + "epoch": 16.613047363717605, + "grad_norm": 0.00017627672059461474, + "learning_rate": 4.242641865953173e-06, + "loss": 0.0, + "num_input_tokens_seen": 64710544, + "step": 111540 + }, + { + "epoch": 16.613792076258566, + "grad_norm": 6.480058800661936e-05, + "learning_rate": 4.240831056965131e-06, + "loss": 0.0, + "num_input_tokens_seen": 64713136, + "step": 111545 + }, + { + "epoch": 16.614536788799523, + "grad_norm": 0.06895441561937332, + "learning_rate": 4.239020598682155e-06, + "loss": 0.0, + "num_input_tokens_seen": 64715792, + "step": 111550 + }, + { + "epoch": 16.615281501340483, + "grad_norm": 2.506154942238936e-06, + "learning_rate": 4.237210491134821e-06, + "loss": 0.0, + "num_input_tokens_seen": 64718672, + "step": 111555 + }, + { + "epoch": 16.61602621388144, + "grad_norm": 0.0011976680252701044, + "learning_rate": 4.23540073435372e-06, + "loss": 0.0, + "num_input_tokens_seen": 64721360, + "step": 111560 + }, + { + "epoch": 16.6167709264224, + "grad_norm": 1.732243072183337e-05, + "learning_rate": 4.2335913283694126e-06, + "loss": 0.0006, + "num_input_tokens_seen": 64724208, + "step": 111565 + }, + { + "epoch": 16.61751563896336, + "grad_norm": 1.605042052688077e-05, + "learning_rate": 4.231782273212481e-06, + "loss": 0.0, + "num_input_tokens_seen": 64727376, + "step": 111570 + }, + { + "epoch": 16.61826035150432, + "grad_norm": 1.0970044058922213e-05, + "learning_rate": 4.2299735689134784e-06, + "loss": 0.0, + "num_input_tokens_seen": 64730032, + "step": 111575 + }, + { + "epoch": 16.61900506404528, + "grad_norm": 0.0005049816099926829, + "learning_rate": 4.228165215502958e-06, + "loss": 0.0, + "num_input_tokens_seen": 64733040, + "step": 111580 + }, + { + "epoch": 16.61974977658624, + "grad_norm": 8.00296948000323e-06, + "learning_rate": 4.226357213011478e-06, + "loss": 0.0, + "num_input_tokens_seen": 64736240, + "step": 111585 + }, + { + "epoch": 16.620494489127196, + "grad_norm": 5.845354826305993e-05, + "learning_rate": 4.224549561469582e-06, + "loss": 0.0, + "num_input_tokens_seen": 64739216, + "step": 111590 + }, + { + "epoch": 16.621239201668157, + "grad_norm": 0.0001497511111665517, + "learning_rate": 4.222742260907806e-06, + "loss": 0.0, + "num_input_tokens_seen": 64742032, + "step": 111595 + }, + { + "epoch": 16.621983914209114, + "grad_norm": 4.33010882261442e-06, + "learning_rate": 4.220935311356675e-06, + "loss": 0.0, + "num_input_tokens_seen": 64745136, + "step": 111600 + }, + { + "epoch": 16.622728626750074, + "grad_norm": 0.004096046090126038, + "learning_rate": 4.219128712846729e-06, + "loss": 0.0, + "num_input_tokens_seen": 64748112, + "step": 111605 + }, + { + "epoch": 16.623473339291035, + "grad_norm": 1.615447581571061e-05, + "learning_rate": 4.217322465408477e-06, + "loss": 0.0452, + "num_input_tokens_seen": 64751216, + "step": 111610 + }, + { + "epoch": 16.624218051831992, + "grad_norm": 1.253045593330171e-05, + "learning_rate": 4.2155165690724476e-06, + "loss": 0.0, + "num_input_tokens_seen": 64754480, + "step": 111615 + }, + { + "epoch": 16.624962764372953, + "grad_norm": 2.2692027414450422e-05, + "learning_rate": 4.213711023869138e-06, + "loss": 0.0, + "num_input_tokens_seen": 64757264, + "step": 111620 + }, + { + "epoch": 16.62570747691391, + "grad_norm": 9.321442485088482e-05, + "learning_rate": 4.211905829829049e-06, + "loss": 0.0, + "num_input_tokens_seen": 64760336, + "step": 111625 + }, + { + "epoch": 16.62645218945487, + "grad_norm": 5.691310434485786e-06, + "learning_rate": 4.21010098698269e-06, + "loss": 0.0, + "num_input_tokens_seen": 64763024, + "step": 111630 + }, + { + "epoch": 16.62719690199583, + "grad_norm": 0.000243769318331033, + "learning_rate": 4.208296495360539e-06, + "loss": 0.0, + "num_input_tokens_seen": 64765744, + "step": 111635 + }, + { + "epoch": 16.627941614536788, + "grad_norm": 2.0527004380710423e-05, + "learning_rate": 4.206492354993094e-06, + "loss": 0.0, + "num_input_tokens_seen": 64768656, + "step": 111640 + }, + { + "epoch": 16.628686327077748, + "grad_norm": 0.00023261664318852127, + "learning_rate": 4.204688565910819e-06, + "loss": 0.0, + "num_input_tokens_seen": 64771600, + "step": 111645 + }, + { + "epoch": 16.62943103961871, + "grad_norm": 7.325291335291695e-06, + "learning_rate": 4.202885128144202e-06, + "loss": 0.0, + "num_input_tokens_seen": 64774416, + "step": 111650 + }, + { + "epoch": 16.630175752159666, + "grad_norm": 0.0006097371806390584, + "learning_rate": 4.201082041723703e-06, + "loss": 0.0, + "num_input_tokens_seen": 64777264, + "step": 111655 + }, + { + "epoch": 16.630920464700626, + "grad_norm": 5.66797680221498e-05, + "learning_rate": 4.1992793066797845e-06, + "loss": 0.0, + "num_input_tokens_seen": 64779952, + "step": 111660 + }, + { + "epoch": 16.631665177241583, + "grad_norm": 2.9715649816353107e-06, + "learning_rate": 4.197476923042901e-06, + "loss": 0.0, + "num_input_tokens_seen": 64782704, + "step": 111665 + }, + { + "epoch": 16.632409889782544, + "grad_norm": 7.533380539825885e-06, + "learning_rate": 4.195674890843495e-06, + "loss": 0.0, + "num_input_tokens_seen": 64786256, + "step": 111670 + }, + { + "epoch": 16.633154602323504, + "grad_norm": 5.6741241678537335e-06, + "learning_rate": 4.193873210112026e-06, + "loss": 0.0, + "num_input_tokens_seen": 64789424, + "step": 111675 + }, + { + "epoch": 16.63389931486446, + "grad_norm": 4.898632596450625e-06, + "learning_rate": 4.192071880878914e-06, + "loss": 0.0, + "num_input_tokens_seen": 64792208, + "step": 111680 + }, + { + "epoch": 16.63464402740542, + "grad_norm": 3.377038592589088e-05, + "learning_rate": 4.1902709031746094e-06, + "loss": 0.0, + "num_input_tokens_seen": 64794992, + "step": 111685 + }, + { + "epoch": 16.635388739946382, + "grad_norm": 6.923323631286621, + "learning_rate": 4.188470277029516e-06, + "loss": 0.0083, + "num_input_tokens_seen": 64797872, + "step": 111690 + }, + { + "epoch": 16.63613345248734, + "grad_norm": 0.00016337775741703808, + "learning_rate": 4.1866700024740745e-06, + "loss": 0.0003, + "num_input_tokens_seen": 64800624, + "step": 111695 + }, + { + "epoch": 16.6368781650283, + "grad_norm": 0.00027053712983615696, + "learning_rate": 4.184870079538692e-06, + "loss": 0.0431, + "num_input_tokens_seen": 64803472, + "step": 111700 + }, + { + "epoch": 16.637622877569257, + "grad_norm": 5.250136837275932e-06, + "learning_rate": 4.183070508253764e-06, + "loss": 0.0097, + "num_input_tokens_seen": 64806704, + "step": 111705 + }, + { + "epoch": 16.638367590110217, + "grad_norm": 6.241683149710298e-05, + "learning_rate": 4.18127128864971e-06, + "loss": 0.0, + "num_input_tokens_seen": 64809552, + "step": 111710 + }, + { + "epoch": 16.639112302651178, + "grad_norm": 0.0017032293835654855, + "learning_rate": 4.179472420756911e-06, + "loss": 0.0, + "num_input_tokens_seen": 64812304, + "step": 111715 + }, + { + "epoch": 16.639857015192135, + "grad_norm": 8.128773515636567e-06, + "learning_rate": 4.177673904605773e-06, + "loss": 0.0, + "num_input_tokens_seen": 64815472, + "step": 111720 + }, + { + "epoch": 16.640601727733095, + "grad_norm": 1.241711652255617e-05, + "learning_rate": 4.1758757402266675e-06, + "loss": 0.0, + "num_input_tokens_seen": 64818032, + "step": 111725 + }, + { + "epoch": 16.641346440274056, + "grad_norm": 0.00039680689224042, + "learning_rate": 4.1740779276499805e-06, + "loss": 0.0, + "num_input_tokens_seen": 64820816, + "step": 111730 + }, + { + "epoch": 16.642091152815013, + "grad_norm": 4.897638518741587e-06, + "learning_rate": 4.172280466906079e-06, + "loss": 0.0, + "num_input_tokens_seen": 64823664, + "step": 111735 + }, + { + "epoch": 16.642835865355973, + "grad_norm": 0.0001297314156545326, + "learning_rate": 4.170483358025323e-06, + "loss": 0.0, + "num_input_tokens_seen": 64826608, + "step": 111740 + }, + { + "epoch": 16.64358057789693, + "grad_norm": 2.6564672225504182e-05, + "learning_rate": 4.168686601038091e-06, + "loss": 0.0, + "num_input_tokens_seen": 64829456, + "step": 111745 + }, + { + "epoch": 16.64432529043789, + "grad_norm": 2.987924381159246e-05, + "learning_rate": 4.1668901959747155e-06, + "loss": 0.0, + "num_input_tokens_seen": 64832432, + "step": 111750 + }, + { + "epoch": 16.64507000297885, + "grad_norm": 1.5137281479837839e-05, + "learning_rate": 4.165094142865566e-06, + "loss": 0.0, + "num_input_tokens_seen": 64835024, + "step": 111755 + }, + { + "epoch": 16.64581471551981, + "grad_norm": 6.520347142213723e-06, + "learning_rate": 4.163298441740968e-06, + "loss": 0.0, + "num_input_tokens_seen": 64837968, + "step": 111760 + }, + { + "epoch": 16.64655942806077, + "grad_norm": 3.967831617046613e-06, + "learning_rate": 4.161503092631272e-06, + "loss": 0.0, + "num_input_tokens_seen": 64840720, + "step": 111765 + }, + { + "epoch": 16.647304140601726, + "grad_norm": 9.166263043880463e-05, + "learning_rate": 4.159708095566794e-06, + "loss": 0.0539, + "num_input_tokens_seen": 64843568, + "step": 111770 + }, + { + "epoch": 16.648048853142686, + "grad_norm": 0.0003288076550234109, + "learning_rate": 4.157913450577875e-06, + "loss": 0.0, + "num_input_tokens_seen": 64847024, + "step": 111775 + }, + { + "epoch": 16.648793565683647, + "grad_norm": 0.0001001561977318488, + "learning_rate": 4.1561191576948235e-06, + "loss": 0.0, + "num_input_tokens_seen": 64849936, + "step": 111780 + }, + { + "epoch": 16.649538278224604, + "grad_norm": 1.3429002137854695e-05, + "learning_rate": 4.1543252169479546e-06, + "loss": 0.0, + "num_input_tokens_seen": 64852944, + "step": 111785 + }, + { + "epoch": 16.650282990765565, + "grad_norm": 1.4294492757471744e-05, + "learning_rate": 4.15253162836757e-06, + "loss": 0.0, + "num_input_tokens_seen": 64855856, + "step": 111790 + }, + { + "epoch": 16.651027703306525, + "grad_norm": 0.00018657947657629848, + "learning_rate": 4.1507383919839795e-06, + "loss": 0.0, + "num_input_tokens_seen": 64858640, + "step": 111795 + }, + { + "epoch": 16.651772415847482, + "grad_norm": 0.00010041944915428758, + "learning_rate": 4.148945507827476e-06, + "loss": 0.0, + "num_input_tokens_seen": 64861616, + "step": 111800 + }, + { + "epoch": 16.652517128388443, + "grad_norm": 0.00015244503447320312, + "learning_rate": 4.147152975928336e-06, + "loss": 0.0, + "num_input_tokens_seen": 64864592, + "step": 111805 + }, + { + "epoch": 16.6532618409294, + "grad_norm": 3.0968098144512624e-05, + "learning_rate": 4.1453607963168604e-06, + "loss": 0.0, + "num_input_tokens_seen": 64867440, + "step": 111810 + }, + { + "epoch": 16.65400655347036, + "grad_norm": 0.00025672282208688557, + "learning_rate": 4.1435689690233205e-06, + "loss": 0.0, + "num_input_tokens_seen": 64870256, + "step": 111815 + }, + { + "epoch": 16.65475126601132, + "grad_norm": 4.316214472055435e-06, + "learning_rate": 4.141777494077978e-06, + "loss": 0.0, + "num_input_tokens_seen": 64873232, + "step": 111820 + }, + { + "epoch": 16.655495978552278, + "grad_norm": 0.00013137870701029897, + "learning_rate": 4.139986371511109e-06, + "loss": 0.0, + "num_input_tokens_seen": 64876304, + "step": 111825 + }, + { + "epoch": 16.656240691093238, + "grad_norm": 8.00523193902336e-05, + "learning_rate": 4.138195601352968e-06, + "loss": 0.0, + "num_input_tokens_seen": 64879056, + "step": 111830 + }, + { + "epoch": 16.6569854036342, + "grad_norm": 9.826890891417861e-05, + "learning_rate": 4.1364051836338125e-06, + "loss": 0.0, + "num_input_tokens_seen": 64882064, + "step": 111835 + }, + { + "epoch": 16.657730116175156, + "grad_norm": 7.237826594064245e-06, + "learning_rate": 4.134615118383878e-06, + "loss": 0.0, + "num_input_tokens_seen": 64885040, + "step": 111840 + }, + { + "epoch": 16.658474828716116, + "grad_norm": 24.804006576538086, + "learning_rate": 4.132825405633425e-06, + "loss": 0.0367, + "num_input_tokens_seen": 64888208, + "step": 111845 + }, + { + "epoch": 16.659219541257073, + "grad_norm": 2.2457131763076177e-06, + "learning_rate": 4.131036045412675e-06, + "loss": 0.0, + "num_input_tokens_seen": 64891088, + "step": 111850 + }, + { + "epoch": 16.659964253798034, + "grad_norm": 8.848573088471312e-06, + "learning_rate": 4.1292470377518625e-06, + "loss": 0.0, + "num_input_tokens_seen": 64893904, + "step": 111855 + }, + { + "epoch": 16.660708966338994, + "grad_norm": 8.709267603990156e-06, + "learning_rate": 4.12745838268121e-06, + "loss": 0.0, + "num_input_tokens_seen": 64896912, + "step": 111860 + }, + { + "epoch": 16.66145367887995, + "grad_norm": 7.223999546113191e-06, + "learning_rate": 4.125670080230926e-06, + "loss": 0.0, + "num_input_tokens_seen": 64900016, + "step": 111865 + }, + { + "epoch": 16.66219839142091, + "grad_norm": 2.015804420807399e-05, + "learning_rate": 4.123882130431236e-06, + "loss": 0.0, + "num_input_tokens_seen": 64902896, + "step": 111870 + }, + { + "epoch": 16.662943103961872, + "grad_norm": 7.695540261920542e-06, + "learning_rate": 4.122094533312337e-06, + "loss": 0.0009, + "num_input_tokens_seen": 64905840, + "step": 111875 + }, + { + "epoch": 16.66368781650283, + "grad_norm": 0.00010088104318128899, + "learning_rate": 4.120307288904435e-06, + "loss": 0.0, + "num_input_tokens_seen": 64908656, + "step": 111880 + }, + { + "epoch": 16.66443252904379, + "grad_norm": 9.430483623873442e-05, + "learning_rate": 4.118520397237715e-06, + "loss": 0.0, + "num_input_tokens_seen": 64911600, + "step": 111885 + }, + { + "epoch": 16.665177241584747, + "grad_norm": 0.0014313593273982406, + "learning_rate": 4.1167338583423755e-06, + "loss": 0.0, + "num_input_tokens_seen": 64914384, + "step": 111890 + }, + { + "epoch": 16.665921954125707, + "grad_norm": 0.0021872608922421932, + "learning_rate": 4.114947672248593e-06, + "loss": 0.0, + "num_input_tokens_seen": 64917232, + "step": 111895 + }, + { + "epoch": 16.666666666666668, + "grad_norm": 1.9675704606925137e-05, + "learning_rate": 4.113161838986537e-06, + "loss": 0.0, + "num_input_tokens_seen": 64919888, + "step": 111900 + }, + { + "epoch": 16.667411379207625, + "grad_norm": 5.278389289742336e-05, + "learning_rate": 4.111376358586388e-06, + "loss": 0.0, + "num_input_tokens_seen": 64922800, + "step": 111905 + }, + { + "epoch": 16.668156091748585, + "grad_norm": 0.0007947304402478039, + "learning_rate": 4.109591231078303e-06, + "loss": 0.0, + "num_input_tokens_seen": 64925552, + "step": 111910 + }, + { + "epoch": 16.668900804289546, + "grad_norm": 0.00038944551488384604, + "learning_rate": 4.107806456492444e-06, + "loss": 0.0, + "num_input_tokens_seen": 64928432, + "step": 111915 + }, + { + "epoch": 16.669645516830503, + "grad_norm": 9.897213749354705e-05, + "learning_rate": 4.106022034858961e-06, + "loss": 0.0, + "num_input_tokens_seen": 64931248, + "step": 111920 + }, + { + "epoch": 16.670390229371463, + "grad_norm": 1.844712096499279e-05, + "learning_rate": 4.104237966208002e-06, + "loss": 0.0, + "num_input_tokens_seen": 64934320, + "step": 111925 + }, + { + "epoch": 16.67113494191242, + "grad_norm": 3.295973510830663e-05, + "learning_rate": 4.102454250569698e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64937072, + "step": 111930 + }, + { + "epoch": 16.67187965445338, + "grad_norm": 2.189735369029222e-06, + "learning_rate": 4.1006708879741975e-06, + "loss": 0.0, + "num_input_tokens_seen": 64939984, + "step": 111935 + }, + { + "epoch": 16.67262436699434, + "grad_norm": 8.515040462953039e-06, + "learning_rate": 4.098887878451621e-06, + "loss": 0.0, + "num_input_tokens_seen": 64942800, + "step": 111940 + }, + { + "epoch": 16.6733690795353, + "grad_norm": 3.6610908864531666e-05, + "learning_rate": 4.097105222032083e-06, + "loss": 0.0, + "num_input_tokens_seen": 64945488, + "step": 111945 + }, + { + "epoch": 16.67411379207626, + "grad_norm": 2.886073343688622e-05, + "learning_rate": 4.095322918745717e-06, + "loss": 0.0001, + "num_input_tokens_seen": 64948368, + "step": 111950 + }, + { + "epoch": 16.67485850461722, + "grad_norm": 0.0007461833884008229, + "learning_rate": 4.093540968622614e-06, + "loss": 0.0, + "num_input_tokens_seen": 64951216, + "step": 111955 + }, + { + "epoch": 16.675603217158177, + "grad_norm": 2.8497150196926668e-05, + "learning_rate": 4.091759371692896e-06, + "loss": 0.0, + "num_input_tokens_seen": 64954352, + "step": 111960 + }, + { + "epoch": 16.676347929699137, + "grad_norm": 7.496876060031354e-05, + "learning_rate": 4.089978127986646e-06, + "loss": 0.0, + "num_input_tokens_seen": 64957232, + "step": 111965 + }, + { + "epoch": 16.677092642240094, + "grad_norm": 0.00011651503155007958, + "learning_rate": 4.08819723753397e-06, + "loss": 0.0, + "num_input_tokens_seen": 64960240, + "step": 111970 + }, + { + "epoch": 16.677837354781055, + "grad_norm": 1.545744453324005e-05, + "learning_rate": 4.086416700364948e-06, + "loss": 0.0, + "num_input_tokens_seen": 64963280, + "step": 111975 + }, + { + "epoch": 16.678582067322015, + "grad_norm": 1.6639518435113132e-05, + "learning_rate": 4.084636516509654e-06, + "loss": 0.0003, + "num_input_tokens_seen": 64966352, + "step": 111980 + }, + { + "epoch": 16.679326779862972, + "grad_norm": 7.122482202248648e-06, + "learning_rate": 4.0828566859981765e-06, + "loss": 0.0, + "num_input_tokens_seen": 64969392, + "step": 111985 + }, + { + "epoch": 16.680071492403933, + "grad_norm": 0.00010122730600414798, + "learning_rate": 4.081077208860573e-06, + "loss": 0.0, + "num_input_tokens_seen": 64972304, + "step": 111990 + }, + { + "epoch": 16.68081620494489, + "grad_norm": 7.946605910547078e-05, + "learning_rate": 4.079298085126912e-06, + "loss": 0.0, + "num_input_tokens_seen": 64975664, + "step": 111995 + }, + { + "epoch": 16.68156091748585, + "grad_norm": 3.291710891062394e-05, + "learning_rate": 4.0775193148272385e-06, + "loss": 0.0, + "num_input_tokens_seen": 64978640, + "step": 112000 + }, + { + "epoch": 16.68230563002681, + "grad_norm": 0.5326766967773438, + "learning_rate": 4.07574089799162e-06, + "loss": 0.0007, + "num_input_tokens_seen": 64981904, + "step": 112005 + }, + { + "epoch": 16.683050342567768, + "grad_norm": 3.372018181835301e-05, + "learning_rate": 4.073962834650083e-06, + "loss": 0.0, + "num_input_tokens_seen": 64984656, + "step": 112010 + }, + { + "epoch": 16.683795055108728, + "grad_norm": 1.1341363460815046e-05, + "learning_rate": 4.072185124832684e-06, + "loss": 0.0, + "num_input_tokens_seen": 64987248, + "step": 112015 + }, + { + "epoch": 16.68453976764969, + "grad_norm": 1.5841274944250472e-05, + "learning_rate": 4.070407768569448e-06, + "loss": 0.0, + "num_input_tokens_seen": 64990096, + "step": 112020 + }, + { + "epoch": 16.685284480190646, + "grad_norm": 4.7596520744264126e-05, + "learning_rate": 4.068630765890393e-06, + "loss": 0.0, + "num_input_tokens_seen": 64992784, + "step": 112025 + }, + { + "epoch": 16.686029192731606, + "grad_norm": 0.0005784334498457611, + "learning_rate": 4.0668541168255556e-06, + "loss": 0.0, + "num_input_tokens_seen": 64995536, + "step": 112030 + }, + { + "epoch": 16.686773905272563, + "grad_norm": 5.954514563200064e-06, + "learning_rate": 4.065077821404934e-06, + "loss": 0.0, + "num_input_tokens_seen": 64998448, + "step": 112035 + }, + { + "epoch": 16.687518617813524, + "grad_norm": 4.625937890523346e-06, + "learning_rate": 4.0633018796585514e-06, + "loss": 0.0, + "num_input_tokens_seen": 65001040, + "step": 112040 + }, + { + "epoch": 16.688263330354484, + "grad_norm": 3.177717007929459e-05, + "learning_rate": 4.061526291616399e-06, + "loss": 0.0, + "num_input_tokens_seen": 65003984, + "step": 112045 + }, + { + "epoch": 16.68900804289544, + "grad_norm": 6.2907151914259885e-06, + "learning_rate": 4.059751057308486e-06, + "loss": 0.0, + "num_input_tokens_seen": 65007056, + "step": 112050 + }, + { + "epoch": 16.689752755436402, + "grad_norm": 2.6392011932330206e-05, + "learning_rate": 4.057976176764797e-06, + "loss": 0.0, + "num_input_tokens_seen": 65009968, + "step": 112055 + }, + { + "epoch": 16.690497467977362, + "grad_norm": 1.0633038073137868e-05, + "learning_rate": 4.056201650015315e-06, + "loss": 0.0, + "num_input_tokens_seen": 65012976, + "step": 112060 + }, + { + "epoch": 16.69124218051832, + "grad_norm": 7.183381512732012e-06, + "learning_rate": 4.054427477090019e-06, + "loss": 0.0, + "num_input_tokens_seen": 65016144, + "step": 112065 + }, + { + "epoch": 16.69198689305928, + "grad_norm": 2.7615478757070377e-05, + "learning_rate": 4.0526536580188766e-06, + "loss": 0.0, + "num_input_tokens_seen": 65018960, + "step": 112070 + }, + { + "epoch": 16.692731605600237, + "grad_norm": 2.1440589534904575e-06, + "learning_rate": 4.050880192831868e-06, + "loss": 0.0, + "num_input_tokens_seen": 65021872, + "step": 112075 + }, + { + "epoch": 16.693476318141197, + "grad_norm": 5.690749276254792e-06, + "learning_rate": 4.04910708155894e-06, + "loss": 0.0, + "num_input_tokens_seen": 65024656, + "step": 112080 + }, + { + "epoch": 16.694221030682158, + "grad_norm": 2.357635366934119e-06, + "learning_rate": 4.047334324230059e-06, + "loss": 0.0, + "num_input_tokens_seen": 65027536, + "step": 112085 + }, + { + "epoch": 16.694965743223115, + "grad_norm": 6.160351040307432e-05, + "learning_rate": 4.045561920875165e-06, + "loss": 0.0, + "num_input_tokens_seen": 65030576, + "step": 112090 + }, + { + "epoch": 16.695710455764075, + "grad_norm": 4.068631824338809e-06, + "learning_rate": 4.043789871524212e-06, + "loss": 0.0, + "num_input_tokens_seen": 65033456, + "step": 112095 + }, + { + "epoch": 16.696455168305036, + "grad_norm": 3.8957710785325617e-05, + "learning_rate": 4.042018176207127e-06, + "loss": 0.0, + "num_input_tokens_seen": 65036464, + "step": 112100 + }, + { + "epoch": 16.697199880845993, + "grad_norm": 2.5059909603442065e-05, + "learning_rate": 4.04024683495384e-06, + "loss": 0.0, + "num_input_tokens_seen": 65039696, + "step": 112105 + }, + { + "epoch": 16.697944593386953, + "grad_norm": 8.291652193292975e-06, + "learning_rate": 4.038475847794287e-06, + "loss": 0.0, + "num_input_tokens_seen": 65042224, + "step": 112110 + }, + { + "epoch": 16.69868930592791, + "grad_norm": 8.616538980277255e-05, + "learning_rate": 4.036705214758379e-06, + "loss": 0.0, + "num_input_tokens_seen": 65045104, + "step": 112115 + }, + { + "epoch": 16.69943401846887, + "grad_norm": 9.87843577604508e-06, + "learning_rate": 4.0349349358760255e-06, + "loss": 0.001, + "num_input_tokens_seen": 65047984, + "step": 112120 + }, + { + "epoch": 16.70017873100983, + "grad_norm": 7.00817909091711e-05, + "learning_rate": 4.0331650111771426e-06, + "loss": 0.0, + "num_input_tokens_seen": 65050832, + "step": 112125 + }, + { + "epoch": 16.70092344355079, + "grad_norm": 7.110586011549458e-06, + "learning_rate": 4.031395440691629e-06, + "loss": 0.0, + "num_input_tokens_seen": 65053744, + "step": 112130 + }, + { + "epoch": 16.70166815609175, + "grad_norm": 3.485266279312782e-05, + "learning_rate": 4.02962622444937e-06, + "loss": 0.0, + "num_input_tokens_seen": 65056688, + "step": 112135 + }, + { + "epoch": 16.702412868632706, + "grad_norm": 7.597115200042026e-06, + "learning_rate": 4.0278573624802695e-06, + "loss": 0.0, + "num_input_tokens_seen": 65059696, + "step": 112140 + }, + { + "epoch": 16.703157581173667, + "grad_norm": 0.0003235965850763023, + "learning_rate": 4.026088854814205e-06, + "loss": 0.0, + "num_input_tokens_seen": 65062672, + "step": 112145 + }, + { + "epoch": 16.703902293714627, + "grad_norm": 0.0021849425975233316, + "learning_rate": 4.024320701481044e-06, + "loss": 0.0, + "num_input_tokens_seen": 65065648, + "step": 112150 + }, + { + "epoch": 16.704647006255584, + "grad_norm": 8.033949598029722e-06, + "learning_rate": 4.0225529025106735e-06, + "loss": 0.0, + "num_input_tokens_seen": 65068592, + "step": 112155 + }, + { + "epoch": 16.705391718796545, + "grad_norm": 8.141586295096204e-05, + "learning_rate": 4.020785457932946e-06, + "loss": 0.0, + "num_input_tokens_seen": 65071632, + "step": 112160 + }, + { + "epoch": 16.706136431337505, + "grad_norm": 5.4128300689626485e-05, + "learning_rate": 4.01901836777773e-06, + "loss": 0.0, + "num_input_tokens_seen": 65074480, + "step": 112165 + }, + { + "epoch": 16.706881143878462, + "grad_norm": 9.910551852954086e-06, + "learning_rate": 4.01725163207487e-06, + "loss": 0.0, + "num_input_tokens_seen": 65077200, + "step": 112170 + }, + { + "epoch": 16.707625856419423, + "grad_norm": 1.2058172615070362e-05, + "learning_rate": 4.015485250854223e-06, + "loss": 0.0, + "num_input_tokens_seen": 65079888, + "step": 112175 + }, + { + "epoch": 16.70837056896038, + "grad_norm": 6.1438040575012565e-06, + "learning_rate": 4.013719224145623e-06, + "loss": 0.0, + "num_input_tokens_seen": 65082832, + "step": 112180 + }, + { + "epoch": 16.70911528150134, + "grad_norm": 0.0004166668513789773, + "learning_rate": 4.011953551978911e-06, + "loss": 0.0, + "num_input_tokens_seen": 65085520, + "step": 112185 + }, + { + "epoch": 16.7098599940423, + "grad_norm": 1.958255779754836e-05, + "learning_rate": 4.0101882343839105e-06, + "loss": 0.0, + "num_input_tokens_seen": 65088624, + "step": 112190 + }, + { + "epoch": 16.710604706583258, + "grad_norm": 1.0420165381219704e-05, + "learning_rate": 4.0084232713904405e-06, + "loss": 0.0, + "num_input_tokens_seen": 65091600, + "step": 112195 + }, + { + "epoch": 16.71134941912422, + "grad_norm": 0.004530100151896477, + "learning_rate": 4.006658663028331e-06, + "loss": 0.0, + "num_input_tokens_seen": 65094512, + "step": 112200 + }, + { + "epoch": 16.71209413166518, + "grad_norm": 0.00011285638902336359, + "learning_rate": 4.004894409327381e-06, + "loss": 0.0, + "num_input_tokens_seen": 65097360, + "step": 112205 + }, + { + "epoch": 16.712838844206136, + "grad_norm": 0.0008080004481598735, + "learning_rate": 4.0031305103174076e-06, + "loss": 0.0, + "num_input_tokens_seen": 65100464, + "step": 112210 + }, + { + "epoch": 16.713583556747096, + "grad_norm": 7.1164276960189454e-06, + "learning_rate": 4.001366966028197e-06, + "loss": 0.0, + "num_input_tokens_seen": 65103376, + "step": 112215 + }, + { + "epoch": 16.714328269288053, + "grad_norm": 4.9622271944826934e-06, + "learning_rate": 3.999603776489555e-06, + "loss": 0.0, + "num_input_tokens_seen": 65106384, + "step": 112220 + }, + { + "epoch": 16.715072981829014, + "grad_norm": 0.00014382344670593739, + "learning_rate": 3.997840941731265e-06, + "loss": 0.0, + "num_input_tokens_seen": 65108944, + "step": 112225 + }, + { + "epoch": 16.715817694369974, + "grad_norm": 0.00765964575111866, + "learning_rate": 3.996078461783098e-06, + "loss": 0.0, + "num_input_tokens_seen": 65112048, + "step": 112230 + }, + { + "epoch": 16.71656240691093, + "grad_norm": 7.424709474435076e-05, + "learning_rate": 3.994316336674847e-06, + "loss": 0.0, + "num_input_tokens_seen": 65114640, + "step": 112235 + }, + { + "epoch": 16.717307119451892, + "grad_norm": 1.0885731171583757e-05, + "learning_rate": 3.992554566436263e-06, + "loss": 0.0, + "num_input_tokens_seen": 65117552, + "step": 112240 + }, + { + "epoch": 16.718051831992852, + "grad_norm": 3.815335730905645e-06, + "learning_rate": 3.990793151097128e-06, + "loss": 0.0, + "num_input_tokens_seen": 65120656, + "step": 112245 + }, + { + "epoch": 16.71879654453381, + "grad_norm": 9.268612302548718e-06, + "learning_rate": 3.989032090687189e-06, + "loss": 0.0, + "num_input_tokens_seen": 65123600, + "step": 112250 + }, + { + "epoch": 16.71954125707477, + "grad_norm": 0.00010266334720654413, + "learning_rate": 3.987271385236197e-06, + "loss": 0.0, + "num_input_tokens_seen": 65126288, + "step": 112255 + }, + { + "epoch": 16.720285969615727, + "grad_norm": 4.259853812982328e-05, + "learning_rate": 3.9855110347739e-06, + "loss": 0.0, + "num_input_tokens_seen": 65129136, + "step": 112260 + }, + { + "epoch": 16.721030682156687, + "grad_norm": 2.7960832085227594e-05, + "learning_rate": 3.983751039330028e-06, + "loss": 0.0, + "num_input_tokens_seen": 65132112, + "step": 112265 + }, + { + "epoch": 16.721775394697648, + "grad_norm": 2.115669849445112e-05, + "learning_rate": 3.981991398934329e-06, + "loss": 0.0, + "num_input_tokens_seen": 65134832, + "step": 112270 + }, + { + "epoch": 16.722520107238605, + "grad_norm": 4.8701127525419e-05, + "learning_rate": 3.980232113616519e-06, + "loss": 0.0, + "num_input_tokens_seen": 65137648, + "step": 112275 + }, + { + "epoch": 16.723264819779565, + "grad_norm": 0.0001158110608230345, + "learning_rate": 3.978473183406328e-06, + "loss": 0.0, + "num_input_tokens_seen": 65140368, + "step": 112280 + }, + { + "epoch": 16.724009532320522, + "grad_norm": 2.7136072731082095e-06, + "learning_rate": 3.9767146083334625e-06, + "loss": 0.0, + "num_input_tokens_seen": 65143408, + "step": 112285 + }, + { + "epoch": 16.724754244861483, + "grad_norm": 7.573399489047006e-05, + "learning_rate": 3.974956388427642e-06, + "loss": 0.0, + "num_input_tokens_seen": 65146256, + "step": 112290 + }, + { + "epoch": 16.725498957402444, + "grad_norm": 1.0358174222346861e-05, + "learning_rate": 3.97319852371856e-06, + "loss": 0.0, + "num_input_tokens_seen": 65149392, + "step": 112295 + }, + { + "epoch": 16.7262436699434, + "grad_norm": 3.3594613341847435e-05, + "learning_rate": 3.971441014235921e-06, + "loss": 0.0, + "num_input_tokens_seen": 65152240, + "step": 112300 + }, + { + "epoch": 16.72698838248436, + "grad_norm": 4.6663903958688024e-06, + "learning_rate": 3.969683860009415e-06, + "loss": 0.0, + "num_input_tokens_seen": 65154992, + "step": 112305 + }, + { + "epoch": 16.72773309502532, + "grad_norm": 3.6844678106717765e-05, + "learning_rate": 3.967927061068721e-06, + "loss": 0.0, + "num_input_tokens_seen": 65157840, + "step": 112310 + }, + { + "epoch": 16.72847780756628, + "grad_norm": 4.309186988393776e-05, + "learning_rate": 3.966170617443529e-06, + "loss": 0.0, + "num_input_tokens_seen": 65160528, + "step": 112315 + }, + { + "epoch": 16.72922252010724, + "grad_norm": 9.807554306462407e-06, + "learning_rate": 3.964414529163507e-06, + "loss": 0.0, + "num_input_tokens_seen": 65163120, + "step": 112320 + }, + { + "epoch": 16.7299672326482, + "grad_norm": 0.00037167605478316545, + "learning_rate": 3.96265879625832e-06, + "loss": 0.0, + "num_input_tokens_seen": 65166064, + "step": 112325 + }, + { + "epoch": 16.730711945189157, + "grad_norm": 7.051602096908027e-06, + "learning_rate": 3.960903418757628e-06, + "loss": 0.0, + "num_input_tokens_seen": 65169168, + "step": 112330 + }, + { + "epoch": 16.731456657730117, + "grad_norm": 214.72357177734375, + "learning_rate": 3.959148396691092e-06, + "loss": 0.0175, + "num_input_tokens_seen": 65172304, + "step": 112335 + }, + { + "epoch": 16.732201370271074, + "grad_norm": 0.0003211283765267581, + "learning_rate": 3.957393730088363e-06, + "loss": 0.0, + "num_input_tokens_seen": 65175344, + "step": 112340 + }, + { + "epoch": 16.732946082812035, + "grad_norm": 8.156269905157387e-05, + "learning_rate": 3.9556394189790705e-06, + "loss": 0.0, + "num_input_tokens_seen": 65178288, + "step": 112345 + }, + { + "epoch": 16.733690795352995, + "grad_norm": 2.376669044679147e-06, + "learning_rate": 3.95388546339287e-06, + "loss": 0.0, + "num_input_tokens_seen": 65181296, + "step": 112350 + }, + { + "epoch": 16.734435507893952, + "grad_norm": 5.1244624046375975e-05, + "learning_rate": 3.9521318633593765e-06, + "loss": 0.0, + "num_input_tokens_seen": 65184080, + "step": 112355 + }, + { + "epoch": 16.735180220434913, + "grad_norm": 6.316709914244711e-05, + "learning_rate": 3.950378618908232e-06, + "loss": 0.0, + "num_input_tokens_seen": 65186832, + "step": 112360 + }, + { + "epoch": 16.73592493297587, + "grad_norm": 0.0009204164380207658, + "learning_rate": 3.948625730069039e-06, + "loss": 0.0271, + "num_input_tokens_seen": 65189840, + "step": 112365 + }, + { + "epoch": 16.73666964551683, + "grad_norm": 0.00011907439329661429, + "learning_rate": 3.946873196871423e-06, + "loss": 0.0, + "num_input_tokens_seen": 65192880, + "step": 112370 + }, + { + "epoch": 16.73741435805779, + "grad_norm": 5.360208433558e-06, + "learning_rate": 3.945121019344983e-06, + "loss": 0.0, + "num_input_tokens_seen": 65195728, + "step": 112375 + }, + { + "epoch": 16.738159070598748, + "grad_norm": 1.2932167919643689e-05, + "learning_rate": 3.943369197519328e-06, + "loss": 0.0, + "num_input_tokens_seen": 65198512, + "step": 112380 + }, + { + "epoch": 16.73890378313971, + "grad_norm": 0.7127500772476196, + "learning_rate": 3.941617731424052e-06, + "loss": 0.0003, + "num_input_tokens_seen": 65201200, + "step": 112385 + }, + { + "epoch": 16.73964849568067, + "grad_norm": 4.800685201189481e-05, + "learning_rate": 3.9398666210887395e-06, + "loss": 0.0, + "num_input_tokens_seen": 65204080, + "step": 112390 + }, + { + "epoch": 16.740393208221626, + "grad_norm": 4.716965486295521e-05, + "learning_rate": 3.938115866542977e-06, + "loss": 0.0, + "num_input_tokens_seen": 65207216, + "step": 112395 + }, + { + "epoch": 16.741137920762586, + "grad_norm": 3.2907031709328294e-05, + "learning_rate": 3.936365467816333e-06, + "loss": 0.0, + "num_input_tokens_seen": 65209968, + "step": 112400 + }, + { + "epoch": 16.741882633303543, + "grad_norm": 0.00011706459190463647, + "learning_rate": 3.934615424938395e-06, + "loss": 0.0, + "num_input_tokens_seen": 65213008, + "step": 112405 + }, + { + "epoch": 16.742627345844504, + "grad_norm": 3.5290493087813957e-06, + "learning_rate": 3.932865737938712e-06, + "loss": 0.0, + "num_input_tokens_seen": 65215728, + "step": 112410 + }, + { + "epoch": 16.743372058385464, + "grad_norm": 1.3127122656442225e-05, + "learning_rate": 3.9311164068468576e-06, + "loss": 0.0, + "num_input_tokens_seen": 65218928, + "step": 112415 + }, + { + "epoch": 16.74411677092642, + "grad_norm": 4.948114565195283e-06, + "learning_rate": 3.929367431692377e-06, + "loss": 0.0, + "num_input_tokens_seen": 65221936, + "step": 112420 + }, + { + "epoch": 16.744861483467382, + "grad_norm": 7.179229669418419e-06, + "learning_rate": 3.927618812504813e-06, + "loss": 0.0, + "num_input_tokens_seen": 65224912, + "step": 112425 + }, + { + "epoch": 16.745606196008342, + "grad_norm": 0.0006519059534184635, + "learning_rate": 3.925870549313718e-06, + "loss": 0.0, + "num_input_tokens_seen": 65227920, + "step": 112430 + }, + { + "epoch": 16.7463509085493, + "grad_norm": 0.00027183975907973945, + "learning_rate": 3.9241226421486145e-06, + "loss": 0.0, + "num_input_tokens_seen": 65231152, + "step": 112435 + }, + { + "epoch": 16.74709562109026, + "grad_norm": 2.4213726646848954e-05, + "learning_rate": 3.922375091039046e-06, + "loss": 0.0, + "num_input_tokens_seen": 65233968, + "step": 112440 + }, + { + "epoch": 16.747840333631217, + "grad_norm": 4.916551188216545e-05, + "learning_rate": 3.9206278960145255e-06, + "loss": 0.0, + "num_input_tokens_seen": 65236912, + "step": 112445 + }, + { + "epoch": 16.748585046172177, + "grad_norm": 6.683958417852409e-06, + "learning_rate": 3.918881057104567e-06, + "loss": 0.0, + "num_input_tokens_seen": 65239984, + "step": 112450 + }, + { + "epoch": 16.749329758713138, + "grad_norm": 9.351557673653588e-05, + "learning_rate": 3.917134574338696e-06, + "loss": 0.0, + "num_input_tokens_seen": 65242992, + "step": 112455 + }, + { + "epoch": 16.750074471254095, + "grad_norm": 2.1507960354938405e-06, + "learning_rate": 3.915388447746407e-06, + "loss": 0.0, + "num_input_tokens_seen": 65246000, + "step": 112460 + }, + { + "epoch": 16.750819183795056, + "grad_norm": 5.48513662579353e-06, + "learning_rate": 3.913642677357201e-06, + "loss": 0.0, + "num_input_tokens_seen": 65249008, + "step": 112465 + }, + { + "epoch": 16.751563896336016, + "grad_norm": 0.00021279968495946378, + "learning_rate": 3.911897263200564e-06, + "loss": 0.0, + "num_input_tokens_seen": 65251824, + "step": 112470 + }, + { + "epoch": 16.752308608876973, + "grad_norm": 4.478718255995773e-05, + "learning_rate": 3.910152205305998e-06, + "loss": 0.0, + "num_input_tokens_seen": 65254864, + "step": 112475 + }, + { + "epoch": 16.753053321417934, + "grad_norm": 4.18808212998556e-06, + "learning_rate": 3.908407503702966e-06, + "loss": 0.0, + "num_input_tokens_seen": 65257808, + "step": 112480 + }, + { + "epoch": 16.75379803395889, + "grad_norm": 1.2632466678041965e-05, + "learning_rate": 3.906663158420962e-06, + "loss": 0.0, + "num_input_tokens_seen": 65260560, + "step": 112485 + }, + { + "epoch": 16.75454274649985, + "grad_norm": 3.3382907531631645e-06, + "learning_rate": 3.904919169489438e-06, + "loss": 0.0, + "num_input_tokens_seen": 65263440, + "step": 112490 + }, + { + "epoch": 16.75528745904081, + "grad_norm": 1.334950684395153e-05, + "learning_rate": 3.90317553693787e-06, + "loss": 0.0, + "num_input_tokens_seen": 65266064, + "step": 112495 + }, + { + "epoch": 16.75603217158177, + "grad_norm": 7.326020568143576e-05, + "learning_rate": 3.90143226079571e-06, + "loss": 0.0, + "num_input_tokens_seen": 65269040, + "step": 112500 + }, + { + "epoch": 16.75677688412273, + "grad_norm": 0.0004657339595723897, + "learning_rate": 3.899689341092402e-06, + "loss": 0.0, + "num_input_tokens_seen": 65271920, + "step": 112505 + }, + { + "epoch": 16.757521596663686, + "grad_norm": 0.0001395081344526261, + "learning_rate": 3.897946777857406e-06, + "loss": 0.0, + "num_input_tokens_seen": 65274960, + "step": 112510 + }, + { + "epoch": 16.758266309204647, + "grad_norm": 0.00024217442842200398, + "learning_rate": 3.896204571120149e-06, + "loss": 0.0, + "num_input_tokens_seen": 65277904, + "step": 112515 + }, + { + "epoch": 16.759011021745607, + "grad_norm": 2.553205194999464e-05, + "learning_rate": 3.894462720910067e-06, + "loss": 0.0, + "num_input_tokens_seen": 65281040, + "step": 112520 + }, + { + "epoch": 16.759755734286564, + "grad_norm": 1.1464509952929802e-05, + "learning_rate": 3.892721227256582e-06, + "loss": 0.0, + "num_input_tokens_seen": 65283984, + "step": 112525 + }, + { + "epoch": 16.760500446827525, + "grad_norm": 3.4498665627324954e-05, + "learning_rate": 3.890980090189126e-06, + "loss": 0.0, + "num_input_tokens_seen": 65286800, + "step": 112530 + }, + { + "epoch": 16.761245159368485, + "grad_norm": 0.0007249368936754763, + "learning_rate": 3.8892393097370975e-06, + "loss": 0.0, + "num_input_tokens_seen": 65289712, + "step": 112535 + }, + { + "epoch": 16.761989871909442, + "grad_norm": 1.896515823318623e-05, + "learning_rate": 3.887498885929924e-06, + "loss": 0.0, + "num_input_tokens_seen": 65292400, + "step": 112540 + }, + { + "epoch": 16.762734584450403, + "grad_norm": 4.509588961809641e-06, + "learning_rate": 3.8857588187969975e-06, + "loss": 0.0, + "num_input_tokens_seen": 65295152, + "step": 112545 + }, + { + "epoch": 16.76347929699136, + "grad_norm": 1.3207847587182187e-05, + "learning_rate": 3.884019108367712e-06, + "loss": 0.0, + "num_input_tokens_seen": 65297776, + "step": 112550 + }, + { + "epoch": 16.76422400953232, + "grad_norm": 0.00015495423576794565, + "learning_rate": 3.882279754671467e-06, + "loss": 0.0, + "num_input_tokens_seen": 65300240, + "step": 112555 + }, + { + "epoch": 16.76496872207328, + "grad_norm": 1.0447919368743896, + "learning_rate": 3.880540757737636e-06, + "loss": 0.0001, + "num_input_tokens_seen": 65303184, + "step": 112560 + }, + { + "epoch": 16.765713434614238, + "grad_norm": 8.263128256658092e-05, + "learning_rate": 3.878802117595609e-06, + "loss": 0.0, + "num_input_tokens_seen": 65305872, + "step": 112565 + }, + { + "epoch": 16.7664581471552, + "grad_norm": 0.0008156992262229323, + "learning_rate": 3.877063834274749e-06, + "loss": 0.0, + "num_input_tokens_seen": 65309040, + "step": 112570 + }, + { + "epoch": 16.76720285969616, + "grad_norm": 1.311009873461444e-05, + "learning_rate": 3.8753259078044365e-06, + "loss": 0.0, + "num_input_tokens_seen": 65311856, + "step": 112575 + }, + { + "epoch": 16.767947572237116, + "grad_norm": 4.283075213606935e-06, + "learning_rate": 3.8735883382140184e-06, + "loss": 0.0, + "num_input_tokens_seen": 65314992, + "step": 112580 + }, + { + "epoch": 16.768692284778076, + "grad_norm": 2.3850310753914528e-05, + "learning_rate": 3.871851125532855e-06, + "loss": 0.0, + "num_input_tokens_seen": 65318160, + "step": 112585 + }, + { + "epoch": 16.769436997319033, + "grad_norm": 6.849669443909079e-05, + "learning_rate": 3.870114269790292e-06, + "loss": 0.0, + "num_input_tokens_seen": 65321200, + "step": 112590 + }, + { + "epoch": 16.770181709859994, + "grad_norm": 1.226033600687515e-05, + "learning_rate": 3.8683777710156685e-06, + "loss": 0.0, + "num_input_tokens_seen": 65324144, + "step": 112595 + }, + { + "epoch": 16.770926422400954, + "grad_norm": 4.334613095124951e-06, + "learning_rate": 3.866641629238329e-06, + "loss": 0.0014, + "num_input_tokens_seen": 65326992, + "step": 112600 + }, + { + "epoch": 16.77167113494191, + "grad_norm": 7.453733269358054e-05, + "learning_rate": 3.864905844487596e-06, + "loss": 0.0, + "num_input_tokens_seen": 65329808, + "step": 112605 + }, + { + "epoch": 16.772415847482872, + "grad_norm": 4.365209861134645e-06, + "learning_rate": 3.8631704167928025e-06, + "loss": 0.0, + "num_input_tokens_seen": 65332624, + "step": 112610 + }, + { + "epoch": 16.773160560023832, + "grad_norm": 3.8097856304375455e-05, + "learning_rate": 3.861435346183259e-06, + "loss": 0.0, + "num_input_tokens_seen": 65335632, + "step": 112615 + }, + { + "epoch": 16.77390527256479, + "grad_norm": 11.30898666381836, + "learning_rate": 3.859700632688285e-06, + "loss": 0.0008, + "num_input_tokens_seen": 65338640, + "step": 112620 + }, + { + "epoch": 16.77464998510575, + "grad_norm": 2.18390937334334e-06, + "learning_rate": 3.857966276337183e-06, + "loss": 0.0, + "num_input_tokens_seen": 65341296, + "step": 112625 + }, + { + "epoch": 16.775394697646707, + "grad_norm": 4.8150533984880894e-05, + "learning_rate": 3.856232277159247e-06, + "loss": 0.0, + "num_input_tokens_seen": 65344368, + "step": 112630 + }, + { + "epoch": 16.776139410187668, + "grad_norm": 0.00028835557168349624, + "learning_rate": 3.8544986351837845e-06, + "loss": 0.0, + "num_input_tokens_seen": 65347184, + "step": 112635 + }, + { + "epoch": 16.776884122728628, + "grad_norm": 5.799569862574572e-06, + "learning_rate": 3.852765350440069e-06, + "loss": 0.0, + "num_input_tokens_seen": 65349840, + "step": 112640 + }, + { + "epoch": 16.777628835269585, + "grad_norm": 3.7606334899464855e-06, + "learning_rate": 3.8510324229573956e-06, + "loss": 0.0, + "num_input_tokens_seen": 65352528, + "step": 112645 + }, + { + "epoch": 16.778373547810546, + "grad_norm": 8.202598110074177e-05, + "learning_rate": 3.849299852765034e-06, + "loss": 0.0, + "num_input_tokens_seen": 65355504, + "step": 112650 + }, + { + "epoch": 16.779118260351503, + "grad_norm": 7.190164978965186e-06, + "learning_rate": 3.847567639892255e-06, + "loss": 0.0, + "num_input_tokens_seen": 65358256, + "step": 112655 + }, + { + "epoch": 16.779862972892463, + "grad_norm": 4.127305601286935e-06, + "learning_rate": 3.845835784368324e-06, + "loss": 0.0, + "num_input_tokens_seen": 65361232, + "step": 112660 + }, + { + "epoch": 16.780607685433424, + "grad_norm": 1.5251394870574586e-05, + "learning_rate": 3.844104286222492e-06, + "loss": 0.0, + "num_input_tokens_seen": 65364176, + "step": 112665 + }, + { + "epoch": 16.78135239797438, + "grad_norm": 3.2249031391984317e-06, + "learning_rate": 3.8423731454840185e-06, + "loss": 0.0, + "num_input_tokens_seen": 65367184, + "step": 112670 + }, + { + "epoch": 16.78209711051534, + "grad_norm": 1.8597926100483164e-05, + "learning_rate": 3.840642362182143e-06, + "loss": 0.0, + "num_input_tokens_seen": 65369904, + "step": 112675 + }, + { + "epoch": 16.7828418230563, + "grad_norm": 1.02494914244744e-05, + "learning_rate": 3.838911936346115e-06, + "loss": 0.0, + "num_input_tokens_seen": 65372688, + "step": 112680 + }, + { + "epoch": 16.78358653559726, + "grad_norm": 9.069820953300223e-05, + "learning_rate": 3.837181868005158e-06, + "loss": 0.0, + "num_input_tokens_seen": 65375600, + "step": 112685 + }, + { + "epoch": 16.78433124813822, + "grad_norm": 0.0010698545956984162, + "learning_rate": 3.83545215718851e-06, + "loss": 0.0, + "num_input_tokens_seen": 65378736, + "step": 112690 + }, + { + "epoch": 16.785075960679176, + "grad_norm": 0.00038117432268336415, + "learning_rate": 3.83372280392538e-06, + "loss": 0.0, + "num_input_tokens_seen": 65381520, + "step": 112695 + }, + { + "epoch": 16.785820673220137, + "grad_norm": 0.00016957870684564114, + "learning_rate": 3.831993808244996e-06, + "loss": 0.0, + "num_input_tokens_seen": 65384432, + "step": 112700 + }, + { + "epoch": 16.786565385761097, + "grad_norm": 3.5933644539909437e-06, + "learning_rate": 3.830265170176564e-06, + "loss": 0.0, + "num_input_tokens_seen": 65387184, + "step": 112705 + }, + { + "epoch": 16.787310098302054, + "grad_norm": 1.1021002137567848e-05, + "learning_rate": 3.828536889749282e-06, + "loss": 0.0, + "num_input_tokens_seen": 65390128, + "step": 112710 + }, + { + "epoch": 16.788054810843015, + "grad_norm": 3.5451562325761188e-06, + "learning_rate": 3.826808966992354e-06, + "loss": 0.0, + "num_input_tokens_seen": 65392592, + "step": 112715 + }, + { + "epoch": 16.788799523383975, + "grad_norm": 4.144776994507993e-06, + "learning_rate": 3.825081401934971e-06, + "loss": 0.0, + "num_input_tokens_seen": 65395472, + "step": 112720 + }, + { + "epoch": 16.789544235924932, + "grad_norm": 1.1885019375768024e-05, + "learning_rate": 3.823354194606316e-06, + "loss": 0.0, + "num_input_tokens_seen": 65398544, + "step": 112725 + }, + { + "epoch": 16.790288948465893, + "grad_norm": 2.6692782739701215e-06, + "learning_rate": 3.821627345035561e-06, + "loss": 0.0, + "num_input_tokens_seen": 65401328, + "step": 112730 + }, + { + "epoch": 16.79103366100685, + "grad_norm": 4.139550128456904e-06, + "learning_rate": 3.819900853251896e-06, + "loss": 0.0, + "num_input_tokens_seen": 65404464, + "step": 112735 + }, + { + "epoch": 16.79177837354781, + "grad_norm": 1.1799599633377511e-05, + "learning_rate": 3.818174719284473e-06, + "loss": 0.0, + "num_input_tokens_seen": 65407632, + "step": 112740 + }, + { + "epoch": 16.79252308608877, + "grad_norm": 4.855135557590984e-06, + "learning_rate": 3.816448943162465e-06, + "loss": 0.0, + "num_input_tokens_seen": 65410384, + "step": 112745 + }, + { + "epoch": 16.793267798629728, + "grad_norm": 1.330572285951348e-05, + "learning_rate": 3.814723524915023e-06, + "loss": 0.0, + "num_input_tokens_seen": 65413328, + "step": 112750 + }, + { + "epoch": 16.79401251117069, + "grad_norm": 3.943577212339733e-06, + "learning_rate": 3.812998464571288e-06, + "loss": 0.0, + "num_input_tokens_seen": 65416336, + "step": 112755 + }, + { + "epoch": 16.79475722371165, + "grad_norm": 7.830821232346352e-06, + "learning_rate": 3.8112737621604168e-06, + "loss": 0.0, + "num_input_tokens_seen": 65419472, + "step": 112760 + }, + { + "epoch": 16.795501936252606, + "grad_norm": 15.409187316894531, + "learning_rate": 3.8095494177115358e-06, + "loss": 0.0852, + "num_input_tokens_seen": 65422128, + "step": 112765 + }, + { + "epoch": 16.796246648793566, + "grad_norm": 1.017122485791333e-05, + "learning_rate": 3.8078254312537836e-06, + "loss": 0.0, + "num_input_tokens_seen": 65425072, + "step": 112770 + }, + { + "epoch": 16.796991361334523, + "grad_norm": 1.1729011535644531, + "learning_rate": 3.806101802816284e-06, + "loss": 0.0, + "num_input_tokens_seen": 65427952, + "step": 112775 + }, + { + "epoch": 16.797736073875484, + "grad_norm": 4.166062808508286e-06, + "learning_rate": 3.804378532428149e-06, + "loss": 0.0, + "num_input_tokens_seen": 65431088, + "step": 112780 + }, + { + "epoch": 16.798480786416444, + "grad_norm": 7.730774996161927e-06, + "learning_rate": 3.8026556201185003e-06, + "loss": 0.0, + "num_input_tokens_seen": 65433840, + "step": 112785 + }, + { + "epoch": 16.7992254989574, + "grad_norm": 1.1142170478706248e-05, + "learning_rate": 3.8009330659164425e-06, + "loss": 0.0, + "num_input_tokens_seen": 65436784, + "step": 112790 + }, + { + "epoch": 16.799970211498362, + "grad_norm": 3.213016270819935e-06, + "learning_rate": 3.799210869851072e-06, + "loss": 0.0, + "num_input_tokens_seen": 65439568, + "step": 112795 + }, + { + "epoch": 16.800714924039323, + "grad_norm": 5.3376923460746184e-05, + "learning_rate": 3.7974890319514816e-06, + "loss": 0.0, + "num_input_tokens_seen": 65442352, + "step": 112800 + }, + { + "epoch": 16.80145963658028, + "grad_norm": 4.619610990630463e-06, + "learning_rate": 3.7957675522467735e-06, + "loss": 0.0, + "num_input_tokens_seen": 65445232, + "step": 112805 + }, + { + "epoch": 16.80220434912124, + "grad_norm": 3.6235738662071526e-05, + "learning_rate": 3.7940464307660126e-06, + "loss": 0.0, + "num_input_tokens_seen": 65447824, + "step": 112810 + }, + { + "epoch": 16.802949061662197, + "grad_norm": 4.832907961827004e-06, + "learning_rate": 3.792325667538291e-06, + "loss": 0.0, + "num_input_tokens_seen": 65450800, + "step": 112815 + }, + { + "epoch": 16.803693774203158, + "grad_norm": 6.616574864892755e-06, + "learning_rate": 3.790605262592667e-06, + "loss": 0.0, + "num_input_tokens_seen": 65453712, + "step": 112820 + }, + { + "epoch": 16.804438486744118, + "grad_norm": 6.738078354828758e-06, + "learning_rate": 3.7888852159582154e-06, + "loss": 0.0, + "num_input_tokens_seen": 65456784, + "step": 112825 + }, + { + "epoch": 16.805183199285075, + "grad_norm": 9.21806349651888e-06, + "learning_rate": 3.7871655276639937e-06, + "loss": 0.0, + "num_input_tokens_seen": 65459696, + "step": 112830 + }, + { + "epoch": 16.805927911826036, + "grad_norm": 0.0004995937342755497, + "learning_rate": 3.7854461977390426e-06, + "loss": 0.0002, + "num_input_tokens_seen": 65462768, + "step": 112835 + }, + { + "epoch": 16.806672624366996, + "grad_norm": 3.7910256196482806e-06, + "learning_rate": 3.7837272262124216e-06, + "loss": 0.0, + "num_input_tokens_seen": 65465392, + "step": 112840 + }, + { + "epoch": 16.807417336907953, + "grad_norm": 3.5572365959524177e-06, + "learning_rate": 3.7820086131131667e-06, + "loss": 0.0, + "num_input_tokens_seen": 65468496, + "step": 112845 + }, + { + "epoch": 16.808162049448914, + "grad_norm": 7.523505337303504e-05, + "learning_rate": 3.780290358470309e-06, + "loss": 0.0, + "num_input_tokens_seen": 65471536, + "step": 112850 + }, + { + "epoch": 16.80890676198987, + "grad_norm": 5.1915271797042806e-06, + "learning_rate": 3.7785724623128766e-06, + "loss": 0.0, + "num_input_tokens_seen": 65474992, + "step": 112855 + }, + { + "epoch": 16.80965147453083, + "grad_norm": 3.904408004018478e-06, + "learning_rate": 3.776854924669898e-06, + "loss": 0.0, + "num_input_tokens_seen": 65478064, + "step": 112860 + }, + { + "epoch": 16.81039618707179, + "grad_norm": 2.5116439701378113e-06, + "learning_rate": 3.775137745570387e-06, + "loss": 0.0, + "num_input_tokens_seen": 65480848, + "step": 112865 + }, + { + "epoch": 16.81114089961275, + "grad_norm": 1.8606539015308954e-05, + "learning_rate": 3.7734209250433445e-06, + "loss": 0.0, + "num_input_tokens_seen": 65483632, + "step": 112870 + }, + { + "epoch": 16.81188561215371, + "grad_norm": 2.180500996473711e-05, + "learning_rate": 3.7717044631177867e-06, + "loss": 0.0, + "num_input_tokens_seen": 65486224, + "step": 112875 + }, + { + "epoch": 16.812630324694666, + "grad_norm": 2.2738557163393125e-05, + "learning_rate": 3.7699883598227016e-06, + "loss": 0.0014, + "num_input_tokens_seen": 65488880, + "step": 112880 + }, + { + "epoch": 16.813375037235627, + "grad_norm": 5.957495432085125e-06, + "learning_rate": 3.768272615187091e-06, + "loss": 0.0, + "num_input_tokens_seen": 65492048, + "step": 112885 + }, + { + "epoch": 16.814119749776587, + "grad_norm": 1.938907189469319e-05, + "learning_rate": 3.766557229239931e-06, + "loss": 0.0, + "num_input_tokens_seen": 65494960, + "step": 112890 + }, + { + "epoch": 16.814864462317544, + "grad_norm": 2.0891884560114704e-05, + "learning_rate": 3.7648422020102105e-06, + "loss": 0.0, + "num_input_tokens_seen": 65497904, + "step": 112895 + }, + { + "epoch": 16.815609174858505, + "grad_norm": 0.00015520543092861772, + "learning_rate": 3.7631275335268946e-06, + "loss": 0.0, + "num_input_tokens_seen": 65500944, + "step": 112900 + }, + { + "epoch": 16.816353887399465, + "grad_norm": 1.0923224181169644e-05, + "learning_rate": 3.7614132238189605e-06, + "loss": 0.0, + "num_input_tokens_seen": 65503696, + "step": 112905 + }, + { + "epoch": 16.817098599940422, + "grad_norm": 8.714278919796925e-06, + "learning_rate": 3.7596992729153623e-06, + "loss": 0.0, + "num_input_tokens_seen": 65506512, + "step": 112910 + }, + { + "epoch": 16.817843312481383, + "grad_norm": 3.0730318485439057e-06, + "learning_rate": 3.757985680845058e-06, + "loss": 0.0, + "num_input_tokens_seen": 65510672, + "step": 112915 + }, + { + "epoch": 16.81858802502234, + "grad_norm": 0.00011497159721329808, + "learning_rate": 3.7562724476369963e-06, + "loss": 0.0, + "num_input_tokens_seen": 65513616, + "step": 112920 + }, + { + "epoch": 16.8193327375633, + "grad_norm": 5.5968888773350045e-05, + "learning_rate": 3.7545595733201126e-06, + "loss": 0.0, + "num_input_tokens_seen": 65516336, + "step": 112925 + }, + { + "epoch": 16.82007745010426, + "grad_norm": 2.493119154678425e-06, + "learning_rate": 3.7528470579233614e-06, + "loss": 0.0009, + "num_input_tokens_seen": 65519088, + "step": 112930 + }, + { + "epoch": 16.820822162645218, + "grad_norm": 3.7362187867984176e-06, + "learning_rate": 3.751134901475656e-06, + "loss": 0.0, + "num_input_tokens_seen": 65521744, + "step": 112935 + }, + { + "epoch": 16.82156687518618, + "grad_norm": 0.000427340593887493, + "learning_rate": 3.7494231040059364e-06, + "loss": 0.0, + "num_input_tokens_seen": 65524432, + "step": 112940 + }, + { + "epoch": 16.82231158772714, + "grad_norm": 8.918443199945614e-05, + "learning_rate": 3.7477116655431162e-06, + "loss": 0.0, + "num_input_tokens_seen": 65527440, + "step": 112945 + }, + { + "epoch": 16.823056300268096, + "grad_norm": 7.8592138379463e-06, + "learning_rate": 3.7460005861161024e-06, + "loss": 0.0, + "num_input_tokens_seen": 65530320, + "step": 112950 + }, + { + "epoch": 16.823801012809056, + "grad_norm": 5.329133273335174e-06, + "learning_rate": 3.7442898657538113e-06, + "loss": 0.3406, + "num_input_tokens_seen": 65533008, + "step": 112955 + }, + { + "epoch": 16.824545725350013, + "grad_norm": 1.5317513316404074e-05, + "learning_rate": 3.7425795044851357e-06, + "loss": 0.0, + "num_input_tokens_seen": 65535760, + "step": 112960 + }, + { + "epoch": 16.825290437890974, + "grad_norm": 4.451811946637463e-06, + "learning_rate": 3.7408695023389785e-06, + "loss": 0.0, + "num_input_tokens_seen": 65538736, + "step": 112965 + }, + { + "epoch": 16.826035150431935, + "grad_norm": 2.212137587775942e-06, + "learning_rate": 3.7391598593442184e-06, + "loss": 0.0, + "num_input_tokens_seen": 65541776, + "step": 112970 + }, + { + "epoch": 16.82677986297289, + "grad_norm": 1.607339254405815e-05, + "learning_rate": 3.7374505755297494e-06, + "loss": 0.0, + "num_input_tokens_seen": 65544464, + "step": 112975 + }, + { + "epoch": 16.827524575513852, + "grad_norm": 9.229906027030665e-06, + "learning_rate": 3.735741650924443e-06, + "loss": 0.0, + "num_input_tokens_seen": 65547600, + "step": 112980 + }, + { + "epoch": 16.828269288054813, + "grad_norm": 4.809933670912869e-06, + "learning_rate": 3.7340330855571704e-06, + "loss": 0.0, + "num_input_tokens_seen": 65550576, + "step": 112985 + }, + { + "epoch": 16.82901400059577, + "grad_norm": 0.0011111374478787184, + "learning_rate": 3.7323248794567942e-06, + "loss": 0.0, + "num_input_tokens_seen": 65553424, + "step": 112990 + }, + { + "epoch": 16.82975871313673, + "grad_norm": 1.5815136066521518e-05, + "learning_rate": 3.7306170326521674e-06, + "loss": 0.0, + "num_input_tokens_seen": 65556464, + "step": 112995 + }, + { + "epoch": 16.830503425677687, + "grad_norm": 4.1925548430299386e-05, + "learning_rate": 3.7289095451721546e-06, + "loss": 0.0, + "num_input_tokens_seen": 65559280, + "step": 113000 + }, + { + "epoch": 16.831248138218648, + "grad_norm": 1.698668347671628e-05, + "learning_rate": 3.7272024170455914e-06, + "loss": 0.0, + "num_input_tokens_seen": 65562096, + "step": 113005 + }, + { + "epoch": 16.831992850759608, + "grad_norm": 1.4059199202165473e-05, + "learning_rate": 3.7254956483013278e-06, + "loss": 0.0, + "num_input_tokens_seen": 65564848, + "step": 113010 + }, + { + "epoch": 16.832737563300565, + "grad_norm": 2.4346012651221827e-05, + "learning_rate": 3.7237892389681866e-06, + "loss": 0.0, + "num_input_tokens_seen": 65567824, + "step": 113015 + }, + { + "epoch": 16.833482275841526, + "grad_norm": 1.6267587852780707e-05, + "learning_rate": 3.7220831890750067e-06, + "loss": 0.0001, + "num_input_tokens_seen": 65570512, + "step": 113020 + }, + { + "epoch": 16.834226988382483, + "grad_norm": 3.3757796700228937e-06, + "learning_rate": 3.7203774986506067e-06, + "loss": 0.0, + "num_input_tokens_seen": 65573200, + "step": 113025 + }, + { + "epoch": 16.834971700923443, + "grad_norm": 1.1066552360716742e-05, + "learning_rate": 3.718672167723797e-06, + "loss": 0.0, + "num_input_tokens_seen": 65576240, + "step": 113030 + }, + { + "epoch": 16.835716413464404, + "grad_norm": 0.0003894373367074877, + "learning_rate": 3.7169671963233952e-06, + "loss": 0.0, + "num_input_tokens_seen": 65578864, + "step": 113035 + }, + { + "epoch": 16.83646112600536, + "grad_norm": 7.484812522307038e-05, + "learning_rate": 3.7152625844781963e-06, + "loss": 0.0, + "num_input_tokens_seen": 65581872, + "step": 113040 + }, + { + "epoch": 16.83720583854632, + "grad_norm": 8.976428031921387, + "learning_rate": 3.7135583322170098e-06, + "loss": 0.0017, + "num_input_tokens_seen": 65584944, + "step": 113045 + }, + { + "epoch": 16.837950551087282, + "grad_norm": 0.022722484543919563, + "learning_rate": 3.7118544395686213e-06, + "loss": 0.0002, + "num_input_tokens_seen": 65587504, + "step": 113050 + }, + { + "epoch": 16.83869526362824, + "grad_norm": 7.427986565744504e-05, + "learning_rate": 3.710150906561813e-06, + "loss": 0.0, + "num_input_tokens_seen": 65590512, + "step": 113055 + }, + { + "epoch": 16.8394399761692, + "grad_norm": 2.1449110136018135e-05, + "learning_rate": 3.708447733225362e-06, + "loss": 0.0, + "num_input_tokens_seen": 65593424, + "step": 113060 + }, + { + "epoch": 16.840184688710156, + "grad_norm": 9.294023038819432e-05, + "learning_rate": 3.706744919588054e-06, + "loss": 0.0, + "num_input_tokens_seen": 65596080, + "step": 113065 + }, + { + "epoch": 16.840929401251117, + "grad_norm": 1.3782498172076885e-05, + "learning_rate": 3.7050424656786486e-06, + "loss": 0.0, + "num_input_tokens_seen": 65598992, + "step": 113070 + }, + { + "epoch": 16.841674113792077, + "grad_norm": 2.737750401138328e-05, + "learning_rate": 3.7033403715259014e-06, + "loss": 0.0001, + "num_input_tokens_seen": 65601968, + "step": 113075 + }, + { + "epoch": 16.842418826333034, + "grad_norm": 2.8669630864897044e-06, + "learning_rate": 3.7016386371585804e-06, + "loss": 0.0001, + "num_input_tokens_seen": 65604720, + "step": 113080 + }, + { + "epoch": 16.843163538873995, + "grad_norm": 0.0303751602768898, + "learning_rate": 3.6999372626054212e-06, + "loss": 0.0, + "num_input_tokens_seen": 65607440, + "step": 113085 + }, + { + "epoch": 16.843908251414955, + "grad_norm": 9.377181413583457e-06, + "learning_rate": 3.6982362478951786e-06, + "loss": 0.0, + "num_input_tokens_seen": 65610320, + "step": 113090 + }, + { + "epoch": 16.844652963955912, + "grad_norm": 4.013422767457087e-06, + "learning_rate": 3.6965355930565796e-06, + "loss": 0.0, + "num_input_tokens_seen": 65613584, + "step": 113095 + }, + { + "epoch": 16.845397676496873, + "grad_norm": 3.446190703471075e-06, + "learning_rate": 3.6948352981183683e-06, + "loss": 0.0, + "num_input_tokens_seen": 65616432, + "step": 113100 + }, + { + "epoch": 16.84614238903783, + "grad_norm": 5.471026270242874e-06, + "learning_rate": 3.693135363109257e-06, + "loss": 0.0, + "num_input_tokens_seen": 65619184, + "step": 113105 + }, + { + "epoch": 16.84688710157879, + "grad_norm": 2.1335748897399753e-05, + "learning_rate": 3.6914357880579647e-06, + "loss": 0.0, + "num_input_tokens_seen": 65622000, + "step": 113110 + }, + { + "epoch": 16.84763181411975, + "grad_norm": 0.0019045608351007104, + "learning_rate": 3.689736572993213e-06, + "loss": 0.0, + "num_input_tokens_seen": 65624816, + "step": 113115 + }, + { + "epoch": 16.848376526660708, + "grad_norm": 0.00010965554974973202, + "learning_rate": 3.6880377179437014e-06, + "loss": 0.0, + "num_input_tokens_seen": 65627696, + "step": 113120 + }, + { + "epoch": 16.84912123920167, + "grad_norm": 0.00026959972456097603, + "learning_rate": 3.6863392229381342e-06, + "loss": 0.0, + "num_input_tokens_seen": 65630544, + "step": 113125 + }, + { + "epoch": 16.84986595174263, + "grad_norm": 2.332575968466699e-05, + "learning_rate": 3.684641088005197e-06, + "loss": 0.0, + "num_input_tokens_seen": 65633584, + "step": 113130 + }, + { + "epoch": 16.850610664283586, + "grad_norm": 6.061168733140221e-06, + "learning_rate": 3.6829433131735895e-06, + "loss": 0.0, + "num_input_tokens_seen": 65636592, + "step": 113135 + }, + { + "epoch": 16.851355376824547, + "grad_norm": 1.2330368008406367e-05, + "learning_rate": 3.6812458984719857e-06, + "loss": 0.0, + "num_input_tokens_seen": 65639408, + "step": 113140 + }, + { + "epoch": 16.852100089365504, + "grad_norm": 5.460307147586718e-05, + "learning_rate": 3.6795488439290706e-06, + "loss": 0.0, + "num_input_tokens_seen": 65641968, + "step": 113145 + }, + { + "epoch": 16.852844801906464, + "grad_norm": 1.4451909009949304e-05, + "learning_rate": 3.6778521495735073e-06, + "loss": 0.0, + "num_input_tokens_seen": 65644720, + "step": 113150 + }, + { + "epoch": 16.853589514447425, + "grad_norm": 3.0595554108003853e-06, + "learning_rate": 3.6761558154339544e-06, + "loss": 0.0, + "num_input_tokens_seen": 65648016, + "step": 113155 + }, + { + "epoch": 16.85433422698838, + "grad_norm": 1.4889013073116075e-05, + "learning_rate": 3.6744598415390823e-06, + "loss": 0.0, + "num_input_tokens_seen": 65650800, + "step": 113160 + }, + { + "epoch": 16.855078939529342, + "grad_norm": 2.4645280063850805e-05, + "learning_rate": 3.672764227917533e-06, + "loss": 0.0, + "num_input_tokens_seen": 65653456, + "step": 113165 + }, + { + "epoch": 16.8558236520703, + "grad_norm": 7.5193865995970555e-06, + "learning_rate": 3.6710689745979606e-06, + "loss": 0.0, + "num_input_tokens_seen": 65656368, + "step": 113170 + }, + { + "epoch": 16.85656836461126, + "grad_norm": 3.6280835047364235e-06, + "learning_rate": 3.6693740816090005e-06, + "loss": 0.0, + "num_input_tokens_seen": 65659088, + "step": 113175 + }, + { + "epoch": 16.85731307715222, + "grad_norm": 3.321240365039557e-05, + "learning_rate": 3.6676795489792857e-06, + "loss": 0.0, + "num_input_tokens_seen": 65661744, + "step": 113180 + }, + { + "epoch": 16.858057789693177, + "grad_norm": 1.1608470231294632e-05, + "learning_rate": 3.665985376737438e-06, + "loss": 0.0, + "num_input_tokens_seen": 65664560, + "step": 113185 + }, + { + "epoch": 16.858802502234138, + "grad_norm": 2.7007723474525847e-05, + "learning_rate": 3.6642915649120894e-06, + "loss": 0.0, + "num_input_tokens_seen": 65667376, + "step": 113190 + }, + { + "epoch": 16.859547214775098, + "grad_norm": 4.185754369245842e-05, + "learning_rate": 3.662598113531851e-06, + "loss": 0.0, + "num_input_tokens_seen": 65670192, + "step": 113195 + }, + { + "epoch": 16.860291927316055, + "grad_norm": 0.00896116066724062, + "learning_rate": 3.6609050226253273e-06, + "loss": 0.0, + "num_input_tokens_seen": 65673008, + "step": 113200 + }, + { + "epoch": 16.861036639857016, + "grad_norm": 2.7194339509151177e-06, + "learning_rate": 3.6592122922211287e-06, + "loss": 0.0, + "num_input_tokens_seen": 65675920, + "step": 113205 + }, + { + "epoch": 16.861781352397973, + "grad_norm": 6.026455139362952e-06, + "learning_rate": 3.6575199223478436e-06, + "loss": 0.0, + "num_input_tokens_seen": 65678736, + "step": 113210 + }, + { + "epoch": 16.862526064938933, + "grad_norm": 2.6889847504207864e-05, + "learning_rate": 3.655827913034071e-06, + "loss": 0.0, + "num_input_tokens_seen": 65681776, + "step": 113215 + }, + { + "epoch": 16.863270777479894, + "grad_norm": 0.000263146182987839, + "learning_rate": 3.6541362643083887e-06, + "loss": 0.0, + "num_input_tokens_seen": 65684688, + "step": 113220 + }, + { + "epoch": 16.86401549002085, + "grad_norm": 4.980736048310064e-05, + "learning_rate": 3.6524449761993874e-06, + "loss": 0.0, + "num_input_tokens_seen": 65687664, + "step": 113225 + }, + { + "epoch": 16.86476020256181, + "grad_norm": 1.221945058205165e-05, + "learning_rate": 3.6507540487356274e-06, + "loss": 0.0, + "num_input_tokens_seen": 65690640, + "step": 113230 + }, + { + "epoch": 16.865504915102772, + "grad_norm": 4.122970858588815e-05, + "learning_rate": 3.6490634819456775e-06, + "loss": 0.0, + "num_input_tokens_seen": 65693680, + "step": 113235 + }, + { + "epoch": 16.86624962764373, + "grad_norm": 3.6060855563846417e-06, + "learning_rate": 3.6473732758581013e-06, + "loss": 0.0, + "num_input_tokens_seen": 65696336, + "step": 113240 + }, + { + "epoch": 16.86699434018469, + "grad_norm": 4.762736807606416e-06, + "learning_rate": 3.6456834305014565e-06, + "loss": 0.0, + "num_input_tokens_seen": 65699088, + "step": 113245 + }, + { + "epoch": 16.867739052725646, + "grad_norm": 2.977277290483471e-05, + "learning_rate": 3.6439939459042836e-06, + "loss": 0.0, + "num_input_tokens_seen": 65701840, + "step": 113250 + }, + { + "epoch": 16.868483765266607, + "grad_norm": 1.212864663102664e-05, + "learning_rate": 3.6423048220951216e-06, + "loss": 0.0, + "num_input_tokens_seen": 65704656, + "step": 113255 + }, + { + "epoch": 16.869228477807567, + "grad_norm": 9.954817869584076e-06, + "learning_rate": 3.6406160591025196e-06, + "loss": 0.0, + "num_input_tokens_seen": 65707472, + "step": 113260 + }, + { + "epoch": 16.869973190348524, + "grad_norm": 1.2822364624298643e-05, + "learning_rate": 3.638927656954996e-06, + "loss": 0.0, + "num_input_tokens_seen": 65710512, + "step": 113265 + }, + { + "epoch": 16.870717902889485, + "grad_norm": 0.0001102396345231682, + "learning_rate": 3.6372396156810817e-06, + "loss": 0.0, + "num_input_tokens_seen": 65713200, + "step": 113270 + }, + { + "epoch": 16.871462615430445, + "grad_norm": 9.897546988213435e-06, + "learning_rate": 3.6355519353092947e-06, + "loss": 0.0, + "num_input_tokens_seen": 65716304, + "step": 113275 + }, + { + "epoch": 16.872207327971402, + "grad_norm": 5.109180710860528e-05, + "learning_rate": 3.6338646158681377e-06, + "loss": 0.0, + "num_input_tokens_seen": 65719216, + "step": 113280 + }, + { + "epoch": 16.872952040512363, + "grad_norm": 5.237502591626253e-06, + "learning_rate": 3.6321776573861266e-06, + "loss": 0.0, + "num_input_tokens_seen": 65722128, + "step": 113285 + }, + { + "epoch": 16.87369675305332, + "grad_norm": 4.706009804067435e-06, + "learning_rate": 3.6304910598917526e-06, + "loss": 0.0, + "num_input_tokens_seen": 65725104, + "step": 113290 + }, + { + "epoch": 16.87444146559428, + "grad_norm": 2.592439841464511e-06, + "learning_rate": 3.6288048234135203e-06, + "loss": 0.0, + "num_input_tokens_seen": 65728240, + "step": 113295 + }, + { + "epoch": 16.87518617813524, + "grad_norm": 8.55680900713196e-06, + "learning_rate": 3.6271189479799017e-06, + "loss": 0.0, + "num_input_tokens_seen": 65731184, + "step": 113300 + }, + { + "epoch": 16.875930890676198, + "grad_norm": 0.00011062814883189276, + "learning_rate": 3.625433433619391e-06, + "loss": 0.0, + "num_input_tokens_seen": 65734416, + "step": 113305 + }, + { + "epoch": 16.87667560321716, + "grad_norm": 2.176590442104498e-06, + "learning_rate": 3.623748280360459e-06, + "loss": 0.0, + "num_input_tokens_seen": 65737552, + "step": 113310 + }, + { + "epoch": 16.87742031575812, + "grad_norm": 1.506295484432485e-05, + "learning_rate": 3.622063488231575e-06, + "loss": 0.0, + "num_input_tokens_seen": 65740560, + "step": 113315 + }, + { + "epoch": 16.878165028299076, + "grad_norm": 0.0003086936194449663, + "learning_rate": 3.6203790572612002e-06, + "loss": 0.0, + "num_input_tokens_seen": 65743664, + "step": 113320 + }, + { + "epoch": 16.878909740840037, + "grad_norm": 1.3626318832393736e-05, + "learning_rate": 3.6186949874777858e-06, + "loss": 0.0, + "num_input_tokens_seen": 65746672, + "step": 113325 + }, + { + "epoch": 16.879654453380994, + "grad_norm": 0.00026151243946515024, + "learning_rate": 3.617011278909796e-06, + "loss": 0.0, + "num_input_tokens_seen": 65749328, + "step": 113330 + }, + { + "epoch": 16.880399165921954, + "grad_norm": 2.3929675080580637e-05, + "learning_rate": 3.6153279315856603e-06, + "loss": 0.0002, + "num_input_tokens_seen": 65752240, + "step": 113335 + }, + { + "epoch": 16.881143878462915, + "grad_norm": 1.1977961548836902e-05, + "learning_rate": 3.613644945533831e-06, + "loss": 0.0, + "num_input_tokens_seen": 65755216, + "step": 113340 + }, + { + "epoch": 16.88188859100387, + "grad_norm": 1.3720159586227965e-05, + "learning_rate": 3.61196232078273e-06, + "loss": 0.0, + "num_input_tokens_seen": 65758288, + "step": 113345 + }, + { + "epoch": 16.882633303544832, + "grad_norm": 1.1929860193049535e-05, + "learning_rate": 3.610280057360793e-06, + "loss": 0.0002, + "num_input_tokens_seen": 65761072, + "step": 113350 + }, + { + "epoch": 16.883378016085793, + "grad_norm": 3.252741589676589e-05, + "learning_rate": 3.608598155296433e-06, + "loss": 0.0, + "num_input_tokens_seen": 65763952, + "step": 113355 + }, + { + "epoch": 16.88412272862675, + "grad_norm": 3.0832550692139193e-05, + "learning_rate": 3.606916614618061e-06, + "loss": 0.0, + "num_input_tokens_seen": 65766960, + "step": 113360 + }, + { + "epoch": 16.88486744116771, + "grad_norm": 1.3816336831951048e-05, + "learning_rate": 3.605235435354096e-06, + "loss": 0.0, + "num_input_tokens_seen": 65769808, + "step": 113365 + }, + { + "epoch": 16.885612153708667, + "grad_norm": 0.011604796163737774, + "learning_rate": 3.6035546175329283e-06, + "loss": 0.0, + "num_input_tokens_seen": 65772464, + "step": 113370 + }, + { + "epoch": 16.886356866249628, + "grad_norm": 0.0010911881690844893, + "learning_rate": 3.6018741611829637e-06, + "loss": 0.0063, + "num_input_tokens_seen": 65775568, + "step": 113375 + }, + { + "epoch": 16.88710157879059, + "grad_norm": 2.9791899578412995e-06, + "learning_rate": 3.6001940663325876e-06, + "loss": 0.0, + "num_input_tokens_seen": 65778416, + "step": 113380 + }, + { + "epoch": 16.887846291331545, + "grad_norm": 0.00010343503527110443, + "learning_rate": 3.598514333010183e-06, + "loss": 0.0, + "num_input_tokens_seen": 65781136, + "step": 113385 + }, + { + "epoch": 16.888591003872506, + "grad_norm": 2.9152602110116277e-06, + "learning_rate": 3.5968349612441277e-06, + "loss": 0.1875, + "num_input_tokens_seen": 65784048, + "step": 113390 + }, + { + "epoch": 16.889335716413463, + "grad_norm": 2.6991547201760113e-06, + "learning_rate": 3.595155951062787e-06, + "loss": 0.0, + "num_input_tokens_seen": 65787216, + "step": 113395 + }, + { + "epoch": 16.890080428954423, + "grad_norm": 0.00041755184065550566, + "learning_rate": 3.5934773024945355e-06, + "loss": 0.0, + "num_input_tokens_seen": 65790288, + "step": 113400 + }, + { + "epoch": 16.890825141495384, + "grad_norm": 0.00013268527982290834, + "learning_rate": 3.591799015567726e-06, + "loss": 0.3938, + "num_input_tokens_seen": 65793104, + "step": 113405 + }, + { + "epoch": 16.89156985403634, + "grad_norm": 5.502981821337016e-06, + "learning_rate": 3.5901210903107165e-06, + "loss": 0.0, + "num_input_tokens_seen": 65796048, + "step": 113410 + }, + { + "epoch": 16.8923145665773, + "grad_norm": 6.218788257683627e-06, + "learning_rate": 3.5884435267518476e-06, + "loss": 0.0, + "num_input_tokens_seen": 65798864, + "step": 113415 + }, + { + "epoch": 16.893059279118262, + "grad_norm": 2.827575008268468e-05, + "learning_rate": 3.586766324919466e-06, + "loss": 0.0, + "num_input_tokens_seen": 65801488, + "step": 113420 + }, + { + "epoch": 16.89380399165922, + "grad_norm": 3.662845483631827e-05, + "learning_rate": 3.5850894848418997e-06, + "loss": 0.0, + "num_input_tokens_seen": 65804464, + "step": 113425 + }, + { + "epoch": 16.89454870420018, + "grad_norm": 2.1466914859047392e-06, + "learning_rate": 3.5834130065474897e-06, + "loss": 0.0, + "num_input_tokens_seen": 65807216, + "step": 113430 + }, + { + "epoch": 16.895293416741136, + "grad_norm": 0.0001269077038159594, + "learning_rate": 3.5817368900645466e-06, + "loss": 0.0, + "num_input_tokens_seen": 65810096, + "step": 113435 + }, + { + "epoch": 16.896038129282097, + "grad_norm": 0.00038439183845184743, + "learning_rate": 3.5800611354213866e-06, + "loss": 0.0, + "num_input_tokens_seen": 65813104, + "step": 113440 + }, + { + "epoch": 16.896782841823057, + "grad_norm": 3.443114337642328e-06, + "learning_rate": 3.5783857426463286e-06, + "loss": 0.0, + "num_input_tokens_seen": 65816176, + "step": 113445 + }, + { + "epoch": 16.897527554364014, + "grad_norm": 2.1308289433363825e-05, + "learning_rate": 3.5767107117676698e-06, + "loss": 0.0, + "num_input_tokens_seen": 65819120, + "step": 113450 + }, + { + "epoch": 16.898272266904975, + "grad_norm": 8.518968388671055e-05, + "learning_rate": 3.575036042813712e-06, + "loss": 0.0, + "num_input_tokens_seen": 65821904, + "step": 113455 + }, + { + "epoch": 16.899016979445936, + "grad_norm": 1.9151981177856214e-05, + "learning_rate": 3.5733617358127384e-06, + "loss": 0.0, + "num_input_tokens_seen": 65824720, + "step": 113460 + }, + { + "epoch": 16.899761691986892, + "grad_norm": 6.737695912306663e-06, + "learning_rate": 3.5716877907930465e-06, + "loss": 0.0, + "num_input_tokens_seen": 65827568, + "step": 113465 + }, + { + "epoch": 16.900506404527853, + "grad_norm": 3.1008985388325527e-06, + "learning_rate": 3.5700142077829122e-06, + "loss": 0.0, + "num_input_tokens_seen": 65830768, + "step": 113470 + }, + { + "epoch": 16.90125111706881, + "grad_norm": 2.3132131900638342e-05, + "learning_rate": 3.5683409868106004e-06, + "loss": 0.0, + "num_input_tokens_seen": 65833776, + "step": 113475 + }, + { + "epoch": 16.90199582960977, + "grad_norm": 3.7065776268718764e-05, + "learning_rate": 3.566668127904391e-06, + "loss": 0.0, + "num_input_tokens_seen": 65836912, + "step": 113480 + }, + { + "epoch": 16.90274054215073, + "grad_norm": 1.693991907814052e-05, + "learning_rate": 3.5649956310925355e-06, + "loss": 0.0, + "num_input_tokens_seen": 65839760, + "step": 113485 + }, + { + "epoch": 16.903485254691688, + "grad_norm": 0.00021932419622316957, + "learning_rate": 3.563323496403298e-06, + "loss": 0.0, + "num_input_tokens_seen": 65842448, + "step": 113490 + }, + { + "epoch": 16.90422996723265, + "grad_norm": 0.0005833603790961206, + "learning_rate": 3.5616517238649172e-06, + "loss": 0.0, + "num_input_tokens_seen": 65845616, + "step": 113495 + }, + { + "epoch": 16.90497467977361, + "grad_norm": 8.73047156346729e-06, + "learning_rate": 3.5599803135056455e-06, + "loss": 0.0, + "num_input_tokens_seen": 65848496, + "step": 113500 + }, + { + "epoch": 16.905719392314566, + "grad_norm": 4.5469816541299224e-05, + "learning_rate": 3.558309265353718e-06, + "loss": 0.0, + "num_input_tokens_seen": 65851600, + "step": 113505 + }, + { + "epoch": 16.906464104855527, + "grad_norm": 4.4043335947208107e-05, + "learning_rate": 3.556638579437363e-06, + "loss": 0.0, + "num_input_tokens_seen": 65854384, + "step": 113510 + }, + { + "epoch": 16.907208817396484, + "grad_norm": 2.036874093391816e-06, + "learning_rate": 3.554968255784799e-06, + "loss": 0.0, + "num_input_tokens_seen": 65857200, + "step": 113515 + }, + { + "epoch": 16.907953529937444, + "grad_norm": 6.787870461266721e-06, + "learning_rate": 3.5532982944242567e-06, + "loss": 0.0, + "num_input_tokens_seen": 65859792, + "step": 113520 + }, + { + "epoch": 16.908698242478405, + "grad_norm": 4.9904883780982345e-05, + "learning_rate": 3.5516286953839406e-06, + "loss": 0.0, + "num_input_tokens_seen": 65862576, + "step": 113525 + }, + { + "epoch": 16.90944295501936, + "grad_norm": 2.1142623154446483e-05, + "learning_rate": 3.549959458692054e-06, + "loss": 0.0, + "num_input_tokens_seen": 65865456, + "step": 113530 + }, + { + "epoch": 16.910187667560322, + "grad_norm": 0.002181123709306121, + "learning_rate": 3.5482905843768065e-06, + "loss": 0.0, + "num_input_tokens_seen": 65868240, + "step": 113535 + }, + { + "epoch": 16.91093238010128, + "grad_norm": 7.1821664278104436e-06, + "learning_rate": 3.5466220724663824e-06, + "loss": 0.0, + "num_input_tokens_seen": 65871184, + "step": 113540 + }, + { + "epoch": 16.91167709264224, + "grad_norm": 0.00044311434612609446, + "learning_rate": 3.5449539229889807e-06, + "loss": 0.0, + "num_input_tokens_seen": 65873904, + "step": 113545 + }, + { + "epoch": 16.9124218051832, + "grad_norm": 9.466913979849778e-06, + "learning_rate": 3.543286135972773e-06, + "loss": 0.0, + "num_input_tokens_seen": 65876880, + "step": 113550 + }, + { + "epoch": 16.913166517724157, + "grad_norm": 4.838310815102886e-06, + "learning_rate": 3.541618711445935e-06, + "loss": 0.0, + "num_input_tokens_seen": 65880208, + "step": 113555 + }, + { + "epoch": 16.913911230265118, + "grad_norm": 1.618338683329057e-05, + "learning_rate": 3.5399516494366457e-06, + "loss": 0.0, + "num_input_tokens_seen": 65882992, + "step": 113560 + }, + { + "epoch": 16.91465594280608, + "grad_norm": 1.8953012840938754e-05, + "learning_rate": 3.5382849499730558e-06, + "loss": 0.0, + "num_input_tokens_seen": 65886000, + "step": 113565 + }, + { + "epoch": 16.915400655347035, + "grad_norm": 9.678059541329276e-06, + "learning_rate": 3.536618613083337e-06, + "loss": 0.0, + "num_input_tokens_seen": 65888560, + "step": 113570 + }, + { + "epoch": 16.916145367887996, + "grad_norm": 0.0002946541353594512, + "learning_rate": 3.53495263879563e-06, + "loss": 0.0, + "num_input_tokens_seen": 65891568, + "step": 113575 + }, + { + "epoch": 16.916890080428953, + "grad_norm": 9.25895346881589e-06, + "learning_rate": 3.5332870271380797e-06, + "loss": 0.0, + "num_input_tokens_seen": 65894480, + "step": 113580 + }, + { + "epoch": 16.917634792969913, + "grad_norm": 3.548317909007892e-05, + "learning_rate": 3.5316217781388243e-06, + "loss": 0.0, + "num_input_tokens_seen": 65897168, + "step": 113585 + }, + { + "epoch": 16.918379505510874, + "grad_norm": 0.0018094946863129735, + "learning_rate": 3.5299568918260054e-06, + "loss": 0.0, + "num_input_tokens_seen": 65900048, + "step": 113590 + }, + { + "epoch": 16.91912421805183, + "grad_norm": 1.4086835108173545e-05, + "learning_rate": 3.528292368227745e-06, + "loss": 0.0, + "num_input_tokens_seen": 65902672, + "step": 113595 + }, + { + "epoch": 16.91986893059279, + "grad_norm": 2.547328904256574e-06, + "learning_rate": 3.526628207372154e-06, + "loss": 0.1221, + "num_input_tokens_seen": 65905296, + "step": 113600 + }, + { + "epoch": 16.920613643133752, + "grad_norm": 0.0045826067216694355, + "learning_rate": 3.5249644092873624e-06, + "loss": 0.0, + "num_input_tokens_seen": 65908336, + "step": 113605 + }, + { + "epoch": 16.92135835567471, + "grad_norm": 1.3105337529850658e-05, + "learning_rate": 3.5233009740014617e-06, + "loss": 0.0, + "num_input_tokens_seen": 65911568, + "step": 113610 + }, + { + "epoch": 16.92210306821567, + "grad_norm": 2.7083485747425584e-06, + "learning_rate": 3.521637901542571e-06, + "loss": 0.0, + "num_input_tokens_seen": 65914608, + "step": 113615 + }, + { + "epoch": 16.922847780756626, + "grad_norm": 5.831073760986328, + "learning_rate": 3.5199751919387735e-06, + "loss": 0.0007, + "num_input_tokens_seen": 65917488, + "step": 113620 + }, + { + "epoch": 16.923592493297587, + "grad_norm": 0.0002160444128094241, + "learning_rate": 3.518312845218169e-06, + "loss": 0.0, + "num_input_tokens_seen": 65920816, + "step": 113625 + }, + { + "epoch": 16.924337205838548, + "grad_norm": 4.266112227924168e-06, + "learning_rate": 3.516650861408835e-06, + "loss": 0.0, + "num_input_tokens_seen": 65923568, + "step": 113630 + }, + { + "epoch": 16.925081918379504, + "grad_norm": 0.0011041194666177034, + "learning_rate": 3.514989240538846e-06, + "loss": 0.0, + "num_input_tokens_seen": 65926224, + "step": 113635 + }, + { + "epoch": 16.925826630920465, + "grad_norm": 6.278041837504134e-05, + "learning_rate": 3.513327982636283e-06, + "loss": 0.0, + "num_input_tokens_seen": 65929104, + "step": 113640 + }, + { + "epoch": 16.926571343461426, + "grad_norm": 2.045741666734102e-06, + "learning_rate": 3.5116670877292034e-06, + "loss": 0.0, + "num_input_tokens_seen": 65932016, + "step": 113645 + }, + { + "epoch": 16.927316056002383, + "grad_norm": 9.749273158377036e-05, + "learning_rate": 3.5100065558456714e-06, + "loss": 0.0073, + "num_input_tokens_seen": 65935088, + "step": 113650 + }, + { + "epoch": 16.928060768543343, + "grad_norm": 3.608980023273034e-06, + "learning_rate": 3.5083463870137306e-06, + "loss": 0.0, + "num_input_tokens_seen": 65938032, + "step": 113655 + }, + { + "epoch": 16.9288054810843, + "grad_norm": 0.00026331361732445657, + "learning_rate": 3.50668658126144e-06, + "loss": 0.0, + "num_input_tokens_seen": 65940784, + "step": 113660 + }, + { + "epoch": 16.92955019362526, + "grad_norm": 4.8336360123357736e-06, + "learning_rate": 3.5050271386168287e-06, + "loss": 0.0, + "num_input_tokens_seen": 65943440, + "step": 113665 + }, + { + "epoch": 16.93029490616622, + "grad_norm": 6.189152190927416e-05, + "learning_rate": 3.503368059107942e-06, + "loss": 0.0, + "num_input_tokens_seen": 65946608, + "step": 113670 + }, + { + "epoch": 16.931039618707178, + "grad_norm": 4.172543413005769e-06, + "learning_rate": 3.5017093427628045e-06, + "loss": 0.0, + "num_input_tokens_seen": 65949488, + "step": 113675 + }, + { + "epoch": 16.93178433124814, + "grad_norm": 1.4141364772513043e-05, + "learning_rate": 3.5000509896094323e-06, + "loss": 0.0, + "num_input_tokens_seen": 65952112, + "step": 113680 + }, + { + "epoch": 16.932529043789096, + "grad_norm": 8.008282020455226e-05, + "learning_rate": 3.4983929996758535e-06, + "loss": 0.0, + "num_input_tokens_seen": 65955472, + "step": 113685 + }, + { + "epoch": 16.933273756330056, + "grad_norm": 2.0601590222213417e-05, + "learning_rate": 3.496735372990065e-06, + "loss": 0.0, + "num_input_tokens_seen": 65958352, + "step": 113690 + }, + { + "epoch": 16.934018468871017, + "grad_norm": 14.078226089477539, + "learning_rate": 3.4950781095800828e-06, + "loss": 0.0198, + "num_input_tokens_seen": 65961200, + "step": 113695 + }, + { + "epoch": 16.934763181411974, + "grad_norm": 0.0001079480498447083, + "learning_rate": 3.493421209473896e-06, + "loss": 0.0, + "num_input_tokens_seen": 65964176, + "step": 113700 + }, + { + "epoch": 16.935507893952934, + "grad_norm": 0.0016863630153238773, + "learning_rate": 3.4917646726995018e-06, + "loss": 0.0, + "num_input_tokens_seen": 65967248, + "step": 113705 + }, + { + "epoch": 16.936252606493895, + "grad_norm": 1.163026081485441e-05, + "learning_rate": 3.490108499284886e-06, + "loss": 0.0, + "num_input_tokens_seen": 65970288, + "step": 113710 + }, + { + "epoch": 16.93699731903485, + "grad_norm": 6.064055560273118e-05, + "learning_rate": 3.488452689258026e-06, + "loss": 0.0, + "num_input_tokens_seen": 65973264, + "step": 113715 + }, + { + "epoch": 16.937742031575812, + "grad_norm": 0.02291840873658657, + "learning_rate": 3.4867972426468915e-06, + "loss": 0.0, + "num_input_tokens_seen": 65976272, + "step": 113720 + }, + { + "epoch": 16.93848674411677, + "grad_norm": 2.4149625460267998e-05, + "learning_rate": 3.4851421594794486e-06, + "loss": 0.0, + "num_input_tokens_seen": 65979088, + "step": 113725 + }, + { + "epoch": 16.93923145665773, + "grad_norm": 1.4292362720880192e-05, + "learning_rate": 3.483487439783667e-06, + "loss": 0.0, + "num_input_tokens_seen": 65982064, + "step": 113730 + }, + { + "epoch": 16.93997616919869, + "grad_norm": 7.138549790397519e-06, + "learning_rate": 3.4818330835874937e-06, + "loss": 0.0, + "num_input_tokens_seen": 65984944, + "step": 113735 + }, + { + "epoch": 16.940720881739647, + "grad_norm": 2.22854714593268e-06, + "learning_rate": 3.4801790909188837e-06, + "loss": 0.0, + "num_input_tokens_seen": 65988144, + "step": 113740 + }, + { + "epoch": 16.941465594280608, + "grad_norm": 2.771967410808429e-05, + "learning_rate": 3.4785254618057707e-06, + "loss": 0.0, + "num_input_tokens_seen": 65991056, + "step": 113745 + }, + { + "epoch": 16.94221030682157, + "grad_norm": 9.383462383993901e-06, + "learning_rate": 3.4768721962761015e-06, + "loss": 0.0, + "num_input_tokens_seen": 65994000, + "step": 113750 + }, + { + "epoch": 16.942955019362525, + "grad_norm": 0.0029532862827181816, + "learning_rate": 3.4752192943578038e-06, + "loss": 0.0, + "num_input_tokens_seen": 65997104, + "step": 113755 + }, + { + "epoch": 16.943699731903486, + "grad_norm": 1.0199206371908076e-05, + "learning_rate": 3.4735667560787916e-06, + "loss": 0.0, + "num_input_tokens_seen": 65999920, + "step": 113760 + }, + { + "epoch": 16.944444444444443, + "grad_norm": 0.00011672823166009039, + "learning_rate": 3.471914581466998e-06, + "loss": 0.0, + "num_input_tokens_seen": 66002864, + "step": 113765 + }, + { + "epoch": 16.945189156985403, + "grad_norm": 9.12774121388793e-05, + "learning_rate": 3.4702627705503197e-06, + "loss": 0.0, + "num_input_tokens_seen": 66005520, + "step": 113770 + }, + { + "epoch": 16.945933869526364, + "grad_norm": 5.153517577127786e-06, + "learning_rate": 3.468611323356677e-06, + "loss": 0.0, + "num_input_tokens_seen": 66008368, + "step": 113775 + }, + { + "epoch": 16.94667858206732, + "grad_norm": 1.9743330994970165e-05, + "learning_rate": 3.4669602399139607e-06, + "loss": 0.0, + "num_input_tokens_seen": 66011440, + "step": 113780 + }, + { + "epoch": 16.94742329460828, + "grad_norm": 0.00021999693126417696, + "learning_rate": 3.4653095202500677e-06, + "loss": 0.002, + "num_input_tokens_seen": 66014224, + "step": 113785 + }, + { + "epoch": 16.948168007149242, + "grad_norm": 1.1861889106512535e-05, + "learning_rate": 3.4636591643928823e-06, + "loss": 0.0, + "num_input_tokens_seen": 66017072, + "step": 113790 + }, + { + "epoch": 16.9489127196902, + "grad_norm": 0.0005791391595266759, + "learning_rate": 3.462009172370284e-06, + "loss": 0.0, + "num_input_tokens_seen": 66020720, + "step": 113795 + }, + { + "epoch": 16.94965743223116, + "grad_norm": 7.201163498393726e-06, + "learning_rate": 3.4603595442101537e-06, + "loss": 0.0, + "num_input_tokens_seen": 66023824, + "step": 113800 + }, + { + "epoch": 16.950402144772116, + "grad_norm": 6.416335963876918e-06, + "learning_rate": 3.45871027994035e-06, + "loss": 0.0, + "num_input_tokens_seen": 66026640, + "step": 113805 + }, + { + "epoch": 16.951146857313077, + "grad_norm": 4.3138999899383634e-05, + "learning_rate": 3.45706137958875e-06, + "loss": 0.0, + "num_input_tokens_seen": 66029808, + "step": 113810 + }, + { + "epoch": 16.951891569854038, + "grad_norm": 0.0002767859841696918, + "learning_rate": 3.4554128431831976e-06, + "loss": 0.0, + "num_input_tokens_seen": 66032848, + "step": 113815 + }, + { + "epoch": 16.952636282394995, + "grad_norm": 2.573261554061901e-06, + "learning_rate": 3.4537646707515527e-06, + "loss": 0.0, + "num_input_tokens_seen": 66035728, + "step": 113820 + }, + { + "epoch": 16.953380994935955, + "grad_norm": 4.495292523643002e-05, + "learning_rate": 3.45211686232165e-06, + "loss": 0.0, + "num_input_tokens_seen": 66038672, + "step": 113825 + }, + { + "epoch": 16.954125707476916, + "grad_norm": 2.7782311917690095e-06, + "learning_rate": 3.45046941792134e-06, + "loss": 0.0, + "num_input_tokens_seen": 66041648, + "step": 113830 + }, + { + "epoch": 16.954870420017873, + "grad_norm": 1.3007421330257785e-05, + "learning_rate": 3.4488223375784447e-06, + "loss": 0.0, + "num_input_tokens_seen": 66044464, + "step": 113835 + }, + { + "epoch": 16.955615132558833, + "grad_norm": 1.2411785064614378e-05, + "learning_rate": 3.447175621320792e-06, + "loss": 0.0, + "num_input_tokens_seen": 66047280, + "step": 113840 + }, + { + "epoch": 16.95635984509979, + "grad_norm": 4.665639062295668e-05, + "learning_rate": 3.445529269176198e-06, + "loss": 0.0, + "num_input_tokens_seen": 66050032, + "step": 113845 + }, + { + "epoch": 16.95710455764075, + "grad_norm": 2.8576912427524803e-06, + "learning_rate": 3.443883281172486e-06, + "loss": 0.0, + "num_input_tokens_seen": 66052848, + "step": 113850 + }, + { + "epoch": 16.95784927018171, + "grad_norm": 1.3279191255569458, + "learning_rate": 3.442237657337455e-06, + "loss": 0.0003, + "num_input_tokens_seen": 66056016, + "step": 113855 + }, + { + "epoch": 16.958593982722668, + "grad_norm": 2.354723255848512e-05, + "learning_rate": 3.440592397698905e-06, + "loss": 0.0, + "num_input_tokens_seen": 66059024, + "step": 113860 + }, + { + "epoch": 16.95933869526363, + "grad_norm": 1.876007991086226e-05, + "learning_rate": 3.4389475022846395e-06, + "loss": 0.0, + "num_input_tokens_seen": 66062000, + "step": 113865 + }, + { + "epoch": 16.96008340780459, + "grad_norm": 0.00025965043460018933, + "learning_rate": 3.4373029711224356e-06, + "loss": 0.0, + "num_input_tokens_seen": 66064944, + "step": 113870 + }, + { + "epoch": 16.960828120345546, + "grad_norm": 2.9386503683781484e-06, + "learning_rate": 3.435658804240088e-06, + "loss": 0.0, + "num_input_tokens_seen": 66067792, + "step": 113875 + }, + { + "epoch": 16.961572832886507, + "grad_norm": 1.099299424822675e-05, + "learning_rate": 3.4340150016653695e-06, + "loss": 0.0, + "num_input_tokens_seen": 66070864, + "step": 113880 + }, + { + "epoch": 16.962317545427464, + "grad_norm": 0.00017471345199737698, + "learning_rate": 3.432371563426043e-06, + "loss": 0.0, + "num_input_tokens_seen": 66073936, + "step": 113885 + }, + { + "epoch": 16.963062257968424, + "grad_norm": 4.5872329792473465e-06, + "learning_rate": 3.4307284895498836e-06, + "loss": 0.0, + "num_input_tokens_seen": 66076720, + "step": 113890 + }, + { + "epoch": 16.963806970509385, + "grad_norm": 2.5150497094728053e-05, + "learning_rate": 3.429085780064639e-06, + "loss": 0.0, + "num_input_tokens_seen": 66079632, + "step": 113895 + }, + { + "epoch": 16.964551683050342, + "grad_norm": 9.124464850174263e-05, + "learning_rate": 3.427443434998073e-06, + "loss": 0.0, + "num_input_tokens_seen": 66082448, + "step": 113900 + }, + { + "epoch": 16.965296395591302, + "grad_norm": 3.500338425510563e-05, + "learning_rate": 3.4258014543779222e-06, + "loss": 0.0, + "num_input_tokens_seen": 66086480, + "step": 113905 + }, + { + "epoch": 16.96604110813226, + "grad_norm": 2.6025013539765496e-06, + "learning_rate": 3.4241598382319303e-06, + "loss": 0.1314, + "num_input_tokens_seen": 66089392, + "step": 113910 + }, + { + "epoch": 16.96678582067322, + "grad_norm": 0.00026331006665714085, + "learning_rate": 3.422518586587831e-06, + "loss": 0.0, + "num_input_tokens_seen": 66092144, + "step": 113915 + }, + { + "epoch": 16.96753053321418, + "grad_norm": 1.984455593628809e-05, + "learning_rate": 3.4208776994733405e-06, + "loss": 0.0004, + "num_input_tokens_seen": 66094896, + "step": 113920 + }, + { + "epoch": 16.968275245755137, + "grad_norm": 2.4653390937601216e-05, + "learning_rate": 3.4192371769161987e-06, + "loss": 0.0, + "num_input_tokens_seen": 66097520, + "step": 113925 + }, + { + "epoch": 16.969019958296098, + "grad_norm": 1.3564889741246589e-05, + "learning_rate": 3.417597018944102e-06, + "loss": 0.0, + "num_input_tokens_seen": 66100240, + "step": 113930 + }, + { + "epoch": 16.96976467083706, + "grad_norm": 0.00015164533397182822, + "learning_rate": 3.415957225584776e-06, + "loss": 0.0, + "num_input_tokens_seen": 66103280, + "step": 113935 + }, + { + "epoch": 16.970509383378015, + "grad_norm": 7.332524546654895e-06, + "learning_rate": 3.4143177968659098e-06, + "loss": 0.0, + "num_input_tokens_seen": 66106096, + "step": 113940 + }, + { + "epoch": 16.971254095918976, + "grad_norm": 0.0007001494523137808, + "learning_rate": 3.412678732815211e-06, + "loss": 0.0, + "num_input_tokens_seen": 66108688, + "step": 113945 + }, + { + "epoch": 16.971998808459933, + "grad_norm": 5.150583547219867e-06, + "learning_rate": 3.411040033460361e-06, + "loss": 0.0, + "num_input_tokens_seen": 66111344, + "step": 113950 + }, + { + "epoch": 16.972743521000893, + "grad_norm": 6.315141945378855e-06, + "learning_rate": 3.4094016988290512e-06, + "loss": 0.0, + "num_input_tokens_seen": 66114192, + "step": 113955 + }, + { + "epoch": 16.973488233541854, + "grad_norm": 2.0448558643693104e-05, + "learning_rate": 3.407763728948954e-06, + "loss": 0.0, + "num_input_tokens_seen": 66117488, + "step": 113960 + }, + { + "epoch": 16.97423294608281, + "grad_norm": 5.106130993226543e-05, + "learning_rate": 3.4061261238477414e-06, + "loss": 0.0, + "num_input_tokens_seen": 66120400, + "step": 113965 + }, + { + "epoch": 16.97497765862377, + "grad_norm": 1.2752444490615744e-05, + "learning_rate": 3.4044888835530835e-06, + "loss": 0.0, + "num_input_tokens_seen": 66123024, + "step": 113970 + }, + { + "epoch": 16.975722371164732, + "grad_norm": 2.564715759945102e-05, + "learning_rate": 3.4028520080926383e-06, + "loss": 0.0, + "num_input_tokens_seen": 66125840, + "step": 113975 + }, + { + "epoch": 16.97646708370569, + "grad_norm": 0.00016462092753499746, + "learning_rate": 3.401215497494059e-06, + "loss": 0.0, + "num_input_tokens_seen": 66128656, + "step": 113980 + }, + { + "epoch": 16.97721179624665, + "grad_norm": 7.205544079624815e-06, + "learning_rate": 3.3995793517849846e-06, + "loss": 0.0, + "num_input_tokens_seen": 66131504, + "step": 113985 + }, + { + "epoch": 16.977956508787607, + "grad_norm": 6.709261651849374e-05, + "learning_rate": 3.3979435709930703e-06, + "loss": 0.0, + "num_input_tokens_seen": 66134480, + "step": 113990 + }, + { + "epoch": 16.978701221328567, + "grad_norm": 0.0001741535379551351, + "learning_rate": 3.3963081551459442e-06, + "loss": 0.0, + "num_input_tokens_seen": 66137136, + "step": 113995 + }, + { + "epoch": 16.979445933869528, + "grad_norm": 3.959958121413365e-05, + "learning_rate": 3.3946731042712286e-06, + "loss": 0.0, + "num_input_tokens_seen": 66140560, + "step": 114000 + }, + { + "epoch": 16.980190646410485, + "grad_norm": 0.0006519323796965182, + "learning_rate": 3.3930384183965573e-06, + "loss": 0.0, + "num_input_tokens_seen": 66143280, + "step": 114005 + }, + { + "epoch": 16.980935358951445, + "grad_norm": 8.959586921264417e-06, + "learning_rate": 3.3914040975495387e-06, + "loss": 0.0, + "num_input_tokens_seen": 66146256, + "step": 114010 + }, + { + "epoch": 16.981680071492406, + "grad_norm": 3.130310869892128e-05, + "learning_rate": 3.3897701417577893e-06, + "loss": 0.0, + "num_input_tokens_seen": 66149232, + "step": 114015 + }, + { + "epoch": 16.982424784033363, + "grad_norm": 2.1130394998181146e-06, + "learning_rate": 3.388136551048904e-06, + "loss": 0.0, + "num_input_tokens_seen": 66152176, + "step": 114020 + }, + { + "epoch": 16.983169496574323, + "grad_norm": 5.232747207628563e-05, + "learning_rate": 3.386503325450491e-06, + "loss": 0.0612, + "num_input_tokens_seen": 66155024, + "step": 114025 + }, + { + "epoch": 16.98391420911528, + "grad_norm": 0.0032512876205146313, + "learning_rate": 3.3848704649901336e-06, + "loss": 0.0, + "num_input_tokens_seen": 66157936, + "step": 114030 + }, + { + "epoch": 16.98465892165624, + "grad_norm": 5.8776063269760925e-06, + "learning_rate": 3.3832379696954243e-06, + "loss": 0.0, + "num_input_tokens_seen": 66160688, + "step": 114035 + }, + { + "epoch": 16.9854036341972, + "grad_norm": 2.9800527045154013e-05, + "learning_rate": 3.3816058395939434e-06, + "loss": 0.0001, + "num_input_tokens_seen": 66163696, + "step": 114040 + }, + { + "epoch": 16.986148346738158, + "grad_norm": 2.141053118975833e-05, + "learning_rate": 3.3799740747132547e-06, + "loss": 0.0, + "num_input_tokens_seen": 66166928, + "step": 114045 + }, + { + "epoch": 16.98689305927912, + "grad_norm": 6.557775122928433e-06, + "learning_rate": 3.378342675080934e-06, + "loss": 0.0, + "num_input_tokens_seen": 66169872, + "step": 114050 + }, + { + "epoch": 16.987637771820076, + "grad_norm": 0.0009069862426258624, + "learning_rate": 3.376711640724531e-06, + "loss": 0.0, + "num_input_tokens_seen": 66172592, + "step": 114055 + }, + { + "epoch": 16.988382484361036, + "grad_norm": 7.749033102300018e-05, + "learning_rate": 3.375080971671615e-06, + "loss": 0.0, + "num_input_tokens_seen": 66175376, + "step": 114060 + }, + { + "epoch": 16.989127196901997, + "grad_norm": 0.000452049367595464, + "learning_rate": 3.3734506679497207e-06, + "loss": 0.0, + "num_input_tokens_seen": 66178992, + "step": 114065 + }, + { + "epoch": 16.989871909442954, + "grad_norm": 0.00010477662726771086, + "learning_rate": 3.3718207295864028e-06, + "loss": 0.0, + "num_input_tokens_seen": 66182032, + "step": 114070 + }, + { + "epoch": 16.990616621983914, + "grad_norm": 5.752001015935093e-06, + "learning_rate": 3.3701911566091925e-06, + "loss": 0.0, + "num_input_tokens_seen": 66184656, + "step": 114075 + }, + { + "epoch": 16.991361334524875, + "grad_norm": 4.069792339578271e-05, + "learning_rate": 3.368561949045615e-06, + "loss": 0.0, + "num_input_tokens_seen": 66187504, + "step": 114080 + }, + { + "epoch": 16.992106047065832, + "grad_norm": 7.865225052228197e-05, + "learning_rate": 3.3669331069232006e-06, + "loss": 0.0, + "num_input_tokens_seen": 66190832, + "step": 114085 + }, + { + "epoch": 16.992850759606792, + "grad_norm": 5.6834605857147835e-06, + "learning_rate": 3.3653046302694614e-06, + "loss": 0.0, + "num_input_tokens_seen": 66193968, + "step": 114090 + }, + { + "epoch": 16.99359547214775, + "grad_norm": 3.312952458145446e-06, + "learning_rate": 3.3636765191119165e-06, + "loss": 0.0, + "num_input_tokens_seen": 66196592, + "step": 114095 + }, + { + "epoch": 16.99434018468871, + "grad_norm": 1.0043713700724766e-05, + "learning_rate": 3.3620487734780603e-06, + "loss": 0.0, + "num_input_tokens_seen": 66199664, + "step": 114100 + }, + { + "epoch": 16.99508489722967, + "grad_norm": 5.660577244270826e-06, + "learning_rate": 3.3604213933954048e-06, + "loss": 0.0, + "num_input_tokens_seen": 66202544, + "step": 114105 + }, + { + "epoch": 16.995829609770627, + "grad_norm": 7.661189010832459e-05, + "learning_rate": 3.358794378891436e-06, + "loss": 0.0, + "num_input_tokens_seen": 66205424, + "step": 114110 + }, + { + "epoch": 16.996574322311588, + "grad_norm": 3.880007625411963e-06, + "learning_rate": 3.3571677299936403e-06, + "loss": 0.0, + "num_input_tokens_seen": 66208368, + "step": 114115 + }, + { + "epoch": 16.99731903485255, + "grad_norm": 0.0003236357297282666, + "learning_rate": 3.3555414467295017e-06, + "loss": 0.0, + "num_input_tokens_seen": 66211280, + "step": 114120 + }, + { + "epoch": 16.998063747393505, + "grad_norm": 6.65667830617167e-05, + "learning_rate": 3.3539155291264833e-06, + "loss": 0.0, + "num_input_tokens_seen": 66214256, + "step": 114125 + }, + { + "epoch": 16.998808459934466, + "grad_norm": 3.18263701046817e-05, + "learning_rate": 3.352289977212067e-06, + "loss": 0.0, + "num_input_tokens_seen": 66216784, + "step": 114130 + }, + { + "epoch": 16.999553172475423, + "grad_norm": 3.5511052374204155e-06, + "learning_rate": 3.3506647910137078e-06, + "loss": 0.0, + "num_input_tokens_seen": 66219632, + "step": 114135 + }, + { + "epoch": 17.0, + "eval_loss": 3.3436172008514404, + "eval_runtime": 51.3142, + "eval_samples_per_second": 58.152, + "eval_steps_per_second": 14.538, + "num_input_tokens_seen": 66220752, + "step": 114138 + }, + { + "epoch": 17.000297885016384, + "grad_norm": 8.132455150189344e-06, + "learning_rate": 3.3490399705588677e-06, + "loss": 0.0, + "num_input_tokens_seen": 66222064, + "step": 114140 + }, + { + "epoch": 17.001042597557344, + "grad_norm": 4.832378181163222e-06, + "learning_rate": 3.3474155158749854e-06, + "loss": 0.0, + "num_input_tokens_seen": 66225104, + "step": 114145 + }, + { + "epoch": 17.0017873100983, + "grad_norm": 0.0010726312175393105, + "learning_rate": 3.345791426989517e-06, + "loss": 0.0, + "num_input_tokens_seen": 66228176, + "step": 114150 + }, + { + "epoch": 17.00253202263926, + "grad_norm": 0.0033471395727247, + "learning_rate": 3.3441677039298956e-06, + "loss": 0.0, + "num_input_tokens_seen": 66231216, + "step": 114155 + }, + { + "epoch": 17.003276735180222, + "grad_norm": 5.403406248660758e-06, + "learning_rate": 3.3425443467235443e-06, + "loss": 0.0, + "num_input_tokens_seen": 66233744, + "step": 114160 + }, + { + "epoch": 17.00402144772118, + "grad_norm": 5.2986066293669865e-05, + "learning_rate": 3.3409213553979e-06, + "loss": 0.0, + "num_input_tokens_seen": 66236912, + "step": 114165 + }, + { + "epoch": 17.00476616026214, + "grad_norm": 1.157734641310526e-05, + "learning_rate": 3.3392987299803753e-06, + "loss": 0.0, + "num_input_tokens_seen": 66239504, + "step": 114170 + }, + { + "epoch": 17.005510872803097, + "grad_norm": 2.598394985398045e-06, + "learning_rate": 3.33767647049838e-06, + "loss": 0.0, + "num_input_tokens_seen": 66242384, + "step": 114175 + }, + { + "epoch": 17.006255585344057, + "grad_norm": 5.821718332299497e-06, + "learning_rate": 3.3360545769793277e-06, + "loss": 0.0, + "num_input_tokens_seen": 66245136, + "step": 114180 + }, + { + "epoch": 17.007000297885018, + "grad_norm": 1.1691149666148704e-05, + "learning_rate": 3.3344330494506166e-06, + "loss": 0.0, + "num_input_tokens_seen": 66247984, + "step": 114185 + }, + { + "epoch": 17.007745010425975, + "grad_norm": 0.00014730094699189067, + "learning_rate": 3.3328118879396324e-06, + "loss": 0.0, + "num_input_tokens_seen": 66250928, + "step": 114190 + }, + { + "epoch": 17.008489722966935, + "grad_norm": 21.453950881958008, + "learning_rate": 3.331191092473776e-06, + "loss": 0.0011, + "num_input_tokens_seen": 66253712, + "step": 114195 + }, + { + "epoch": 17.009234435507896, + "grad_norm": 2.2027538761903998e-06, + "learning_rate": 3.3295706630804222e-06, + "loss": 0.0, + "num_input_tokens_seen": 66256720, + "step": 114200 + }, + { + "epoch": 17.009979148048853, + "grad_norm": 3.582779072530684e-06, + "learning_rate": 3.3279505997869442e-06, + "loss": 0.0, + "num_input_tokens_seen": 66259504, + "step": 114205 + }, + { + "epoch": 17.010723860589813, + "grad_norm": 9.107261575991288e-06, + "learning_rate": 3.3263309026207166e-06, + "loss": 0.0, + "num_input_tokens_seen": 66262480, + "step": 114210 + }, + { + "epoch": 17.01146857313077, + "grad_norm": 0.00017416909395251423, + "learning_rate": 3.3247115716090987e-06, + "loss": 0.0, + "num_input_tokens_seen": 66265168, + "step": 114215 + }, + { + "epoch": 17.01221328567173, + "grad_norm": 4.6820299758110195e-05, + "learning_rate": 3.3230926067794516e-06, + "loss": 0.0, + "num_input_tokens_seen": 66268080, + "step": 114220 + }, + { + "epoch": 17.01295799821269, + "grad_norm": 0.00016508194676134735, + "learning_rate": 3.3214740081591173e-06, + "loss": 0.0, + "num_input_tokens_seen": 66271280, + "step": 114225 + }, + { + "epoch": 17.01370271075365, + "grad_norm": 6.6113848333770875e-06, + "learning_rate": 3.3198557757754544e-06, + "loss": 0.0, + "num_input_tokens_seen": 66274256, + "step": 114230 + }, + { + "epoch": 17.01444742329461, + "grad_norm": 9.687061719887424e-06, + "learning_rate": 3.3182379096557916e-06, + "loss": 0.0, + "num_input_tokens_seen": 66277040, + "step": 114235 + }, + { + "epoch": 17.015192135835566, + "grad_norm": 4.2301071516703814e-06, + "learning_rate": 3.3166204098274643e-06, + "loss": 0.0, + "num_input_tokens_seen": 66279952, + "step": 114240 + }, + { + "epoch": 17.015936848376526, + "grad_norm": 0.001965136732906103, + "learning_rate": 3.3150032763177962e-06, + "loss": 0.0, + "num_input_tokens_seen": 66282768, + "step": 114245 + }, + { + "epoch": 17.016681560917487, + "grad_norm": 1.1791594261012506e-05, + "learning_rate": 3.3133865091541037e-06, + "loss": 0.0, + "num_input_tokens_seen": 66285616, + "step": 114250 + }, + { + "epoch": 17.017426273458444, + "grad_norm": 5.927602978772484e-05, + "learning_rate": 3.31177010836371e-06, + "loss": 0.0, + "num_input_tokens_seen": 66288624, + "step": 114255 + }, + { + "epoch": 17.018170985999404, + "grad_norm": 1.9802305359917227e-06, + "learning_rate": 3.310154073973909e-06, + "loss": 0.0, + "num_input_tokens_seen": 66291376, + "step": 114260 + }, + { + "epoch": 17.018915698540365, + "grad_norm": 1.2583301213453524e-05, + "learning_rate": 3.3085384060120185e-06, + "loss": 0.0, + "num_input_tokens_seen": 66294192, + "step": 114265 + }, + { + "epoch": 17.019660411081322, + "grad_norm": 5.506046727532521e-05, + "learning_rate": 3.3069231045053216e-06, + "loss": 0.0, + "num_input_tokens_seen": 66296816, + "step": 114270 + }, + { + "epoch": 17.020405123622282, + "grad_norm": 0.0003117134911008179, + "learning_rate": 3.3053081694811137e-06, + "loss": 0.0, + "num_input_tokens_seen": 66299792, + "step": 114275 + }, + { + "epoch": 17.02114983616324, + "grad_norm": 4.411574627738446e-05, + "learning_rate": 3.303693600966676e-06, + "loss": 0.0, + "num_input_tokens_seen": 66302608, + "step": 114280 + }, + { + "epoch": 17.0218945487042, + "grad_norm": 3.8974449125817046e-06, + "learning_rate": 3.3020793989892774e-06, + "loss": 0.0, + "num_input_tokens_seen": 66305328, + "step": 114285 + }, + { + "epoch": 17.02263926124516, + "grad_norm": 8.916456863516942e-06, + "learning_rate": 3.3004655635761994e-06, + "loss": 0.0, + "num_input_tokens_seen": 66308464, + "step": 114290 + }, + { + "epoch": 17.023383973786117, + "grad_norm": 0.00030220235930755734, + "learning_rate": 3.298852094754698e-06, + "loss": 0.0, + "num_input_tokens_seen": 66311504, + "step": 114295 + }, + { + "epoch": 17.024128686327078, + "grad_norm": 3.3772507777030114e-06, + "learning_rate": 3.29723899255204e-06, + "loss": 0.0, + "num_input_tokens_seen": 66314512, + "step": 114300 + }, + { + "epoch": 17.02487339886804, + "grad_norm": 1.0658766031265259, + "learning_rate": 3.295626256995471e-06, + "loss": 0.0028, + "num_input_tokens_seen": 66317328, + "step": 114305 + }, + { + "epoch": 17.025618111408996, + "grad_norm": 8.37963743833825e-05, + "learning_rate": 3.294013888112235e-06, + "loss": 0.0, + "num_input_tokens_seen": 66320208, + "step": 114310 + }, + { + "epoch": 17.026362823949956, + "grad_norm": 6.427450716728345e-05, + "learning_rate": 3.2924018859295746e-06, + "loss": 0.0, + "num_input_tokens_seen": 66323088, + "step": 114315 + }, + { + "epoch": 17.027107536490913, + "grad_norm": 7.681018360017333e-06, + "learning_rate": 3.290790250474718e-06, + "loss": 0.0, + "num_input_tokens_seen": 66325936, + "step": 114320 + }, + { + "epoch": 17.027852249031874, + "grad_norm": 8.999566489364952e-06, + "learning_rate": 3.2891789817748984e-06, + "loss": 0.0, + "num_input_tokens_seen": 66328752, + "step": 114325 + }, + { + "epoch": 17.028596961572834, + "grad_norm": 6.417314580176026e-05, + "learning_rate": 3.287568079857331e-06, + "loss": 0.0001, + "num_input_tokens_seen": 66331696, + "step": 114330 + }, + { + "epoch": 17.02934167411379, + "grad_norm": 2.971431240439415e-05, + "learning_rate": 3.285957544749238e-06, + "loss": 0.0, + "num_input_tokens_seen": 66334608, + "step": 114335 + }, + { + "epoch": 17.03008638665475, + "grad_norm": 2.753724857029738e-06, + "learning_rate": 3.284347376477817e-06, + "loss": 0.0, + "num_input_tokens_seen": 66337456, + "step": 114340 + }, + { + "epoch": 17.030831099195712, + "grad_norm": 4.321762844483601e-06, + "learning_rate": 3.2827375750702825e-06, + "loss": 0.0066, + "num_input_tokens_seen": 66340464, + "step": 114345 + }, + { + "epoch": 17.03157581173667, + "grad_norm": 6.431726069422439e-05, + "learning_rate": 3.2811281405538188e-06, + "loss": 0.0, + "num_input_tokens_seen": 66343120, + "step": 114350 + }, + { + "epoch": 17.03232052427763, + "grad_norm": 1.2977567166672088e-05, + "learning_rate": 3.2795190729556254e-06, + "loss": 0.0, + "num_input_tokens_seen": 66346256, + "step": 114355 + }, + { + "epoch": 17.033065236818587, + "grad_norm": 1.7930551621248014e-05, + "learning_rate": 3.2779103723028807e-06, + "loss": 0.0, + "num_input_tokens_seen": 66349040, + "step": 114360 + }, + { + "epoch": 17.033809949359547, + "grad_norm": 2.4061000658548437e-05, + "learning_rate": 3.276302038622761e-06, + "loss": 0.0, + "num_input_tokens_seen": 66351824, + "step": 114365 + }, + { + "epoch": 17.034554661900508, + "grad_norm": 4.229334808769636e-05, + "learning_rate": 3.2746940719424414e-06, + "loss": 0.0, + "num_input_tokens_seen": 66354576, + "step": 114370 + }, + { + "epoch": 17.035299374441465, + "grad_norm": 5.639515438815579e-05, + "learning_rate": 3.2730864722890886e-06, + "loss": 0.0, + "num_input_tokens_seen": 66357296, + "step": 114375 + }, + { + "epoch": 17.036044086982425, + "grad_norm": 4.3617696974251885e-06, + "learning_rate": 3.2714792396898534e-06, + "loss": 0.0, + "num_input_tokens_seen": 66360336, + "step": 114380 + }, + { + "epoch": 17.036788799523382, + "grad_norm": 3.410313865970238e-06, + "learning_rate": 3.2698723741718894e-06, + "loss": 0.0, + "num_input_tokens_seen": 66363216, + "step": 114385 + }, + { + "epoch": 17.037533512064343, + "grad_norm": 4.821035690838471e-05, + "learning_rate": 3.2682658757623526e-06, + "loss": 0.0, + "num_input_tokens_seen": 66366128, + "step": 114390 + }, + { + "epoch": 17.038278224605303, + "grad_norm": 0.0003834792587440461, + "learning_rate": 3.2666597444883734e-06, + "loss": 0.0, + "num_input_tokens_seen": 66369104, + "step": 114395 + }, + { + "epoch": 17.03902293714626, + "grad_norm": 5.2123607019893825e-05, + "learning_rate": 3.265053980377086e-06, + "loss": 0.0, + "num_input_tokens_seen": 66372240, + "step": 114400 + }, + { + "epoch": 17.03976764968722, + "grad_norm": 3.881186785292812e-05, + "learning_rate": 3.2634485834556276e-06, + "loss": 0.0, + "num_input_tokens_seen": 66375088, + "step": 114405 + }, + { + "epoch": 17.04051236222818, + "grad_norm": 4.72502906632144e-05, + "learning_rate": 3.2618435537511066e-06, + "loss": 0.0, + "num_input_tokens_seen": 66378032, + "step": 114410 + }, + { + "epoch": 17.04125707476914, + "grad_norm": 5.040710675530136e-06, + "learning_rate": 3.2602388912906482e-06, + "loss": 0.0, + "num_input_tokens_seen": 66381296, + "step": 114415 + }, + { + "epoch": 17.0420017873101, + "grad_norm": 3.114941364401602e-06, + "learning_rate": 3.2586345961013565e-06, + "loss": 0.0005, + "num_input_tokens_seen": 66384208, + "step": 114420 + }, + { + "epoch": 17.042746499851056, + "grad_norm": 2.5056699541892158e-06, + "learning_rate": 3.2570306682103396e-06, + "loss": 0.0, + "num_input_tokens_seen": 66387056, + "step": 114425 + }, + { + "epoch": 17.043491212392016, + "grad_norm": 4.375493972474942e-06, + "learning_rate": 3.2554271076446873e-06, + "loss": 0.0, + "num_input_tokens_seen": 66390512, + "step": 114430 + }, + { + "epoch": 17.044235924932977, + "grad_norm": 0.0005593316163867712, + "learning_rate": 3.2538239144314974e-06, + "loss": 0.0, + "num_input_tokens_seen": 66393232, + "step": 114435 + }, + { + "epoch": 17.044980637473934, + "grad_norm": 2.6297150270693237e-06, + "learning_rate": 3.252221088597854e-06, + "loss": 0.0, + "num_input_tokens_seen": 66396080, + "step": 114440 + }, + { + "epoch": 17.045725350014894, + "grad_norm": 9.078533184947446e-05, + "learning_rate": 3.250618630170829e-06, + "loss": 0.0, + "num_input_tokens_seen": 66398960, + "step": 114445 + }, + { + "epoch": 17.046470062555855, + "grad_norm": 0.00033582650939933956, + "learning_rate": 3.2490165391774963e-06, + "loss": 0.0, + "num_input_tokens_seen": 66402416, + "step": 114450 + }, + { + "epoch": 17.047214775096812, + "grad_norm": 8.636809070594609e-06, + "learning_rate": 3.2474148156449195e-06, + "loss": 0.0, + "num_input_tokens_seen": 66405072, + "step": 114455 + }, + { + "epoch": 17.047959487637772, + "grad_norm": 3.658285095298197e-06, + "learning_rate": 3.2458134596001636e-06, + "loss": 0.0588, + "num_input_tokens_seen": 66407696, + "step": 114460 + }, + { + "epoch": 17.04870420017873, + "grad_norm": 4.3538311729207635e-05, + "learning_rate": 3.2442124710702764e-06, + "loss": 0.0, + "num_input_tokens_seen": 66410896, + "step": 114465 + }, + { + "epoch": 17.04944891271969, + "grad_norm": 3.598496505219373e-06, + "learning_rate": 3.242611850082314e-06, + "loss": 0.3313, + "num_input_tokens_seen": 66414000, + "step": 114470 + }, + { + "epoch": 17.05019362526065, + "grad_norm": 0.0002671558759175241, + "learning_rate": 3.2410115966633044e-06, + "loss": 0.0, + "num_input_tokens_seen": 66417040, + "step": 114475 + }, + { + "epoch": 17.050938337801608, + "grad_norm": 0.0003467572678346187, + "learning_rate": 3.239411710840293e-06, + "loss": 0.0, + "num_input_tokens_seen": 66420112, + "step": 114480 + }, + { + "epoch": 17.051683050342568, + "grad_norm": 0.00021610436670016497, + "learning_rate": 3.2378121926403077e-06, + "loss": 0.0, + "num_input_tokens_seen": 66423056, + "step": 114485 + }, + { + "epoch": 17.05242776288353, + "grad_norm": 5.682776100002229e-05, + "learning_rate": 3.236213042090358e-06, + "loss": 0.1326, + "num_input_tokens_seen": 66426160, + "step": 114490 + }, + { + "epoch": 17.053172475424486, + "grad_norm": 2.4684399249963462e-05, + "learning_rate": 3.234614259217478e-06, + "loss": 0.0, + "num_input_tokens_seen": 66428816, + "step": 114495 + }, + { + "epoch": 17.053917187965446, + "grad_norm": 3.7638586945831776e-05, + "learning_rate": 3.2330158440486672e-06, + "loss": 0.0, + "num_input_tokens_seen": 66431568, + "step": 114500 + }, + { + "epoch": 17.054661900506403, + "grad_norm": 1.5006249668658711e-05, + "learning_rate": 3.231417796610925e-06, + "loss": 0.0, + "num_input_tokens_seen": 66434544, + "step": 114505 + }, + { + "epoch": 17.055406613047364, + "grad_norm": 8.480184078507591e-06, + "learning_rate": 3.229820116931259e-06, + "loss": 0.0, + "num_input_tokens_seen": 66437200, + "step": 114510 + }, + { + "epoch": 17.056151325588324, + "grad_norm": 3.944361014873721e-05, + "learning_rate": 3.228222805036657e-06, + "loss": 0.0, + "num_input_tokens_seen": 66440080, + "step": 114515 + }, + { + "epoch": 17.05689603812928, + "grad_norm": 2.6816376248461893e-06, + "learning_rate": 3.226625860954105e-06, + "loss": 0.0011, + "num_input_tokens_seen": 66442960, + "step": 114520 + }, + { + "epoch": 17.05764075067024, + "grad_norm": 0.00016658761887811124, + "learning_rate": 3.225029284710571e-06, + "loss": 0.0, + "num_input_tokens_seen": 66445776, + "step": 114525 + }, + { + "epoch": 17.058385463211202, + "grad_norm": 6.554836727445945e-05, + "learning_rate": 3.2234330763330432e-06, + "loss": 0.0, + "num_input_tokens_seen": 66448624, + "step": 114530 + }, + { + "epoch": 17.05913017575216, + "grad_norm": 2.7348398816684494e-06, + "learning_rate": 3.221837235848474e-06, + "loss": 0.0, + "num_input_tokens_seen": 66451408, + "step": 114535 + }, + { + "epoch": 17.05987488829312, + "grad_norm": 3.430119932090747e-06, + "learning_rate": 3.220241763283838e-06, + "loss": 0.0, + "num_input_tokens_seen": 66454000, + "step": 114540 + }, + { + "epoch": 17.060619600834077, + "grad_norm": 5.360194336390123e-06, + "learning_rate": 3.2186466586660746e-06, + "loss": 0.0, + "num_input_tokens_seen": 66456752, + "step": 114545 + }, + { + "epoch": 17.061364313375037, + "grad_norm": 3.3874024666147307e-05, + "learning_rate": 3.2170519220221435e-06, + "loss": 0.0, + "num_input_tokens_seen": 66459536, + "step": 114550 + }, + { + "epoch": 17.062109025915998, + "grad_norm": 3.3472408631496364e-06, + "learning_rate": 3.2154575533789753e-06, + "loss": 0.0, + "num_input_tokens_seen": 66462320, + "step": 114555 + }, + { + "epoch": 17.062853738456955, + "grad_norm": 2.71102453552885e-06, + "learning_rate": 3.2138635527635186e-06, + "loss": 0.0, + "num_input_tokens_seen": 66465232, + "step": 114560 + }, + { + "epoch": 17.063598450997915, + "grad_norm": 0.0001460077182855457, + "learning_rate": 3.2122699202026927e-06, + "loss": 0.0, + "num_input_tokens_seen": 66468208, + "step": 114565 + }, + { + "epoch": 17.064343163538872, + "grad_norm": 6.1548707890324295e-06, + "learning_rate": 3.2106766557234243e-06, + "loss": 0.0, + "num_input_tokens_seen": 66471056, + "step": 114570 + }, + { + "epoch": 17.065087876079833, + "grad_norm": 5.275568582874257e-06, + "learning_rate": 3.209083759352627e-06, + "loss": 0.0, + "num_input_tokens_seen": 66473872, + "step": 114575 + }, + { + "epoch": 17.065832588620793, + "grad_norm": 6.797336482122773e-06, + "learning_rate": 3.2074912311172046e-06, + "loss": 0.0, + "num_input_tokens_seen": 66476432, + "step": 114580 + }, + { + "epoch": 17.06657730116175, + "grad_norm": 4.3440863919386175e-06, + "learning_rate": 3.2058990710440773e-06, + "loss": 0.0, + "num_input_tokens_seen": 66479248, + "step": 114585 + }, + { + "epoch": 17.06732201370271, + "grad_norm": 2.324441447854042e-05, + "learning_rate": 3.2043072791601293e-06, + "loss": 0.0, + "num_input_tokens_seen": 66481808, + "step": 114590 + }, + { + "epoch": 17.06806672624367, + "grad_norm": 0.00011767115211114287, + "learning_rate": 3.202715855492261e-06, + "loss": 0.0, + "num_input_tokens_seen": 66484752, + "step": 114595 + }, + { + "epoch": 17.06881143878463, + "grad_norm": 1.434861133020604e-05, + "learning_rate": 3.201124800067357e-06, + "loss": 0.0, + "num_input_tokens_seen": 66487536, + "step": 114600 + }, + { + "epoch": 17.06955615132559, + "grad_norm": 4.695978077506879e-06, + "learning_rate": 3.1995341129122864e-06, + "loss": 0.0, + "num_input_tokens_seen": 66490352, + "step": 114605 + }, + { + "epoch": 17.070300863866546, + "grad_norm": 3.6546539377013687e-06, + "learning_rate": 3.197943794053937e-06, + "loss": 0.0, + "num_input_tokens_seen": 66493136, + "step": 114610 + }, + { + "epoch": 17.071045576407506, + "grad_norm": 6.12548183198669e-06, + "learning_rate": 3.196353843519162e-06, + "loss": 0.0, + "num_input_tokens_seen": 66496080, + "step": 114615 + }, + { + "epoch": 17.071790288948467, + "grad_norm": 0.0001402013876941055, + "learning_rate": 3.1947642613348344e-06, + "loss": 0.0, + "num_input_tokens_seen": 66499440, + "step": 114620 + }, + { + "epoch": 17.072535001489424, + "grad_norm": 3.0800269996689167e-06, + "learning_rate": 3.193175047527797e-06, + "loss": 0.0, + "num_input_tokens_seen": 66502544, + "step": 114625 + }, + { + "epoch": 17.073279714030384, + "grad_norm": 4.820852154807653e-06, + "learning_rate": 3.1915862021249105e-06, + "loss": 0.0, + "num_input_tokens_seen": 66505200, + "step": 114630 + }, + { + "epoch": 17.074024426571345, + "grad_norm": 5.90307490710984e-06, + "learning_rate": 3.18999772515301e-06, + "loss": 0.0, + "num_input_tokens_seen": 66508048, + "step": 114635 + }, + { + "epoch": 17.074769139112302, + "grad_norm": 1.618527494429145e-05, + "learning_rate": 3.1884096166389292e-06, + "loss": 0.0, + "num_input_tokens_seen": 66510832, + "step": 114640 + }, + { + "epoch": 17.075513851653263, + "grad_norm": 0.0016341108130291104, + "learning_rate": 3.1868218766095e-06, + "loss": 0.0, + "num_input_tokens_seen": 66513776, + "step": 114645 + }, + { + "epoch": 17.07625856419422, + "grad_norm": 4.118883680348517e-06, + "learning_rate": 3.1852345050915415e-06, + "loss": 0.0, + "num_input_tokens_seen": 66516752, + "step": 114650 + }, + { + "epoch": 17.07700327673518, + "grad_norm": 6.387408211594447e-05, + "learning_rate": 3.1836475021118804e-06, + "loss": 0.0, + "num_input_tokens_seen": 66519664, + "step": 114655 + }, + { + "epoch": 17.07774798927614, + "grad_norm": 1.4327621101983823e-05, + "learning_rate": 3.1820608676973144e-06, + "loss": 0.0001, + "num_input_tokens_seen": 66522704, + "step": 114660 + }, + { + "epoch": 17.078492701817098, + "grad_norm": 3.8644826418021694e-05, + "learning_rate": 3.180474601874661e-06, + "loss": 0.0, + "num_input_tokens_seen": 66525648, + "step": 114665 + }, + { + "epoch": 17.079237414358058, + "grad_norm": 6.955206481507048e-05, + "learning_rate": 3.1788887046707072e-06, + "loss": 0.0006, + "num_input_tokens_seen": 66528656, + "step": 114670 + }, + { + "epoch": 17.07998212689902, + "grad_norm": 2.8868091703770915e-06, + "learning_rate": 3.177303176112256e-06, + "loss": 0.0, + "num_input_tokens_seen": 66531344, + "step": 114675 + }, + { + "epoch": 17.080726839439976, + "grad_norm": 4.396217264002189e-06, + "learning_rate": 3.1757180162260897e-06, + "loss": 0.0, + "num_input_tokens_seen": 66534096, + "step": 114680 + }, + { + "epoch": 17.081471551980936, + "grad_norm": 5.609306754195131e-06, + "learning_rate": 3.174133225038978e-06, + "loss": 0.0, + "num_input_tokens_seen": 66537168, + "step": 114685 + }, + { + "epoch": 17.082216264521893, + "grad_norm": 5.420672550826566e-06, + "learning_rate": 3.17254880257771e-06, + "loss": 0.0, + "num_input_tokens_seen": 66539888, + "step": 114690 + }, + { + "epoch": 17.082960977062854, + "grad_norm": 2.9683028515137266e-06, + "learning_rate": 3.1709647488690404e-06, + "loss": 0.0, + "num_input_tokens_seen": 66542704, + "step": 114695 + }, + { + "epoch": 17.083705689603814, + "grad_norm": 3.2468522022099933e-06, + "learning_rate": 3.1693810639397412e-06, + "loss": 0.0, + "num_input_tokens_seen": 66545712, + "step": 114700 + }, + { + "epoch": 17.08445040214477, + "grad_norm": 9.631406101107132e-06, + "learning_rate": 3.1677977478165588e-06, + "loss": 0.0, + "num_input_tokens_seen": 66548464, + "step": 114705 + }, + { + "epoch": 17.08519511468573, + "grad_norm": 0.0007614174392074347, + "learning_rate": 3.166214800526246e-06, + "loss": 0.0, + "num_input_tokens_seen": 66551664, + "step": 114710 + }, + { + "epoch": 17.085939827226692, + "grad_norm": 4.513456951826811e-05, + "learning_rate": 3.1646322220955372e-06, + "loss": 0.0, + "num_input_tokens_seen": 66554352, + "step": 114715 + }, + { + "epoch": 17.08668453976765, + "grad_norm": 7.27462065697182e-06, + "learning_rate": 3.16305001255118e-06, + "loss": 0.0, + "num_input_tokens_seen": 66557328, + "step": 114720 + }, + { + "epoch": 17.08742925230861, + "grad_norm": 0.00015409181651193649, + "learning_rate": 3.1614681719199015e-06, + "loss": 0.0, + "num_input_tokens_seen": 66560112, + "step": 114725 + }, + { + "epoch": 17.088173964849567, + "grad_norm": 2.7946629415964708e-05, + "learning_rate": 3.1598867002284148e-06, + "loss": 0.0, + "num_input_tokens_seen": 66563184, + "step": 114730 + }, + { + "epoch": 17.088918677390527, + "grad_norm": 7.508589533244958e-06, + "learning_rate": 3.15830559750345e-06, + "loss": 0.0, + "num_input_tokens_seen": 66566032, + "step": 114735 + }, + { + "epoch": 17.089663389931488, + "grad_norm": 1.3595207747130189e-05, + "learning_rate": 3.1567248637717066e-06, + "loss": 0.0, + "num_input_tokens_seen": 66568912, + "step": 114740 + }, + { + "epoch": 17.090408102472445, + "grad_norm": 2.2831203750683926e-05, + "learning_rate": 3.1551444990599033e-06, + "loss": 0.0, + "num_input_tokens_seen": 66572048, + "step": 114745 + }, + { + "epoch": 17.091152815013405, + "grad_norm": 0.00010631714394548908, + "learning_rate": 3.1535645033947265e-06, + "loss": 0.0, + "num_input_tokens_seen": 66574704, + "step": 114750 + }, + { + "epoch": 17.091897527554362, + "grad_norm": 0.0007159795495681465, + "learning_rate": 3.15198487680288e-06, + "loss": 0.0, + "num_input_tokens_seen": 66577456, + "step": 114755 + }, + { + "epoch": 17.092642240095323, + "grad_norm": 1.974440601770766e-05, + "learning_rate": 3.150405619311042e-06, + "loss": 0.0, + "num_input_tokens_seen": 66580688, + "step": 114760 + }, + { + "epoch": 17.093386952636283, + "grad_norm": 0.0006673481548205018, + "learning_rate": 3.148826730945889e-06, + "loss": 0.0, + "num_input_tokens_seen": 66583824, + "step": 114765 + }, + { + "epoch": 17.09413166517724, + "grad_norm": 0.00040577456820756197, + "learning_rate": 3.147248211734105e-06, + "loss": 0.0, + "num_input_tokens_seen": 66586896, + "step": 114770 + }, + { + "epoch": 17.0948763777182, + "grad_norm": 5.8547007938614115e-05, + "learning_rate": 3.145670061702352e-06, + "loss": 0.0, + "num_input_tokens_seen": 66589840, + "step": 114775 + }, + { + "epoch": 17.09562109025916, + "grad_norm": 2.1666135580744594e-05, + "learning_rate": 3.144092280877292e-06, + "loss": 0.0, + "num_input_tokens_seen": 66593040, + "step": 114780 + }, + { + "epoch": 17.09636580280012, + "grad_norm": 0.00015847122995182872, + "learning_rate": 3.1425148692855734e-06, + "loss": 0.0, + "num_input_tokens_seen": 66595824, + "step": 114785 + }, + { + "epoch": 17.09711051534108, + "grad_norm": 0.0010066631948575377, + "learning_rate": 3.1409378269538574e-06, + "loss": 0.0, + "num_input_tokens_seen": 66598832, + "step": 114790 + }, + { + "epoch": 17.097855227882036, + "grad_norm": 0.00035183122963644564, + "learning_rate": 3.1393611539087765e-06, + "loss": 0.0, + "num_input_tokens_seen": 66601936, + "step": 114795 + }, + { + "epoch": 17.098599940422996, + "grad_norm": 5.488452643476194e-06, + "learning_rate": 3.1377848501769724e-06, + "loss": 0.0, + "num_input_tokens_seen": 66605008, + "step": 114800 + }, + { + "epoch": 17.099344652963957, + "grad_norm": 3.7293502828106284e-06, + "learning_rate": 3.136208915785077e-06, + "loss": 0.0, + "num_input_tokens_seen": 66608016, + "step": 114805 + }, + { + "epoch": 17.100089365504914, + "grad_norm": 6.996480806265026e-06, + "learning_rate": 3.1346333507597027e-06, + "loss": 0.0, + "num_input_tokens_seen": 66610576, + "step": 114810 + }, + { + "epoch": 17.100834078045875, + "grad_norm": 9.470829536439851e-05, + "learning_rate": 3.1330581551274827e-06, + "loss": 0.0, + "num_input_tokens_seen": 66613424, + "step": 114815 + }, + { + "epoch": 17.101578790586835, + "grad_norm": 1.5245355825754814e-05, + "learning_rate": 3.1314833289150138e-06, + "loss": 0.0, + "num_input_tokens_seen": 66616176, + "step": 114820 + }, + { + "epoch": 17.102323503127792, + "grad_norm": 4.066458859597333e-05, + "learning_rate": 3.129908872148912e-06, + "loss": 0.0, + "num_input_tokens_seen": 66619216, + "step": 114825 + }, + { + "epoch": 17.103068215668753, + "grad_norm": 3.263630060246214e-05, + "learning_rate": 3.128334784855774e-06, + "loss": 0.0, + "num_input_tokens_seen": 66622224, + "step": 114830 + }, + { + "epoch": 17.10381292820971, + "grad_norm": 1.1442100003478117e-05, + "learning_rate": 3.126761067062184e-06, + "loss": 0.0, + "num_input_tokens_seen": 66625168, + "step": 114835 + }, + { + "epoch": 17.10455764075067, + "grad_norm": 0.00043110441765747964, + "learning_rate": 3.125187718794742e-06, + "loss": 0.1563, + "num_input_tokens_seen": 66628240, + "step": 114840 + }, + { + "epoch": 17.10530235329163, + "grad_norm": 3.3537562558194622e-06, + "learning_rate": 3.1236147400800194e-06, + "loss": 0.0, + "num_input_tokens_seen": 66631280, + "step": 114845 + }, + { + "epoch": 17.106047065832588, + "grad_norm": 9.274903277400881e-06, + "learning_rate": 3.1220421309445913e-06, + "loss": 0.0, + "num_input_tokens_seen": 66634128, + "step": 114850 + }, + { + "epoch": 17.106791778373548, + "grad_norm": 4.114735475013731e-06, + "learning_rate": 3.1204698914150205e-06, + "loss": 0.0, + "num_input_tokens_seen": 66636848, + "step": 114855 + }, + { + "epoch": 17.10753649091451, + "grad_norm": 4.59455004602205e-05, + "learning_rate": 3.11889802151788e-06, + "loss": 0.0, + "num_input_tokens_seen": 66640112, + "step": 114860 + }, + { + "epoch": 17.108281203455466, + "grad_norm": 0.10851510614156723, + "learning_rate": 3.117326521279712e-06, + "loss": 0.0, + "num_input_tokens_seen": 66643184, + "step": 114865 + }, + { + "epoch": 17.109025915996426, + "grad_norm": 0.001536427065730095, + "learning_rate": 3.1157553907270766e-06, + "loss": 0.0, + "num_input_tokens_seen": 66645712, + "step": 114870 + }, + { + "epoch": 17.109770628537383, + "grad_norm": 2.6182477085967548e-05, + "learning_rate": 3.1141846298865074e-06, + "loss": 0.0, + "num_input_tokens_seen": 66648400, + "step": 114875 + }, + { + "epoch": 17.110515341078344, + "grad_norm": 9.448070159123745e-06, + "learning_rate": 3.11261423878455e-06, + "loss": 0.0, + "num_input_tokens_seen": 66651056, + "step": 114880 + }, + { + "epoch": 17.111260053619304, + "grad_norm": 2.721287955864682e-06, + "learning_rate": 3.111044217447731e-06, + "loss": 0.0, + "num_input_tokens_seen": 66654032, + "step": 114885 + }, + { + "epoch": 17.11200476616026, + "grad_norm": 3.4267446608282626e-05, + "learning_rate": 3.1094745659025674e-06, + "loss": 0.0, + "num_input_tokens_seen": 66656784, + "step": 114890 + }, + { + "epoch": 17.11274947870122, + "grad_norm": 6.467821367550641e-05, + "learning_rate": 3.1079052841755857e-06, + "loss": 0.0, + "num_input_tokens_seen": 66659760, + "step": 114895 + }, + { + "epoch": 17.113494191242182, + "grad_norm": 0.00015660816279705614, + "learning_rate": 3.1063363722932975e-06, + "loss": 0.0, + "num_input_tokens_seen": 66662736, + "step": 114900 + }, + { + "epoch": 17.11423890378314, + "grad_norm": 6.930911331437528e-05, + "learning_rate": 3.1047678302822016e-06, + "loss": 0.0, + "num_input_tokens_seen": 66665552, + "step": 114905 + }, + { + "epoch": 17.1149836163241, + "grad_norm": 0.0003000996366608888, + "learning_rate": 3.1031996581687955e-06, + "loss": 0.0, + "num_input_tokens_seen": 66668432, + "step": 114910 + }, + { + "epoch": 17.115728328865057, + "grad_norm": 0.00014557912072632462, + "learning_rate": 3.101631855979581e-06, + "loss": 0.0, + "num_input_tokens_seen": 66671120, + "step": 114915 + }, + { + "epoch": 17.116473041406017, + "grad_norm": 2.9142705898266286e-05, + "learning_rate": 3.100064423741042e-06, + "loss": 0.0284, + "num_input_tokens_seen": 66674128, + "step": 114920 + }, + { + "epoch": 17.117217753946978, + "grad_norm": 9.358223906019703e-06, + "learning_rate": 3.098497361479649e-06, + "loss": 0.0, + "num_input_tokens_seen": 66676784, + "step": 114925 + }, + { + "epoch": 17.117962466487935, + "grad_norm": 8.227903163060546e-05, + "learning_rate": 3.0969306692218897e-06, + "loss": 0.0, + "num_input_tokens_seen": 66679888, + "step": 114930 + }, + { + "epoch": 17.118707179028895, + "grad_norm": 3.928892965632258e-06, + "learning_rate": 3.0953643469942173e-06, + "loss": 0.0, + "num_input_tokens_seen": 66682928, + "step": 114935 + }, + { + "epoch": 17.119451891569852, + "grad_norm": 0.0006627513794228435, + "learning_rate": 3.093798394823111e-06, + "loss": 0.0, + "num_input_tokens_seen": 66685648, + "step": 114940 + }, + { + "epoch": 17.120196604110813, + "grad_norm": 5.9787587815662846e-05, + "learning_rate": 3.0922328127350076e-06, + "loss": 0.0, + "num_input_tokens_seen": 66688528, + "step": 114945 + }, + { + "epoch": 17.120941316651773, + "grad_norm": 0.0001109011864173226, + "learning_rate": 3.090667600756372e-06, + "loss": 0.0001, + "num_input_tokens_seen": 66691376, + "step": 114950 + }, + { + "epoch": 17.12168602919273, + "grad_norm": 3.355354056111537e-05, + "learning_rate": 3.089102758913634e-06, + "loss": 0.0, + "num_input_tokens_seen": 66694096, + "step": 114955 + }, + { + "epoch": 17.12243074173369, + "grad_norm": 6.712251342833042e-05, + "learning_rate": 3.087538287233241e-06, + "loss": 0.0, + "num_input_tokens_seen": 66697264, + "step": 114960 + }, + { + "epoch": 17.12317545427465, + "grad_norm": 2.4392338673351333e-06, + "learning_rate": 3.0859741857416193e-06, + "loss": 0.0, + "num_input_tokens_seen": 66700080, + "step": 114965 + }, + { + "epoch": 17.12392016681561, + "grad_norm": 0.001154731260612607, + "learning_rate": 3.0844104544651893e-06, + "loss": 0.0, + "num_input_tokens_seen": 66702736, + "step": 114970 + }, + { + "epoch": 17.12466487935657, + "grad_norm": 0.06530711054801941, + "learning_rate": 3.082847093430369e-06, + "loss": 0.0, + "num_input_tokens_seen": 66705616, + "step": 114975 + }, + { + "epoch": 17.125409591897526, + "grad_norm": 0.00019213910854887217, + "learning_rate": 3.0812841026635705e-06, + "loss": 0.0, + "num_input_tokens_seen": 66708688, + "step": 114980 + }, + { + "epoch": 17.126154304438487, + "grad_norm": 3.739166641025804e-05, + "learning_rate": 3.079721482191203e-06, + "loss": 0.0, + "num_input_tokens_seen": 66711536, + "step": 114985 + }, + { + "epoch": 17.126899016979447, + "grad_norm": 1.6695465092197992e-05, + "learning_rate": 3.0781592320396568e-06, + "loss": 0.0, + "num_input_tokens_seen": 66714384, + "step": 114990 + }, + { + "epoch": 17.127643729520404, + "grad_norm": 1.2150996553828008e-05, + "learning_rate": 3.076597352235333e-06, + "loss": 0.0, + "num_input_tokens_seen": 66717584, + "step": 114995 + }, + { + "epoch": 17.128388442061365, + "grad_norm": 3.2957623261609115e-06, + "learning_rate": 3.075035842804619e-06, + "loss": 0.0, + "num_input_tokens_seen": 66720400, + "step": 115000 + }, + { + "epoch": 17.129133154602325, + "grad_norm": 1.2402291758917272e-05, + "learning_rate": 3.073474703773885e-06, + "loss": 0.0, + "num_input_tokens_seen": 66723216, + "step": 115005 + }, + { + "epoch": 17.129877867143282, + "grad_norm": 6.470187599916244e-06, + "learning_rate": 3.0719139351695125e-06, + "loss": 0.0, + "num_input_tokens_seen": 66726160, + "step": 115010 + }, + { + "epoch": 17.130622579684243, + "grad_norm": 1.2281465387786739e-05, + "learning_rate": 3.070353537017867e-06, + "loss": 0.0, + "num_input_tokens_seen": 66729328, + "step": 115015 + }, + { + "epoch": 17.1313672922252, + "grad_norm": 5.167109975445783e-06, + "learning_rate": 3.0687935093453106e-06, + "loss": 0.0, + "num_input_tokens_seen": 66732272, + "step": 115020 + }, + { + "epoch": 17.13211200476616, + "grad_norm": 1.753570541040972e-05, + "learning_rate": 3.0672338521781975e-06, + "loss": 0.0, + "num_input_tokens_seen": 66735024, + "step": 115025 + }, + { + "epoch": 17.13285671730712, + "grad_norm": 1.8410311213301611e-06, + "learning_rate": 3.0656745655428783e-06, + "loss": 0.0, + "num_input_tokens_seen": 66737840, + "step": 115030 + }, + { + "epoch": 17.133601429848078, + "grad_norm": 2.1276800907799043e-06, + "learning_rate": 3.0641156494656957e-06, + "loss": 0.0, + "num_input_tokens_seen": 66740496, + "step": 115035 + }, + { + "epoch": 17.134346142389038, + "grad_norm": 5.638456786982715e-05, + "learning_rate": 3.062557103972985e-06, + "loss": 0.0, + "num_input_tokens_seen": 66743472, + "step": 115040 + }, + { + "epoch": 17.13509085493, + "grad_norm": 3.875861693813931e-06, + "learning_rate": 3.0609989290910775e-06, + "loss": 0.0, + "num_input_tokens_seen": 66746480, + "step": 115045 + }, + { + "epoch": 17.135835567470956, + "grad_norm": 0.07246973365545273, + "learning_rate": 3.059441124846288e-06, + "loss": 0.0, + "num_input_tokens_seen": 66749424, + "step": 115050 + }, + { + "epoch": 17.136580280011916, + "grad_norm": 6.53384777251631e-05, + "learning_rate": 3.0578836912649458e-06, + "loss": 0.0, + "num_input_tokens_seen": 66752240, + "step": 115055 + }, + { + "epoch": 17.137324992552873, + "grad_norm": 7.535035820183111e-06, + "learning_rate": 3.0563266283733517e-06, + "loss": 0.0, + "num_input_tokens_seen": 66754768, + "step": 115060 + }, + { + "epoch": 17.138069705093834, + "grad_norm": 1.233604325534543e-05, + "learning_rate": 3.054769936197824e-06, + "loss": 0.0, + "num_input_tokens_seen": 66757872, + "step": 115065 + }, + { + "epoch": 17.138814417634794, + "grad_norm": 4.685803560278146e-06, + "learning_rate": 3.0532136147646496e-06, + "loss": 0.0, + "num_input_tokens_seen": 66760944, + "step": 115070 + }, + { + "epoch": 17.13955913017575, + "grad_norm": 0.001100495457649231, + "learning_rate": 3.05165766410013e-06, + "loss": 0.0, + "num_input_tokens_seen": 66764144, + "step": 115075 + }, + { + "epoch": 17.140303842716712, + "grad_norm": 6.252089951885864e-05, + "learning_rate": 3.050102084230541e-06, + "loss": 0.0, + "num_input_tokens_seen": 66766928, + "step": 115080 + }, + { + "epoch": 17.14104855525767, + "grad_norm": 4.21221338910982e-05, + "learning_rate": 3.0485468751821735e-06, + "loss": 0.0, + "num_input_tokens_seen": 66769776, + "step": 115085 + }, + { + "epoch": 17.14179326779863, + "grad_norm": 4.543330578599125e-05, + "learning_rate": 3.046992036981294e-06, + "loss": 0.0, + "num_input_tokens_seen": 66772592, + "step": 115090 + }, + { + "epoch": 17.14253798033959, + "grad_norm": 5.281316043692641e-05, + "learning_rate": 3.0454375696541694e-06, + "loss": 0.0, + "num_input_tokens_seen": 66775696, + "step": 115095 + }, + { + "epoch": 17.143282692880547, + "grad_norm": 3.869810461765155e-05, + "learning_rate": 3.0438834732270686e-06, + "loss": 0.0, + "num_input_tokens_seen": 66778576, + "step": 115100 + }, + { + "epoch": 17.144027405421507, + "grad_norm": 2.8137371828051982e-06, + "learning_rate": 3.0423297477262415e-06, + "loss": 0.0, + "num_input_tokens_seen": 66781616, + "step": 115105 + }, + { + "epoch": 17.144772117962468, + "grad_norm": 6.990248220972717e-05, + "learning_rate": 3.0407763931779354e-06, + "loss": 0.0, + "num_input_tokens_seen": 66784496, + "step": 115110 + }, + { + "epoch": 17.145516830503425, + "grad_norm": 8.73302633408457e-06, + "learning_rate": 3.039223409608391e-06, + "loss": 0.0, + "num_input_tokens_seen": 66787408, + "step": 115115 + }, + { + "epoch": 17.146261543044385, + "grad_norm": 1.1102051757916342e-05, + "learning_rate": 3.0376707970438513e-06, + "loss": 0.0426, + "num_input_tokens_seen": 66790224, + "step": 115120 + }, + { + "epoch": 17.147006255585342, + "grad_norm": 2.3591394437971758e-06, + "learning_rate": 3.036118555510539e-06, + "loss": 0.0, + "num_input_tokens_seen": 66792944, + "step": 115125 + }, + { + "epoch": 17.147750968126303, + "grad_norm": 0.00012433585652615875, + "learning_rate": 3.0345666850346787e-06, + "loss": 0.0, + "num_input_tokens_seen": 66796016, + "step": 115130 + }, + { + "epoch": 17.148495680667263, + "grad_norm": 2.8528336770250462e-05, + "learning_rate": 3.033015185642493e-06, + "loss": 0.0, + "num_input_tokens_seen": 66798576, + "step": 115135 + }, + { + "epoch": 17.14924039320822, + "grad_norm": 2.6788433387991972e-05, + "learning_rate": 3.0314640573601864e-06, + "loss": 0.0, + "num_input_tokens_seen": 66801328, + "step": 115140 + }, + { + "epoch": 17.14998510574918, + "grad_norm": 2.4423638024018146e-05, + "learning_rate": 3.029913300213971e-06, + "loss": 0.0, + "num_input_tokens_seen": 66804208, + "step": 115145 + }, + { + "epoch": 17.15072981829014, + "grad_norm": 6.0721748013747856e-05, + "learning_rate": 3.0283629142300347e-06, + "loss": 0.0, + "num_input_tokens_seen": 66806768, + "step": 115150 + }, + { + "epoch": 17.1514745308311, + "grad_norm": 5.56081840841216e-06, + "learning_rate": 3.0268128994345807e-06, + "loss": 0.0, + "num_input_tokens_seen": 66809648, + "step": 115155 + }, + { + "epoch": 17.15221924337206, + "grad_norm": 0.0031480968464165926, + "learning_rate": 3.0252632558537913e-06, + "loss": 0.0, + "num_input_tokens_seen": 66812656, + "step": 115160 + }, + { + "epoch": 17.152963955913016, + "grad_norm": 2.1114701667102054e-06, + "learning_rate": 3.0237139835138402e-06, + "loss": 0.0, + "num_input_tokens_seen": 66815440, + "step": 115165 + }, + { + "epoch": 17.153708668453977, + "grad_norm": 9.900111763272434e-06, + "learning_rate": 3.0221650824409114e-06, + "loss": 0.0, + "num_input_tokens_seen": 66818192, + "step": 115170 + }, + { + "epoch": 17.154453380994937, + "grad_norm": 8.374704520974774e-06, + "learning_rate": 3.0206165526611654e-06, + "loss": 0.0, + "num_input_tokens_seen": 66820912, + "step": 115175 + }, + { + "epoch": 17.155198093535894, + "grad_norm": 3.2887190172914416e-05, + "learning_rate": 3.0190683942007637e-06, + "loss": 0.0, + "num_input_tokens_seen": 66824016, + "step": 115180 + }, + { + "epoch": 17.155942806076855, + "grad_norm": 0.00018659827765077353, + "learning_rate": 3.017520607085858e-06, + "loss": 0.0, + "num_input_tokens_seen": 66826896, + "step": 115185 + }, + { + "epoch": 17.156687518617815, + "grad_norm": 0.000813258346170187, + "learning_rate": 3.0159731913426027e-06, + "loss": 0.0, + "num_input_tokens_seen": 66830064, + "step": 115190 + }, + { + "epoch": 17.157432231158772, + "grad_norm": 2.0134142687311396e-05, + "learning_rate": 3.014426146997132e-06, + "loss": 0.0, + "num_input_tokens_seen": 66833232, + "step": 115195 + }, + { + "epoch": 17.158176943699733, + "grad_norm": 0.0001305911864619702, + "learning_rate": 3.0128794740755916e-06, + "loss": 0.0001, + "num_input_tokens_seen": 66835952, + "step": 115200 + }, + { + "epoch": 17.15892165624069, + "grad_norm": 2.7969419534201734e-05, + "learning_rate": 3.0113331726041055e-06, + "loss": 0.0, + "num_input_tokens_seen": 66838800, + "step": 115205 + }, + { + "epoch": 17.15966636878165, + "grad_norm": 3.349161124788225e-05, + "learning_rate": 3.0097872426087914e-06, + "loss": 0.0, + "num_input_tokens_seen": 66841680, + "step": 115210 + }, + { + "epoch": 17.16041108132261, + "grad_norm": 4.993385118723381e-06, + "learning_rate": 3.0082416841157783e-06, + "loss": 0.0, + "num_input_tokens_seen": 66844400, + "step": 115215 + }, + { + "epoch": 17.161155793863568, + "grad_norm": 3.0253104341682047e-06, + "learning_rate": 3.006696497151165e-06, + "loss": 0.0, + "num_input_tokens_seen": 66847152, + "step": 115220 + }, + { + "epoch": 17.16190050640453, + "grad_norm": 0.00028020600439049304, + "learning_rate": 3.005151681741067e-06, + "loss": 0.0, + "num_input_tokens_seen": 66850128, + "step": 115225 + }, + { + "epoch": 17.16264521894549, + "grad_norm": 0.0017194128595292568, + "learning_rate": 3.0036072379115737e-06, + "loss": 0.0001, + "num_input_tokens_seen": 66853040, + "step": 115230 + }, + { + "epoch": 17.163389931486446, + "grad_norm": 8.011589670786634e-06, + "learning_rate": 3.0020631656887845e-06, + "loss": 0.0, + "num_input_tokens_seen": 66856016, + "step": 115235 + }, + { + "epoch": 17.164134644027406, + "grad_norm": 2.4755068807280622e-05, + "learning_rate": 3.000519465098772e-06, + "loss": 0.0, + "num_input_tokens_seen": 66858896, + "step": 115240 + }, + { + "epoch": 17.164879356568363, + "grad_norm": 8.681194231030531e-06, + "learning_rate": 2.9989761361676306e-06, + "loss": 0.0, + "num_input_tokens_seen": 66861616, + "step": 115245 + }, + { + "epoch": 17.165624069109324, + "grad_norm": 6.318096438917564e-06, + "learning_rate": 2.997433178921427e-06, + "loss": 0.0001, + "num_input_tokens_seen": 66864816, + "step": 115250 + }, + { + "epoch": 17.166368781650284, + "grad_norm": 5.723192771256436e-06, + "learning_rate": 2.995890593386222e-06, + "loss": 0.0, + "num_input_tokens_seen": 66867952, + "step": 115255 + }, + { + "epoch": 17.16711349419124, + "grad_norm": 2.1805088181281462e-05, + "learning_rate": 2.9943483795880854e-06, + "loss": 0.0, + "num_input_tokens_seen": 66870672, + "step": 115260 + }, + { + "epoch": 17.167858206732202, + "grad_norm": 0.000489987141918391, + "learning_rate": 2.992806537553064e-06, + "loss": 0.0044, + "num_input_tokens_seen": 66873488, + "step": 115265 + }, + { + "epoch": 17.16860291927316, + "grad_norm": 1.1274300049990416e-05, + "learning_rate": 2.9912650673072113e-06, + "loss": 0.0001, + "num_input_tokens_seen": 66876528, + "step": 115270 + }, + { + "epoch": 17.16934763181412, + "grad_norm": 2.989704444189556e-05, + "learning_rate": 2.989723968876565e-06, + "loss": 0.0, + "num_input_tokens_seen": 66879248, + "step": 115275 + }, + { + "epoch": 17.17009234435508, + "grad_norm": 2.67531031568069e-05, + "learning_rate": 2.9881832422871654e-06, + "loss": 0.0, + "num_input_tokens_seen": 66882288, + "step": 115280 + }, + { + "epoch": 17.170837056896037, + "grad_norm": 1.419628915755311e-05, + "learning_rate": 2.986642887565036e-06, + "loss": 0.0, + "num_input_tokens_seen": 66885104, + "step": 115285 + }, + { + "epoch": 17.171581769436997, + "grad_norm": 4.7384501158376224e-06, + "learning_rate": 2.9851029047362008e-06, + "loss": 0.0, + "num_input_tokens_seen": 66888016, + "step": 115290 + }, + { + "epoch": 17.172326481977958, + "grad_norm": 6.166555067466106e-06, + "learning_rate": 2.98356329382668e-06, + "loss": 0.0, + "num_input_tokens_seen": 66890800, + "step": 115295 + }, + { + "epoch": 17.173071194518915, + "grad_norm": 3.2348889362765476e-05, + "learning_rate": 2.9820240548624814e-06, + "loss": 0.0, + "num_input_tokens_seen": 66893840, + "step": 115300 + }, + { + "epoch": 17.173815907059875, + "grad_norm": 8.407696441281587e-06, + "learning_rate": 2.9804851878696054e-06, + "loss": 0.0, + "num_input_tokens_seen": 66896528, + "step": 115305 + }, + { + "epoch": 17.174560619600832, + "grad_norm": 0.001860873424448073, + "learning_rate": 2.9789466928740515e-06, + "loss": 0.0, + "num_input_tokens_seen": 66899728, + "step": 115310 + }, + { + "epoch": 17.175305332141793, + "grad_norm": 1.349667854810832e-05, + "learning_rate": 2.9774085699018158e-06, + "loss": 0.0, + "num_input_tokens_seen": 66902448, + "step": 115315 + }, + { + "epoch": 17.176050044682754, + "grad_norm": 0.0010491892462596297, + "learning_rate": 2.9758708189788736e-06, + "loss": 0.0, + "num_input_tokens_seen": 66905488, + "step": 115320 + }, + { + "epoch": 17.17679475722371, + "grad_norm": 8.416599484917242e-06, + "learning_rate": 2.9743334401312133e-06, + "loss": 0.0, + "num_input_tokens_seen": 66908336, + "step": 115325 + }, + { + "epoch": 17.17753946976467, + "grad_norm": 2.5805741188378306e-06, + "learning_rate": 2.9727964333848056e-06, + "loss": 0.0, + "num_input_tokens_seen": 66911312, + "step": 115330 + }, + { + "epoch": 17.17828418230563, + "grad_norm": 7.3603350756457075e-06, + "learning_rate": 2.9712597987656105e-06, + "loss": 0.0, + "num_input_tokens_seen": 66914384, + "step": 115335 + }, + { + "epoch": 17.17902889484659, + "grad_norm": 3.8492551539093256e-05, + "learning_rate": 2.9697235362995955e-06, + "loss": 0.0, + "num_input_tokens_seen": 66917232, + "step": 115340 + }, + { + "epoch": 17.17977360738755, + "grad_norm": 9.140175279753748e-06, + "learning_rate": 2.9681876460127073e-06, + "loss": 0.0, + "num_input_tokens_seen": 66920016, + "step": 115345 + }, + { + "epoch": 17.180518319928506, + "grad_norm": 2.1080553779029287e-05, + "learning_rate": 2.9666521279309023e-06, + "loss": 0.0, + "num_input_tokens_seen": 66922928, + "step": 115350 + }, + { + "epoch": 17.181263032469467, + "grad_norm": 4.58544946013717e-06, + "learning_rate": 2.965116982080107e-06, + "loss": 0.0, + "num_input_tokens_seen": 66925680, + "step": 115355 + }, + { + "epoch": 17.182007745010427, + "grad_norm": 1.5750836610095575e-05, + "learning_rate": 2.9635822084862737e-06, + "loss": 0.0, + "num_input_tokens_seen": 66928432, + "step": 115360 + }, + { + "epoch": 17.182752457551384, + "grad_norm": 9.796451195143163e-05, + "learning_rate": 2.9620478071753223e-06, + "loss": 0.0, + "num_input_tokens_seen": 66931280, + "step": 115365 + }, + { + "epoch": 17.183497170092345, + "grad_norm": 0.0008244001073762774, + "learning_rate": 2.9605137781731713e-06, + "loss": 0.0, + "num_input_tokens_seen": 66934128, + "step": 115370 + }, + { + "epoch": 17.184241882633305, + "grad_norm": 5.2062026952626184e-05, + "learning_rate": 2.9589801215057445e-06, + "loss": 0.0, + "num_input_tokens_seen": 66937264, + "step": 115375 + }, + { + "epoch": 17.184986595174262, + "grad_norm": 2.76526825473411e-05, + "learning_rate": 2.9574468371989378e-06, + "loss": 0.0, + "num_input_tokens_seen": 66940560, + "step": 115380 + }, + { + "epoch": 17.185731307715223, + "grad_norm": 1.3224625945440494e-05, + "learning_rate": 2.955913925278672e-06, + "loss": 0.0, + "num_input_tokens_seen": 66943280, + "step": 115385 + }, + { + "epoch": 17.18647602025618, + "grad_norm": 0.0006411499343812466, + "learning_rate": 2.95438138577083e-06, + "loss": 0.3625, + "num_input_tokens_seen": 66946256, + "step": 115390 + }, + { + "epoch": 17.18722073279714, + "grad_norm": 3.354472210048698e-05, + "learning_rate": 2.952849218701312e-06, + "loss": 0.0, + "num_input_tokens_seen": 66949136, + "step": 115395 + }, + { + "epoch": 17.1879654453381, + "grad_norm": 2.543161463108845e-05, + "learning_rate": 2.951317424095995e-06, + "loss": 0.0, + "num_input_tokens_seen": 66951696, + "step": 115400 + }, + { + "epoch": 17.188710157879058, + "grad_norm": 2.7932104785577394e-05, + "learning_rate": 2.9497860019807643e-06, + "loss": 0.0, + "num_input_tokens_seen": 66954512, + "step": 115405 + }, + { + "epoch": 17.18945487042002, + "grad_norm": 2.7722786398953758e-05, + "learning_rate": 2.948254952381491e-06, + "loss": 0.0, + "num_input_tokens_seen": 66957552, + "step": 115410 + }, + { + "epoch": 17.19019958296098, + "grad_norm": 0.0021159013267606497, + "learning_rate": 2.946724275324031e-06, + "loss": 0.0, + "num_input_tokens_seen": 66960656, + "step": 115415 + }, + { + "epoch": 17.190944295501936, + "grad_norm": 3.546710104274098e-06, + "learning_rate": 2.9451939708342564e-06, + "loss": 0.0, + "num_input_tokens_seen": 66963568, + "step": 115420 + }, + { + "epoch": 17.191689008042896, + "grad_norm": 5.91320131206885e-05, + "learning_rate": 2.9436640389380073e-06, + "loss": 0.0, + "num_input_tokens_seen": 66966416, + "step": 115425 + }, + { + "epoch": 17.192433720583853, + "grad_norm": 0.0010864654323086143, + "learning_rate": 2.9421344796611435e-06, + "loss": 0.0, + "num_input_tokens_seen": 66969360, + "step": 115430 + }, + { + "epoch": 17.193178433124814, + "grad_norm": 1.056532983056968e-05, + "learning_rate": 2.9406052930295e-06, + "loss": 0.0, + "num_input_tokens_seen": 66971984, + "step": 115435 + }, + { + "epoch": 17.193923145665774, + "grad_norm": 9.663804121373687e-06, + "learning_rate": 2.9390764790689085e-06, + "loss": 0.0, + "num_input_tokens_seen": 66974832, + "step": 115440 + }, + { + "epoch": 17.19466785820673, + "grad_norm": 0.00029762383201159537, + "learning_rate": 2.9375480378051987e-06, + "loss": 0.0, + "num_input_tokens_seen": 66978192, + "step": 115445 + }, + { + "epoch": 17.195412570747692, + "grad_norm": 6.873683560115751e-06, + "learning_rate": 2.9360199692641864e-06, + "loss": 0.0, + "num_input_tokens_seen": 66981040, + "step": 115450 + }, + { + "epoch": 17.19615728328865, + "grad_norm": 0.005273646209388971, + "learning_rate": 2.9344922734716977e-06, + "loss": 0.0, + "num_input_tokens_seen": 66983920, + "step": 115455 + }, + { + "epoch": 17.19690199582961, + "grad_norm": 9.282519386033528e-06, + "learning_rate": 2.932964950453529e-06, + "loss": 0.0, + "num_input_tokens_seen": 66986928, + "step": 115460 + }, + { + "epoch": 17.19764670837057, + "grad_norm": 258.6550598144531, + "learning_rate": 2.9314380002354953e-06, + "loss": 0.0119, + "num_input_tokens_seen": 66989648, + "step": 115465 + }, + { + "epoch": 17.198391420911527, + "grad_norm": 9.143578608927783e-06, + "learning_rate": 2.9299114228433816e-06, + "loss": 0.0, + "num_input_tokens_seen": 66992656, + "step": 115470 + }, + { + "epoch": 17.199136133452487, + "grad_norm": 1.393751608702587e-05, + "learning_rate": 2.9283852183029898e-06, + "loss": 0.0, + "num_input_tokens_seen": 66995600, + "step": 115475 + }, + { + "epoch": 17.199880845993448, + "grad_norm": 2.611060381241259e-06, + "learning_rate": 2.9268593866400907e-06, + "loss": 0.0, + "num_input_tokens_seen": 66998608, + "step": 115480 + }, + { + "epoch": 17.200625558534405, + "grad_norm": 7.035278395051137e-05, + "learning_rate": 2.9253339278804748e-06, + "loss": 0.0, + "num_input_tokens_seen": 67001424, + "step": 115485 + }, + { + "epoch": 17.201370271075366, + "grad_norm": 3.825253952527419e-05, + "learning_rate": 2.923808842049905e-06, + "loss": 0.0, + "num_input_tokens_seen": 67004208, + "step": 115490 + }, + { + "epoch": 17.202114983616323, + "grad_norm": 3.761385869438527e-06, + "learning_rate": 2.922284129174141e-06, + "loss": 0.0, + "num_input_tokens_seen": 67006896, + "step": 115495 + }, + { + "epoch": 17.202859696157283, + "grad_norm": 1.3211229088483378e-05, + "learning_rate": 2.920759789278957e-06, + "loss": 0.0, + "num_input_tokens_seen": 67009680, + "step": 115500 + }, + { + "epoch": 17.203604408698244, + "grad_norm": 2.3231375507748453e-06, + "learning_rate": 2.919235822390093e-06, + "loss": 0.0, + "num_input_tokens_seen": 67012336, + "step": 115505 + }, + { + "epoch": 17.2043491212392, + "grad_norm": 2.362122586418991e-06, + "learning_rate": 2.9177122285332982e-06, + "loss": 0.0, + "num_input_tokens_seen": 67015504, + "step": 115510 + }, + { + "epoch": 17.20509383378016, + "grad_norm": 1.100068857340375e-05, + "learning_rate": 2.9161890077343074e-06, + "loss": 0.0, + "num_input_tokens_seen": 67018256, + "step": 115515 + }, + { + "epoch": 17.20583854632112, + "grad_norm": 2.2932421416044235e-05, + "learning_rate": 2.914666160018864e-06, + "loss": 0.0, + "num_input_tokens_seen": 67021072, + "step": 115520 + }, + { + "epoch": 17.20658325886208, + "grad_norm": 4.70162149213138e-06, + "learning_rate": 2.9131436854126894e-06, + "loss": 0.0, + "num_input_tokens_seen": 67024080, + "step": 115525 + }, + { + "epoch": 17.20732797140304, + "grad_norm": 0.0009912148816511035, + "learning_rate": 2.9116215839414986e-06, + "loss": 0.0, + "num_input_tokens_seen": 67027120, + "step": 115530 + }, + { + "epoch": 17.208072683943996, + "grad_norm": 4.496160181588493e-05, + "learning_rate": 2.9100998556310153e-06, + "loss": 0.0, + "num_input_tokens_seen": 67029936, + "step": 115535 + }, + { + "epoch": 17.208817396484957, + "grad_norm": 0.00016927237447816879, + "learning_rate": 2.9085785005069394e-06, + "loss": 0.0, + "num_input_tokens_seen": 67032848, + "step": 115540 + }, + { + "epoch": 17.209562109025917, + "grad_norm": 4.058464401168749e-05, + "learning_rate": 2.907057518594983e-06, + "loss": 0.0, + "num_input_tokens_seen": 67035792, + "step": 115545 + }, + { + "epoch": 17.210306821566874, + "grad_norm": 0.00030121716554276645, + "learning_rate": 2.9055369099208306e-06, + "loss": 0.0, + "num_input_tokens_seen": 67038736, + "step": 115550 + }, + { + "epoch": 17.211051534107835, + "grad_norm": 3.954509520553984e-05, + "learning_rate": 2.904016674510179e-06, + "loss": 0.0, + "num_input_tokens_seen": 67041840, + "step": 115555 + }, + { + "epoch": 17.211796246648795, + "grad_norm": 2.8040078632329823e-06, + "learning_rate": 2.9024968123887107e-06, + "loss": 0.0, + "num_input_tokens_seen": 67044528, + "step": 115560 + }, + { + "epoch": 17.212540959189752, + "grad_norm": 7.919968993519433e-06, + "learning_rate": 2.900977323582099e-06, + "loss": 0.0, + "num_input_tokens_seen": 67047152, + "step": 115565 + }, + { + "epoch": 17.213285671730713, + "grad_norm": 3.856510011246428e-05, + "learning_rate": 2.8994582081160155e-06, + "loss": 0.0, + "num_input_tokens_seen": 67049776, + "step": 115570 + }, + { + "epoch": 17.21403038427167, + "grad_norm": 1.7916579508892028e-06, + "learning_rate": 2.897939466016117e-06, + "loss": 0.0, + "num_input_tokens_seen": 67053040, + "step": 115575 + }, + { + "epoch": 17.21477509681263, + "grad_norm": 3.25539440382272e-06, + "learning_rate": 2.8964210973080745e-06, + "loss": 0.0, + "num_input_tokens_seen": 67056016, + "step": 115580 + }, + { + "epoch": 17.21551980935359, + "grad_norm": 0.12502586841583252, + "learning_rate": 2.8949031020175264e-06, + "loss": 0.0, + "num_input_tokens_seen": 67058800, + "step": 115585 + }, + { + "epoch": 17.216264521894548, + "grad_norm": 7.285148967639543e-06, + "learning_rate": 2.89338548017013e-06, + "loss": 0.0, + "num_input_tokens_seen": 67061872, + "step": 115590 + }, + { + "epoch": 17.21700923443551, + "grad_norm": 6.165091326693073e-05, + "learning_rate": 2.8918682317915115e-06, + "loss": 0.0, + "num_input_tokens_seen": 67064848, + "step": 115595 + }, + { + "epoch": 17.217753946976465, + "grad_norm": 5.361595867725555e-06, + "learning_rate": 2.890351356907314e-06, + "loss": 0.0, + "num_input_tokens_seen": 67067472, + "step": 115600 + }, + { + "epoch": 17.218498659517426, + "grad_norm": 1.959371547854971e-05, + "learning_rate": 2.8888348555431625e-06, + "loss": 0.0, + "num_input_tokens_seen": 67070256, + "step": 115605 + }, + { + "epoch": 17.219243372058386, + "grad_norm": 3.815807758655865e-06, + "learning_rate": 2.887318727724664e-06, + "loss": 0.0, + "num_input_tokens_seen": 67073136, + "step": 115610 + }, + { + "epoch": 17.219988084599343, + "grad_norm": 3.871379522024654e-05, + "learning_rate": 2.88580297347745e-06, + "loss": 0.0, + "num_input_tokens_seen": 67076240, + "step": 115615 + }, + { + "epoch": 17.220732797140304, + "grad_norm": 4.20466712967027e-05, + "learning_rate": 2.884287592827112e-06, + "loss": 0.0, + "num_input_tokens_seen": 67079216, + "step": 115620 + }, + { + "epoch": 17.221477509681264, + "grad_norm": 2.217153678429895e-06, + "learning_rate": 2.882772585799262e-06, + "loss": 0.0, + "num_input_tokens_seen": 67082160, + "step": 115625 + }, + { + "epoch": 17.22222222222222, + "grad_norm": 1.2769864042638801e-05, + "learning_rate": 2.8812579524194916e-06, + "loss": 0.0, + "num_input_tokens_seen": 67085264, + "step": 115630 + }, + { + "epoch": 17.222966934763182, + "grad_norm": 4.212282146909274e-05, + "learning_rate": 2.879743692713388e-06, + "loss": 0.0, + "num_input_tokens_seen": 67088048, + "step": 115635 + }, + { + "epoch": 17.22371164730414, + "grad_norm": 0.0017691132379695773, + "learning_rate": 2.8782298067065256e-06, + "loss": 0.0, + "num_input_tokens_seen": 67090864, + "step": 115640 + }, + { + "epoch": 17.2244563598451, + "grad_norm": 6.561740883626044e-05, + "learning_rate": 2.8767162944244918e-06, + "loss": 0.0, + "num_input_tokens_seen": 67093712, + "step": 115645 + }, + { + "epoch": 17.22520107238606, + "grad_norm": 4.527804321696749e-06, + "learning_rate": 2.875203155892853e-06, + "loss": 0.0, + "num_input_tokens_seen": 67096400, + "step": 115650 + }, + { + "epoch": 17.225945784927017, + "grad_norm": 0.0002487925230525434, + "learning_rate": 2.8736903911371652e-06, + "loss": 0.0, + "num_input_tokens_seen": 67099280, + "step": 115655 + }, + { + "epoch": 17.226690497467978, + "grad_norm": 2.4976495751616312e-06, + "learning_rate": 2.8721780001829956e-06, + "loss": 0.0, + "num_input_tokens_seen": 67102192, + "step": 115660 + }, + { + "epoch": 17.227435210008938, + "grad_norm": 3.057494541280903e-05, + "learning_rate": 2.870665983055881e-06, + "loss": 0.0, + "num_input_tokens_seen": 67104880, + "step": 115665 + }, + { + "epoch": 17.228179922549895, + "grad_norm": 5.965301625110442e-06, + "learning_rate": 2.8691543397813824e-06, + "loss": 0.0, + "num_input_tokens_seen": 67107600, + "step": 115670 + }, + { + "epoch": 17.228924635090856, + "grad_norm": 0.000427925813710317, + "learning_rate": 2.8676430703850206e-06, + "loss": 0.0, + "num_input_tokens_seen": 67110512, + "step": 115675 + }, + { + "epoch": 17.229669347631813, + "grad_norm": 0.0001440506021026522, + "learning_rate": 2.8661321748923416e-06, + "loss": 0.0, + "num_input_tokens_seen": 67113232, + "step": 115680 + }, + { + "epoch": 17.230414060172773, + "grad_norm": 1.550867636979092e-05, + "learning_rate": 2.8646216533288556e-06, + "loss": 0.0, + "num_input_tokens_seen": 67115664, + "step": 115685 + }, + { + "epoch": 17.231158772713734, + "grad_norm": 0.00019126114784739912, + "learning_rate": 2.863111505720098e-06, + "loss": 0.0, + "num_input_tokens_seen": 67118320, + "step": 115690 + }, + { + "epoch": 17.23190348525469, + "grad_norm": 0.0005359220085665584, + "learning_rate": 2.8616017320915704e-06, + "loss": 0.0, + "num_input_tokens_seen": 67121232, + "step": 115695 + }, + { + "epoch": 17.23264819779565, + "grad_norm": 0.0005052513442933559, + "learning_rate": 2.8600923324687807e-06, + "loss": 0.0, + "num_input_tokens_seen": 67124176, + "step": 115700 + }, + { + "epoch": 17.23339291033661, + "grad_norm": 2.9307125259947497e-06, + "learning_rate": 2.85858330687723e-06, + "loss": 0.0, + "num_input_tokens_seen": 67126864, + "step": 115705 + }, + { + "epoch": 17.23413762287757, + "grad_norm": 5.247374701866647e-06, + "learning_rate": 2.8570746553424065e-06, + "loss": 0.0, + "num_input_tokens_seen": 67129520, + "step": 115710 + }, + { + "epoch": 17.23488233541853, + "grad_norm": 0.0001118254876928404, + "learning_rate": 2.8555663778898066e-06, + "loss": 0.0, + "num_input_tokens_seen": 67132624, + "step": 115715 + }, + { + "epoch": 17.235627047959486, + "grad_norm": 8.195288501156028e-06, + "learning_rate": 2.854058474544899e-06, + "loss": 0.0, + "num_input_tokens_seen": 67135952, + "step": 115720 + }, + { + "epoch": 17.236371760500447, + "grad_norm": 2.171261076000519e-05, + "learning_rate": 2.852550945333174e-06, + "loss": 0.0, + "num_input_tokens_seen": 67139376, + "step": 115725 + }, + { + "epoch": 17.237116473041407, + "grad_norm": 0.00013409025268629193, + "learning_rate": 2.851043790280089e-06, + "loss": 0.0, + "num_input_tokens_seen": 67142608, + "step": 115730 + }, + { + "epoch": 17.237861185582364, + "grad_norm": 1.7265600035898387e-05, + "learning_rate": 2.849537009411102e-06, + "loss": 0.0, + "num_input_tokens_seen": 67145392, + "step": 115735 + }, + { + "epoch": 17.238605898123325, + "grad_norm": 3.830432615359314e-05, + "learning_rate": 2.8480306027516807e-06, + "loss": 0.0, + "num_input_tokens_seen": 67148112, + "step": 115740 + }, + { + "epoch": 17.239350610664285, + "grad_norm": 0.0035192982759326696, + "learning_rate": 2.8465245703272607e-06, + "loss": 0.0, + "num_input_tokens_seen": 67150672, + "step": 115745 + }, + { + "epoch": 17.240095323205242, + "grad_norm": 5.862735179107403e-06, + "learning_rate": 2.8450189121632998e-06, + "loss": 0.001, + "num_input_tokens_seen": 67153296, + "step": 115750 + }, + { + "epoch": 17.240840035746203, + "grad_norm": 2.6066072678077035e-06, + "learning_rate": 2.8435136282852217e-06, + "loss": 0.0, + "num_input_tokens_seen": 67156336, + "step": 115755 + }, + { + "epoch": 17.24158474828716, + "grad_norm": 0.00010153920447919518, + "learning_rate": 2.842008718718467e-06, + "loss": 0.0589, + "num_input_tokens_seen": 67159344, + "step": 115760 + }, + { + "epoch": 17.24232946082812, + "grad_norm": 7.163661939557642e-05, + "learning_rate": 2.840504183488457e-06, + "loss": 0.0, + "num_input_tokens_seen": 67162448, + "step": 115765 + }, + { + "epoch": 17.24307417336908, + "grad_norm": 9.704169315227773e-06, + "learning_rate": 2.8390000226206025e-06, + "loss": 0.0, + "num_input_tokens_seen": 67165104, + "step": 115770 + }, + { + "epoch": 17.243818885910038, + "grad_norm": 0.00010703686712076887, + "learning_rate": 2.837496236140322e-06, + "loss": 0.0, + "num_input_tokens_seen": 67167920, + "step": 115775 + }, + { + "epoch": 17.244563598451, + "grad_norm": 3.7537906791840214e-06, + "learning_rate": 2.835992824073011e-06, + "loss": 0.0, + "num_input_tokens_seen": 67170544, + "step": 115780 + }, + { + "epoch": 17.245308310991955, + "grad_norm": 3.08513299387414e-05, + "learning_rate": 2.8344897864440805e-06, + "loss": 0.0, + "num_input_tokens_seen": 67173264, + "step": 115785 + }, + { + "epoch": 17.246053023532916, + "grad_norm": 1.843762947828509e-05, + "learning_rate": 2.832987123278913e-06, + "loss": 0.0, + "num_input_tokens_seen": 67176208, + "step": 115790 + }, + { + "epoch": 17.246797736073876, + "grad_norm": 3.1256392958312063e-06, + "learning_rate": 2.8314848346029017e-06, + "loss": 0.0, + "num_input_tokens_seen": 67178960, + "step": 115795 + }, + { + "epoch": 17.247542448614833, + "grad_norm": 2.763977590802824e-06, + "learning_rate": 2.829982920441421e-06, + "loss": 0.0, + "num_input_tokens_seen": 67181712, + "step": 115800 + }, + { + "epoch": 17.248287161155794, + "grad_norm": 1.984115078812465e-05, + "learning_rate": 2.8284813808198473e-06, + "loss": 0.0, + "num_input_tokens_seen": 67184528, + "step": 115805 + }, + { + "epoch": 17.249031873696755, + "grad_norm": 0.04954378306865692, + "learning_rate": 2.82698021576355e-06, + "loss": 0.1532, + "num_input_tokens_seen": 67187312, + "step": 115810 + }, + { + "epoch": 17.24977658623771, + "grad_norm": 3.117125061180559e-06, + "learning_rate": 2.825479425297878e-06, + "loss": 0.0, + "num_input_tokens_seen": 67190288, + "step": 115815 + }, + { + "epoch": 17.250521298778672, + "grad_norm": 1.8851364075089805e-05, + "learning_rate": 2.823979009448202e-06, + "loss": 0.0, + "num_input_tokens_seen": 67192880, + "step": 115820 + }, + { + "epoch": 17.25126601131963, + "grad_norm": 1.908614649437368e-05, + "learning_rate": 2.8224789682398556e-06, + "loss": 0.0, + "num_input_tokens_seen": 67195792, + "step": 115825 + }, + { + "epoch": 17.25201072386059, + "grad_norm": 7.290491339517757e-06, + "learning_rate": 2.8209793016981927e-06, + "loss": 0.0, + "num_input_tokens_seen": 67198640, + "step": 115830 + }, + { + "epoch": 17.25275543640155, + "grad_norm": 7.648253813385963e-05, + "learning_rate": 2.8194800098485407e-06, + "loss": 0.0, + "num_input_tokens_seen": 67201584, + "step": 115835 + }, + { + "epoch": 17.253500148942507, + "grad_norm": 8.106965651677456e-06, + "learning_rate": 2.817981092716232e-06, + "loss": 0.0, + "num_input_tokens_seen": 67204304, + "step": 115840 + }, + { + "epoch": 17.254244861483468, + "grad_norm": 0.00772436335682869, + "learning_rate": 2.8164825503265825e-06, + "loss": 0.0, + "num_input_tokens_seen": 67207248, + "step": 115845 + }, + { + "epoch": 17.254989574024428, + "grad_norm": 3.1211616260407027e-06, + "learning_rate": 2.8149843827049186e-06, + "loss": 0.0, + "num_input_tokens_seen": 67209872, + "step": 115850 + }, + { + "epoch": 17.255734286565385, + "grad_norm": 5.216590579948388e-06, + "learning_rate": 2.813486589876549e-06, + "loss": 0.0, + "num_input_tokens_seen": 67212656, + "step": 115855 + }, + { + "epoch": 17.256478999106346, + "grad_norm": 4.330334922997281e-05, + "learning_rate": 2.8119891718667664e-06, + "loss": 0.0, + "num_input_tokens_seen": 67215280, + "step": 115860 + }, + { + "epoch": 17.257223711647303, + "grad_norm": 5.039914321969263e-05, + "learning_rate": 2.8104921287008785e-06, + "loss": 0.0, + "num_input_tokens_seen": 67218320, + "step": 115865 + }, + { + "epoch": 17.257968424188263, + "grad_norm": 4.9453565225121565e-06, + "learning_rate": 2.8089954604041734e-06, + "loss": 0.0, + "num_input_tokens_seen": 67221104, + "step": 115870 + }, + { + "epoch": 17.258713136729224, + "grad_norm": 1.0603223017824348e-05, + "learning_rate": 2.807499167001937e-06, + "loss": 0.0, + "num_input_tokens_seen": 67223824, + "step": 115875 + }, + { + "epoch": 17.25945784927018, + "grad_norm": 6.607799150515348e-06, + "learning_rate": 2.8060032485194453e-06, + "loss": 0.0, + "num_input_tokens_seen": 67226576, + "step": 115880 + }, + { + "epoch": 17.26020256181114, + "grad_norm": 5.364728349377401e-05, + "learning_rate": 2.8045077049819733e-06, + "loss": 0.0, + "num_input_tokens_seen": 67229168, + "step": 115885 + }, + { + "epoch": 17.2609472743521, + "grad_norm": 0.0012973649427294731, + "learning_rate": 2.8030125364147868e-06, + "loss": 0.0, + "num_input_tokens_seen": 67232112, + "step": 115890 + }, + { + "epoch": 17.26169198689306, + "grad_norm": 0.001835253438912332, + "learning_rate": 2.8015177428431433e-06, + "loss": 0.0, + "num_input_tokens_seen": 67235120, + "step": 115895 + }, + { + "epoch": 17.26243669943402, + "grad_norm": 3.113653065156541e-06, + "learning_rate": 2.8000233242922973e-06, + "loss": 0.0, + "num_input_tokens_seen": 67238160, + "step": 115900 + }, + { + "epoch": 17.263181411974976, + "grad_norm": 7.081071089487523e-06, + "learning_rate": 2.7985292807874873e-06, + "loss": 0.0, + "num_input_tokens_seen": 67240688, + "step": 115905 + }, + { + "epoch": 17.263926124515937, + "grad_norm": 4.2122253944398835e-05, + "learning_rate": 2.797035612353968e-06, + "loss": 0.0, + "num_input_tokens_seen": 67243248, + "step": 115910 + }, + { + "epoch": 17.264670837056897, + "grad_norm": 3.7209115362202283e-06, + "learning_rate": 2.7955423190169585e-06, + "loss": 0.0, + "num_input_tokens_seen": 67246256, + "step": 115915 + }, + { + "epoch": 17.265415549597854, + "grad_norm": 3.373391791683389e-06, + "learning_rate": 2.794049400801699e-06, + "loss": 0.0, + "num_input_tokens_seen": 67249072, + "step": 115920 + }, + { + "epoch": 17.266160262138815, + "grad_norm": 4.9218291678698733e-05, + "learning_rate": 2.792556857733403e-06, + "loss": 0.0, + "num_input_tokens_seen": 67251728, + "step": 115925 + }, + { + "epoch": 17.266904974679775, + "grad_norm": 3.2456243843626e-06, + "learning_rate": 2.7910646898372916e-06, + "loss": 0.0, + "num_input_tokens_seen": 67254480, + "step": 115930 + }, + { + "epoch": 17.267649687220732, + "grad_norm": 2.1065702640044037e-06, + "learning_rate": 2.7895728971385706e-06, + "loss": 0.0, + "num_input_tokens_seen": 67257456, + "step": 115935 + }, + { + "epoch": 17.268394399761693, + "grad_norm": 2.102799953718204e-06, + "learning_rate": 2.7880814796624355e-06, + "loss": 0.0, + "num_input_tokens_seen": 67260304, + "step": 115940 + }, + { + "epoch": 17.26913911230265, + "grad_norm": 0.0013541047228500247, + "learning_rate": 2.7865904374340947e-06, + "loss": 0.0, + "num_input_tokens_seen": 67263376, + "step": 115945 + }, + { + "epoch": 17.26988382484361, + "grad_norm": 0.000529614626429975, + "learning_rate": 2.7850997704787244e-06, + "loss": 0.0, + "num_input_tokens_seen": 67266256, + "step": 115950 + }, + { + "epoch": 17.27062853738457, + "grad_norm": 8.761310891713947e-05, + "learning_rate": 2.783609478821525e-06, + "loss": 0.0429, + "num_input_tokens_seen": 67269072, + "step": 115955 + }, + { + "epoch": 17.271373249925528, + "grad_norm": 0.004614043980836868, + "learning_rate": 2.782119562487662e-06, + "loss": 0.0, + "num_input_tokens_seen": 67271920, + "step": 115960 + }, + { + "epoch": 17.27211796246649, + "grad_norm": 0.0006172812427394092, + "learning_rate": 2.7806300215023063e-06, + "loss": 0.0, + "num_input_tokens_seen": 67275088, + "step": 115965 + }, + { + "epoch": 17.272862675007445, + "grad_norm": 0.0010625323047861457, + "learning_rate": 2.7791408558906245e-06, + "loss": 0.0, + "num_input_tokens_seen": 67278064, + "step": 115970 + }, + { + "epoch": 17.273607387548406, + "grad_norm": 5.397265704232268e-05, + "learning_rate": 2.777652065677766e-06, + "loss": 0.0, + "num_input_tokens_seen": 67280816, + "step": 115975 + }, + { + "epoch": 17.274352100089367, + "grad_norm": 9.253302778233774e-06, + "learning_rate": 2.7761636508888995e-06, + "loss": 0.0, + "num_input_tokens_seen": 67283792, + "step": 115980 + }, + { + "epoch": 17.275096812630323, + "grad_norm": 7.0098699325171765e-06, + "learning_rate": 2.774675611549152e-06, + "loss": 0.0, + "num_input_tokens_seen": 67286672, + "step": 115985 + }, + { + "epoch": 17.275841525171284, + "grad_norm": 2.6639750103640836e-06, + "learning_rate": 2.773187947683678e-06, + "loss": 0.0, + "num_input_tokens_seen": 67289552, + "step": 115990 + }, + { + "epoch": 17.276586237712245, + "grad_norm": 0.002250456949695945, + "learning_rate": 2.7717006593175997e-06, + "loss": 0.0, + "num_input_tokens_seen": 67292336, + "step": 115995 + }, + { + "epoch": 17.2773309502532, + "grad_norm": 4.18826166423969e-05, + "learning_rate": 2.7702137464760497e-06, + "loss": 0.0, + "num_input_tokens_seen": 67295088, + "step": 116000 + }, + { + "epoch": 17.278075662794162, + "grad_norm": 3.35216500388924e-05, + "learning_rate": 2.768727209184141e-06, + "loss": 0.0, + "num_input_tokens_seen": 67298096, + "step": 116005 + }, + { + "epoch": 17.27882037533512, + "grad_norm": 1.0964648936351296e-05, + "learning_rate": 2.767241047466998e-06, + "loss": 0.0, + "num_input_tokens_seen": 67301200, + "step": 116010 + }, + { + "epoch": 17.27956508787608, + "grad_norm": 2.0829581899306504e-06, + "learning_rate": 2.765755261349717e-06, + "loss": 0.0, + "num_input_tokens_seen": 67304432, + "step": 116015 + }, + { + "epoch": 17.28030980041704, + "grad_norm": 7.84862641012296e-06, + "learning_rate": 2.764269850857401e-06, + "loss": 0.0, + "num_input_tokens_seen": 67307312, + "step": 116020 + }, + { + "epoch": 17.281054512957997, + "grad_norm": 0.0004236296226736158, + "learning_rate": 2.7627848160151513e-06, + "loss": 0.0, + "num_input_tokens_seen": 67310416, + "step": 116025 + }, + { + "epoch": 17.281799225498958, + "grad_norm": 1.1320918929413892e-05, + "learning_rate": 2.7613001568480514e-06, + "loss": 0.0, + "num_input_tokens_seen": 67313328, + "step": 116030 + }, + { + "epoch": 17.282543938039918, + "grad_norm": 1.303299632127164e-05, + "learning_rate": 2.759815873381183e-06, + "loss": 0.0, + "num_input_tokens_seen": 67316304, + "step": 116035 + }, + { + "epoch": 17.283288650580875, + "grad_norm": 3.6738331345986808e-06, + "learning_rate": 2.7583319656396155e-06, + "loss": 0.0, + "num_input_tokens_seen": 67318928, + "step": 116040 + }, + { + "epoch": 17.284033363121836, + "grad_norm": 0.00043989764526486397, + "learning_rate": 2.756848433648429e-06, + "loss": 0.0, + "num_input_tokens_seen": 67321968, + "step": 116045 + }, + { + "epoch": 17.284778075662793, + "grad_norm": 1.7134461813839152e-05, + "learning_rate": 2.755365277432681e-06, + "loss": 0.0, + "num_input_tokens_seen": 67324976, + "step": 116050 + }, + { + "epoch": 17.285522788203753, + "grad_norm": 1.233332386618713e-05, + "learning_rate": 2.753882497017424e-06, + "loss": 0.0, + "num_input_tokens_seen": 67327856, + "step": 116055 + }, + { + "epoch": 17.286267500744714, + "grad_norm": 3.687172011268558e-06, + "learning_rate": 2.7524000924277178e-06, + "loss": 0.0, + "num_input_tokens_seen": 67330640, + "step": 116060 + }, + { + "epoch": 17.28701221328567, + "grad_norm": 8.327103569172323e-05, + "learning_rate": 2.7509180636885927e-06, + "loss": 0.0, + "num_input_tokens_seen": 67333360, + "step": 116065 + }, + { + "epoch": 17.28775692582663, + "grad_norm": 1.1305533917038701e-05, + "learning_rate": 2.7494364108251016e-06, + "loss": 0.0, + "num_input_tokens_seen": 67336368, + "step": 116070 + }, + { + "epoch": 17.288501638367592, + "grad_norm": 5.555238385568373e-05, + "learning_rate": 2.747955133862262e-06, + "loss": 0.0, + "num_input_tokens_seen": 67338992, + "step": 116075 + }, + { + "epoch": 17.28924635090855, + "grad_norm": 2.5368174192408333e-06, + "learning_rate": 2.746474232825111e-06, + "loss": 0.0, + "num_input_tokens_seen": 67341680, + "step": 116080 + }, + { + "epoch": 17.28999106344951, + "grad_norm": 3.453641329542734e-05, + "learning_rate": 2.744993707738655e-06, + "loss": 0.0, + "num_input_tokens_seen": 67344688, + "step": 116085 + }, + { + "epoch": 17.290735775990466, + "grad_norm": 6.407035471056588e-06, + "learning_rate": 2.7435135586279165e-06, + "loss": 0.0, + "num_input_tokens_seen": 67347600, + "step": 116090 + }, + { + "epoch": 17.291480488531427, + "grad_norm": 1.1162854207213968e-05, + "learning_rate": 2.7420337855178944e-06, + "loss": 0.0, + "num_input_tokens_seen": 67350544, + "step": 116095 + }, + { + "epoch": 17.292225201072387, + "grad_norm": 5.253410108707612e-06, + "learning_rate": 2.7405543884335887e-06, + "loss": 0.0, + "num_input_tokens_seen": 67353680, + "step": 116100 + }, + { + "epoch": 17.292969913613344, + "grad_norm": 4.549644472717773e-06, + "learning_rate": 2.739075367399996e-06, + "loss": 0.0, + "num_input_tokens_seen": 67356432, + "step": 116105 + }, + { + "epoch": 17.293714626154305, + "grad_norm": 1.071702081389958e-05, + "learning_rate": 2.7375967224420928e-06, + "loss": 0.0002, + "num_input_tokens_seen": 67359120, + "step": 116110 + }, + { + "epoch": 17.294459338695262, + "grad_norm": 4.941099632560508e-06, + "learning_rate": 2.736118453584871e-06, + "loss": 0.0, + "num_input_tokens_seen": 67361776, + "step": 116115 + }, + { + "epoch": 17.295204051236222, + "grad_norm": 1.5799661923665553e-05, + "learning_rate": 2.7346405608532965e-06, + "loss": 0.0, + "num_input_tokens_seen": 67364560, + "step": 116120 + }, + { + "epoch": 17.295948763777183, + "grad_norm": 4.50691595688113e-06, + "learning_rate": 2.7331630442723466e-06, + "loss": 0.0, + "num_input_tokens_seen": 67367664, + "step": 116125 + }, + { + "epoch": 17.29669347631814, + "grad_norm": 4.0544578951084986e-05, + "learning_rate": 2.7316859038669736e-06, + "loss": 0.0, + "num_input_tokens_seen": 67370576, + "step": 116130 + }, + { + "epoch": 17.2974381888591, + "grad_norm": 2.496061824786011e-06, + "learning_rate": 2.7302091396621294e-06, + "loss": 0.0, + "num_input_tokens_seen": 67373488, + "step": 116135 + }, + { + "epoch": 17.29818290140006, + "grad_norm": 4.009019448858453e-06, + "learning_rate": 2.7287327516827748e-06, + "loss": 0.0, + "num_input_tokens_seen": 67376272, + "step": 116140 + }, + { + "epoch": 17.298927613941018, + "grad_norm": 2.985752871609293e-06, + "learning_rate": 2.7272567399538375e-06, + "loss": 0.0, + "num_input_tokens_seen": 67379504, + "step": 116145 + }, + { + "epoch": 17.29967232648198, + "grad_norm": 0.0017961988924071193, + "learning_rate": 2.725781104500269e-06, + "loss": 0.0, + "num_input_tokens_seen": 67382640, + "step": 116150 + }, + { + "epoch": 17.300417039022935, + "grad_norm": 3.136192026431672e-05, + "learning_rate": 2.7243058453469835e-06, + "loss": 0.0, + "num_input_tokens_seen": 67385200, + "step": 116155 + }, + { + "epoch": 17.301161751563896, + "grad_norm": 0.00366851594299078, + "learning_rate": 2.722830962518913e-06, + "loss": 0.0, + "num_input_tokens_seen": 67388208, + "step": 116160 + }, + { + "epoch": 17.301906464104857, + "grad_norm": 7.715511856076773e-06, + "learning_rate": 2.7213564560409743e-06, + "loss": 0.0, + "num_input_tokens_seen": 67390768, + "step": 116165 + }, + { + "epoch": 17.302651176645814, + "grad_norm": 1.0166811080125626e-05, + "learning_rate": 2.7198823259380777e-06, + "loss": 0.0, + "num_input_tokens_seen": 67393808, + "step": 116170 + }, + { + "epoch": 17.303395889186774, + "grad_norm": 2.0512811715889256e-06, + "learning_rate": 2.7184085722351205e-06, + "loss": 0.0, + "num_input_tokens_seen": 67396624, + "step": 116175 + }, + { + "epoch": 17.304140601727735, + "grad_norm": 3.275338940511574e-06, + "learning_rate": 2.7169351949570017e-06, + "loss": 0.0, + "num_input_tokens_seen": 67399696, + "step": 116180 + }, + { + "epoch": 17.30488531426869, + "grad_norm": 0.0006329136667773128, + "learning_rate": 2.7154621941286206e-06, + "loss": 0.0, + "num_input_tokens_seen": 67402736, + "step": 116185 + }, + { + "epoch": 17.305630026809652, + "grad_norm": 9.026793850352988e-05, + "learning_rate": 2.7139895697748496e-06, + "loss": 0.0, + "num_input_tokens_seen": 67405744, + "step": 116190 + }, + { + "epoch": 17.30637473935061, + "grad_norm": 6.996643787715584e-05, + "learning_rate": 2.7125173219205824e-06, + "loss": 0.0, + "num_input_tokens_seen": 67408624, + "step": 116195 + }, + { + "epoch": 17.30711945189157, + "grad_norm": 1.6972164303297177e-05, + "learning_rate": 2.711045450590677e-06, + "loss": 0.0, + "num_input_tokens_seen": 67411312, + "step": 116200 + }, + { + "epoch": 17.30786416443253, + "grad_norm": 2.6725115276349243e-06, + "learning_rate": 2.7095739558100074e-06, + "loss": 0.0, + "num_input_tokens_seen": 67414352, + "step": 116205 + }, + { + "epoch": 17.308608876973487, + "grad_norm": 2.7955851692240685e-05, + "learning_rate": 2.708102837603435e-06, + "loss": 0.0, + "num_input_tokens_seen": 67417008, + "step": 116210 + }, + { + "epoch": 17.309353589514448, + "grad_norm": 0.00013858302554581314, + "learning_rate": 2.706632095995801e-06, + "loss": 0.0, + "num_input_tokens_seen": 67420176, + "step": 116215 + }, + { + "epoch": 17.31009830205541, + "grad_norm": 8.91011004569009e-05, + "learning_rate": 2.7051617310119653e-06, + "loss": 0.0, + "num_input_tokens_seen": 67422992, + "step": 116220 + }, + { + "epoch": 17.310843014596365, + "grad_norm": 0.0013924642698839307, + "learning_rate": 2.7036917426767615e-06, + "loss": 0.0, + "num_input_tokens_seen": 67425648, + "step": 116225 + }, + { + "epoch": 17.311587727137326, + "grad_norm": 7.3474579949106555e-06, + "learning_rate": 2.702222131015028e-06, + "loss": 0.0, + "num_input_tokens_seen": 67428624, + "step": 116230 + }, + { + "epoch": 17.312332439678283, + "grad_norm": 0.0008454612689092755, + "learning_rate": 2.700752896051581e-06, + "loss": 0.0, + "num_input_tokens_seen": 67431632, + "step": 116235 + }, + { + "epoch": 17.313077152219243, + "grad_norm": 0.00012597764725796878, + "learning_rate": 2.699284037811256e-06, + "loss": 0.0, + "num_input_tokens_seen": 67434608, + "step": 116240 + }, + { + "epoch": 17.313821864760204, + "grad_norm": 9.742614565766416e-06, + "learning_rate": 2.6978155563188583e-06, + "loss": 0.0, + "num_input_tokens_seen": 67437936, + "step": 116245 + }, + { + "epoch": 17.31456657730116, + "grad_norm": 0.0005644919583573937, + "learning_rate": 2.6963474515992044e-06, + "loss": 0.0, + "num_input_tokens_seen": 67441136, + "step": 116250 + }, + { + "epoch": 17.31531128984212, + "grad_norm": 9.966411198547576e-06, + "learning_rate": 2.6948797236770907e-06, + "loss": 0.0, + "num_input_tokens_seen": 67443888, + "step": 116255 + }, + { + "epoch": 17.316056002383082, + "grad_norm": 1.0093555829371326e-05, + "learning_rate": 2.6934123725773088e-06, + "loss": 0.0, + "num_input_tokens_seen": 67446576, + "step": 116260 + }, + { + "epoch": 17.31680071492404, + "grad_norm": 3.6456979159993352e-06, + "learning_rate": 2.6919453983246577e-06, + "loss": 0.0, + "num_input_tokens_seen": 67449552, + "step": 116265 + }, + { + "epoch": 17.317545427465, + "grad_norm": 4.258609624230303e-05, + "learning_rate": 2.690478800943913e-06, + "loss": 0.0, + "num_input_tokens_seen": 67452368, + "step": 116270 + }, + { + "epoch": 17.318290140005956, + "grad_norm": 1.1358813026163261e-05, + "learning_rate": 2.68901258045986e-06, + "loss": 0.0, + "num_input_tokens_seen": 67455216, + "step": 116275 + }, + { + "epoch": 17.319034852546917, + "grad_norm": 2.1154541173018515e-05, + "learning_rate": 2.6875467368972563e-06, + "loss": 0.0, + "num_input_tokens_seen": 67457904, + "step": 116280 + }, + { + "epoch": 17.319779565087877, + "grad_norm": 0.00031350363860838115, + "learning_rate": 2.6860812702808795e-06, + "loss": 0.0, + "num_input_tokens_seen": 67460976, + "step": 116285 + }, + { + "epoch": 17.320524277628834, + "grad_norm": 2.628403490234632e-05, + "learning_rate": 2.6846161806354826e-06, + "loss": 0.0, + "num_input_tokens_seen": 67463888, + "step": 116290 + }, + { + "epoch": 17.321268990169795, + "grad_norm": 2.458952621964272e-06, + "learning_rate": 2.6831514679858115e-06, + "loss": 0.0, + "num_input_tokens_seen": 67466864, + "step": 116295 + }, + { + "epoch": 17.322013702710752, + "grad_norm": 3.999493856099434e-06, + "learning_rate": 2.6816871323566165e-06, + "loss": 0.0, + "num_input_tokens_seen": 67469936, + "step": 116300 + }, + { + "epoch": 17.322758415251712, + "grad_norm": 2.4316090275533497e-05, + "learning_rate": 2.6802231737726307e-06, + "loss": 0.0, + "num_input_tokens_seen": 67472752, + "step": 116305 + }, + { + "epoch": 17.323503127792673, + "grad_norm": 9.949380000762176e-06, + "learning_rate": 2.6787595922585924e-06, + "loss": 0.0, + "num_input_tokens_seen": 67475856, + "step": 116310 + }, + { + "epoch": 17.32424784033363, + "grad_norm": 1.2127031368436292e-05, + "learning_rate": 2.6772963878392177e-06, + "loss": 0.0, + "num_input_tokens_seen": 67478736, + "step": 116315 + }, + { + "epoch": 17.32499255287459, + "grad_norm": 4.325062491261633e-06, + "learning_rate": 2.67583356053924e-06, + "loss": 0.0, + "num_input_tokens_seen": 67481584, + "step": 116320 + }, + { + "epoch": 17.32573726541555, + "grad_norm": 8.028969205042813e-06, + "learning_rate": 2.6743711103833614e-06, + "loss": 0.0, + "num_input_tokens_seen": 67484240, + "step": 116325 + }, + { + "epoch": 17.326481977956508, + "grad_norm": 2.846717279680888e-06, + "learning_rate": 2.6729090373962957e-06, + "loss": 0.0, + "num_input_tokens_seen": 67487216, + "step": 116330 + }, + { + "epoch": 17.32722669049747, + "grad_norm": 7.237283716676757e-05, + "learning_rate": 2.67144734160274e-06, + "loss": 0.0, + "num_input_tokens_seen": 67489776, + "step": 116335 + }, + { + "epoch": 17.327971403038426, + "grad_norm": 3.7938607420073822e-06, + "learning_rate": 2.669986023027382e-06, + "loss": 0.0, + "num_input_tokens_seen": 67492592, + "step": 116340 + }, + { + "epoch": 17.328716115579386, + "grad_norm": 5.01059002999682e-05, + "learning_rate": 2.6685250816949196e-06, + "loss": 0.0, + "num_input_tokens_seen": 67495920, + "step": 116345 + }, + { + "epoch": 17.329460828120347, + "grad_norm": 4.377513505460229e-06, + "learning_rate": 2.6670645176300246e-06, + "loss": 0.0, + "num_input_tokens_seen": 67498864, + "step": 116350 + }, + { + "epoch": 17.330205540661304, + "grad_norm": 3.0382673230633372e-06, + "learning_rate": 2.6656043308573826e-06, + "loss": 0.0, + "num_input_tokens_seen": 67501584, + "step": 116355 + }, + { + "epoch": 17.330950253202264, + "grad_norm": 1.904272176034283e-05, + "learning_rate": 2.664144521401654e-06, + "loss": 0.0, + "num_input_tokens_seen": 67504624, + "step": 116360 + }, + { + "epoch": 17.331694965743225, + "grad_norm": 1.6321117072948255e-05, + "learning_rate": 2.6626850892875033e-06, + "loss": 0.0, + "num_input_tokens_seen": 67507504, + "step": 116365 + }, + { + "epoch": 17.33243967828418, + "grad_norm": 4.89517242385773e-06, + "learning_rate": 2.6612260345395797e-06, + "loss": 0.0, + "num_input_tokens_seen": 67510320, + "step": 116370 + }, + { + "epoch": 17.333184390825142, + "grad_norm": 7.735102371952962e-06, + "learning_rate": 2.6597673571825436e-06, + "loss": 0.0, + "num_input_tokens_seen": 67513008, + "step": 116375 + }, + { + "epoch": 17.3339291033661, + "grad_norm": 4.054459623148432e-06, + "learning_rate": 2.658309057241032e-06, + "loss": 0.0, + "num_input_tokens_seen": 67515760, + "step": 116380 + }, + { + "epoch": 17.33467381590706, + "grad_norm": 8.065495421760716e-06, + "learning_rate": 2.6568511347396795e-06, + "loss": 0.0, + "num_input_tokens_seen": 67518608, + "step": 116385 + }, + { + "epoch": 17.33541852844802, + "grad_norm": 5.126320320414379e-05, + "learning_rate": 2.6553935897031203e-06, + "loss": 0.0, + "num_input_tokens_seen": 67521424, + "step": 116390 + }, + { + "epoch": 17.336163240988977, + "grad_norm": 4.787796569871716e-05, + "learning_rate": 2.6539364221559725e-06, + "loss": 0.0, + "num_input_tokens_seen": 67524272, + "step": 116395 + }, + { + "epoch": 17.336907953529938, + "grad_norm": 0.001572884968481958, + "learning_rate": 2.652479632122862e-06, + "loss": 0.0, + "num_input_tokens_seen": 67526800, + "step": 116400 + }, + { + "epoch": 17.3376526660709, + "grad_norm": 1.1017782526323572e-05, + "learning_rate": 2.65102321962839e-06, + "loss": 0.0, + "num_input_tokens_seen": 67530224, + "step": 116405 + }, + { + "epoch": 17.338397378611855, + "grad_norm": 0.00037234267801977694, + "learning_rate": 2.6495671846971716e-06, + "loss": 0.0, + "num_input_tokens_seen": 67533136, + "step": 116410 + }, + { + "epoch": 17.339142091152816, + "grad_norm": 0.00017448193102609366, + "learning_rate": 2.6481115273538e-06, + "loss": 0.0, + "num_input_tokens_seen": 67536112, + "step": 116415 + }, + { + "epoch": 17.339886803693773, + "grad_norm": 0.0001768818765413016, + "learning_rate": 2.6466562476228612e-06, + "loss": 0.0, + "num_input_tokens_seen": 67539024, + "step": 116420 + }, + { + "epoch": 17.340631516234733, + "grad_norm": 0.00014378561172634363, + "learning_rate": 2.645201345528953e-06, + "loss": 0.0, + "num_input_tokens_seen": 67542256, + "step": 116425 + }, + { + "epoch": 17.341376228775694, + "grad_norm": 1.6434189092251472e-05, + "learning_rate": 2.643746821096646e-06, + "loss": 0.0, + "num_input_tokens_seen": 67545232, + "step": 116430 + }, + { + "epoch": 17.34212094131665, + "grad_norm": 1.3694410881726071e-05, + "learning_rate": 2.6422926743505132e-06, + "loss": 0.0, + "num_input_tokens_seen": 67547856, + "step": 116435 + }, + { + "epoch": 17.34286565385761, + "grad_norm": 1.7647504137130454e-05, + "learning_rate": 2.6408389053151185e-06, + "loss": 0.0, + "num_input_tokens_seen": 67550832, + "step": 116440 + }, + { + "epoch": 17.343610366398572, + "grad_norm": 2.4899829895730363e-06, + "learning_rate": 2.6393855140150304e-06, + "loss": 0.0, + "num_input_tokens_seen": 67553872, + "step": 116445 + }, + { + "epoch": 17.34435507893953, + "grad_norm": 6.051753189240117e-06, + "learning_rate": 2.6379325004747937e-06, + "loss": 0.0132, + "num_input_tokens_seen": 67556912, + "step": 116450 + }, + { + "epoch": 17.34509979148049, + "grad_norm": 4.0016552702581976e-06, + "learning_rate": 2.636479864718966e-06, + "loss": 0.0, + "num_input_tokens_seen": 67559760, + "step": 116455 + }, + { + "epoch": 17.345844504021446, + "grad_norm": 2.3712205802439712e-05, + "learning_rate": 2.635027606772078e-06, + "loss": 0.0, + "num_input_tokens_seen": 67562640, + "step": 116460 + }, + { + "epoch": 17.346589216562407, + "grad_norm": 1.0057516192318872e-05, + "learning_rate": 2.633575726658666e-06, + "loss": 0.0, + "num_input_tokens_seen": 67565616, + "step": 116465 + }, + { + "epoch": 17.347333929103367, + "grad_norm": 5.818592035211623e-05, + "learning_rate": 2.632124224403262e-06, + "loss": 0.0, + "num_input_tokens_seen": 67568560, + "step": 116470 + }, + { + "epoch": 17.348078641644324, + "grad_norm": 5.0057528824254405e-06, + "learning_rate": 2.6306731000303842e-06, + "loss": 0.0, + "num_input_tokens_seen": 67571440, + "step": 116475 + }, + { + "epoch": 17.348823354185285, + "grad_norm": 1.5112173059605993e-05, + "learning_rate": 2.6292223535645507e-06, + "loss": 0.0, + "num_input_tokens_seen": 67574544, + "step": 116480 + }, + { + "epoch": 17.349568066726242, + "grad_norm": 6.777801172574982e-05, + "learning_rate": 2.627771985030264e-06, + "loss": 0.0, + "num_input_tokens_seen": 67577264, + "step": 116485 + }, + { + "epoch": 17.350312779267203, + "grad_norm": 0.0006202124059200287, + "learning_rate": 2.6263219944520383e-06, + "loss": 0.0, + "num_input_tokens_seen": 67579952, + "step": 116490 + }, + { + "epoch": 17.351057491808163, + "grad_norm": 8.91132076503709e-05, + "learning_rate": 2.6248723818543625e-06, + "loss": 0.0, + "num_input_tokens_seen": 67582864, + "step": 116495 + }, + { + "epoch": 17.35180220434912, + "grad_norm": 0.00010589989688014612, + "learning_rate": 2.6234231472617276e-06, + "loss": 0.0, + "num_input_tokens_seen": 67585680, + "step": 116500 + }, + { + "epoch": 17.35254691689008, + "grad_norm": 2.846681354640168e-06, + "learning_rate": 2.6219742906986143e-06, + "loss": 0.0, + "num_input_tokens_seen": 67588528, + "step": 116505 + }, + { + "epoch": 17.35329162943104, + "grad_norm": 5.796539335278794e-05, + "learning_rate": 2.6205258121894976e-06, + "loss": 0.0, + "num_input_tokens_seen": 67591152, + "step": 116510 + }, + { + "epoch": 17.354036341971998, + "grad_norm": 7.553302111773519e-06, + "learning_rate": 2.619077711758858e-06, + "loss": 0.0, + "num_input_tokens_seen": 67594128, + "step": 116515 + }, + { + "epoch": 17.35478105451296, + "grad_norm": 5.146098374098074e-06, + "learning_rate": 2.6176299894311444e-06, + "loss": 0.0, + "num_input_tokens_seen": 67597008, + "step": 116520 + }, + { + "epoch": 17.355525767053916, + "grad_norm": 1.3240980479167774e-05, + "learning_rate": 2.616182645230833e-06, + "loss": 0.0, + "num_input_tokens_seen": 67599792, + "step": 116525 + }, + { + "epoch": 17.356270479594876, + "grad_norm": 1.9320765204611234e-05, + "learning_rate": 2.614735679182359e-06, + "loss": 0.0, + "num_input_tokens_seen": 67602832, + "step": 116530 + }, + { + "epoch": 17.357015192135837, + "grad_norm": 2.44739385379944e-06, + "learning_rate": 2.6132890913101783e-06, + "loss": 0.0, + "num_input_tokens_seen": 67605552, + "step": 116535 + }, + { + "epoch": 17.357759904676794, + "grad_norm": 6.570041023223894e-06, + "learning_rate": 2.6118428816387265e-06, + "loss": 0.0, + "num_input_tokens_seen": 67608368, + "step": 116540 + }, + { + "epoch": 17.358504617217754, + "grad_norm": 5.577810952672735e-06, + "learning_rate": 2.610397050192431e-06, + "loss": 0.0089, + "num_input_tokens_seen": 67611248, + "step": 116545 + }, + { + "epoch": 17.359249329758715, + "grad_norm": 9.126108125201426e-06, + "learning_rate": 2.6089515969957263e-06, + "loss": 0.0, + "num_input_tokens_seen": 67613872, + "step": 116550 + }, + { + "epoch": 17.35999404229967, + "grad_norm": 2.2427570911531802e-06, + "learning_rate": 2.607506522073025e-06, + "loss": 0.0, + "num_input_tokens_seen": 67616752, + "step": 116555 + }, + { + "epoch": 17.360738754840632, + "grad_norm": 5.9002081798098516e-06, + "learning_rate": 2.606061825448744e-06, + "loss": 0.0, + "num_input_tokens_seen": 67619536, + "step": 116560 + }, + { + "epoch": 17.36148346738159, + "grad_norm": 3.523382474668324e-05, + "learning_rate": 2.6046175071472835e-06, + "loss": 0.0, + "num_input_tokens_seen": 67622288, + "step": 116565 + }, + { + "epoch": 17.36222817992255, + "grad_norm": 2.2292470021056943e-06, + "learning_rate": 2.603173567193054e-06, + "loss": 0.0, + "num_input_tokens_seen": 67625136, + "step": 116570 + }, + { + "epoch": 17.36297289246351, + "grad_norm": 8.900758075469639e-06, + "learning_rate": 2.601730005610442e-06, + "loss": 0.0, + "num_input_tokens_seen": 67627824, + "step": 116575 + }, + { + "epoch": 17.363717605004467, + "grad_norm": 1.0191366527578793e-05, + "learning_rate": 2.6002868224238334e-06, + "loss": 0.0, + "num_input_tokens_seen": 67630480, + "step": 116580 + }, + { + "epoch": 17.364462317545428, + "grad_norm": 2.28199041885091e-06, + "learning_rate": 2.5988440176576172e-06, + "loss": 0.0, + "num_input_tokens_seen": 67633328, + "step": 116585 + }, + { + "epoch": 17.36520703008639, + "grad_norm": 3.7110601169842994e-06, + "learning_rate": 2.5974015913361597e-06, + "loss": 0.0, + "num_input_tokens_seen": 67636112, + "step": 116590 + }, + { + "epoch": 17.365951742627345, + "grad_norm": 1.6691879864083603e-05, + "learning_rate": 2.5959595434838363e-06, + "loss": 0.0, + "num_input_tokens_seen": 67638960, + "step": 116595 + }, + { + "epoch": 17.366696455168306, + "grad_norm": 2.6356390208093217e-06, + "learning_rate": 2.594517874125005e-06, + "loss": 0.0, + "num_input_tokens_seen": 67642064, + "step": 116600 + }, + { + "epoch": 17.367441167709263, + "grad_norm": 6.42675076960586e-05, + "learning_rate": 2.5930765832840238e-06, + "loss": 0.0, + "num_input_tokens_seen": 67645264, + "step": 116605 + }, + { + "epoch": 17.368185880250223, + "grad_norm": 3.47842205883353e-06, + "learning_rate": 2.5916356709852373e-06, + "loss": 0.0, + "num_input_tokens_seen": 67648272, + "step": 116610 + }, + { + "epoch": 17.368930592791184, + "grad_norm": 4.173337401880417e-06, + "learning_rate": 2.5901951372529933e-06, + "loss": 0.0, + "num_input_tokens_seen": 67650960, + "step": 116615 + }, + { + "epoch": 17.36967530533214, + "grad_norm": 1.660086309129838e-05, + "learning_rate": 2.5887549821116297e-06, + "loss": 0.0, + "num_input_tokens_seen": 67653968, + "step": 116620 + }, + { + "epoch": 17.3704200178731, + "grad_norm": 2.2147669369587675e-05, + "learning_rate": 2.5873152055854694e-06, + "loss": 0.0, + "num_input_tokens_seen": 67656592, + "step": 116625 + }, + { + "epoch": 17.37116473041406, + "grad_norm": 3.306508369860239e-05, + "learning_rate": 2.5858758076988425e-06, + "loss": 0.0, + "num_input_tokens_seen": 67659696, + "step": 116630 + }, + { + "epoch": 17.37190944295502, + "grad_norm": 0.0001474346936447546, + "learning_rate": 2.5844367884760577e-06, + "loss": 0.0, + "num_input_tokens_seen": 67662672, + "step": 116635 + }, + { + "epoch": 17.37265415549598, + "grad_norm": 0.0003314999339636415, + "learning_rate": 2.5829981479414346e-06, + "loss": 0.0, + "num_input_tokens_seen": 67665584, + "step": 116640 + }, + { + "epoch": 17.373398868036936, + "grad_norm": 0.0001375333231408149, + "learning_rate": 2.58155988611927e-06, + "loss": 0.0, + "num_input_tokens_seen": 67668496, + "step": 116645 + }, + { + "epoch": 17.374143580577897, + "grad_norm": 3.941265822504647e-05, + "learning_rate": 2.580122003033872e-06, + "loss": 0.0, + "num_input_tokens_seen": 67671216, + "step": 116650 + }, + { + "epoch": 17.374888293118858, + "grad_norm": 7.12273458702839e-06, + "learning_rate": 2.578684498709524e-06, + "loss": 0.0, + "num_input_tokens_seen": 67674224, + "step": 116655 + }, + { + "epoch": 17.375633005659815, + "grad_norm": 6.726761785103008e-05, + "learning_rate": 2.5772473731705106e-06, + "loss": 0.0, + "num_input_tokens_seen": 67677456, + "step": 116660 + }, + { + "epoch": 17.376377718200775, + "grad_norm": 9.469855285715312e-06, + "learning_rate": 2.5758106264411193e-06, + "loss": 0.0, + "num_input_tokens_seen": 67680336, + "step": 116665 + }, + { + "epoch": 17.377122430741732, + "grad_norm": 4.174555215286091e-06, + "learning_rate": 2.574374258545609e-06, + "loss": 0.0, + "num_input_tokens_seen": 67683088, + "step": 116670 + }, + { + "epoch": 17.377867143282693, + "grad_norm": 2.5827357603702694e-05, + "learning_rate": 2.5729382695082572e-06, + "loss": 0.0, + "num_input_tokens_seen": 67686224, + "step": 116675 + }, + { + "epoch": 17.378611855823653, + "grad_norm": 1.8976226783706807e-05, + "learning_rate": 2.5715026593533172e-06, + "loss": 0.0, + "num_input_tokens_seen": 67689392, + "step": 116680 + }, + { + "epoch": 17.37935656836461, + "grad_norm": 1.3714409760723356e-05, + "learning_rate": 2.5700674281050496e-06, + "loss": 0.0, + "num_input_tokens_seen": 67692112, + "step": 116685 + }, + { + "epoch": 17.38010128090557, + "grad_norm": 5.2584662626031786e-05, + "learning_rate": 2.568632575787694e-06, + "loss": 0.0, + "num_input_tokens_seen": 67695120, + "step": 116690 + }, + { + "epoch": 17.38084599344653, + "grad_norm": 0.0023235674016177654, + "learning_rate": 2.567198102425494e-06, + "loss": 0.0, + "num_input_tokens_seen": 67698096, + "step": 116695 + }, + { + "epoch": 17.381590705987488, + "grad_norm": 0.0016804292099550366, + "learning_rate": 2.5657640080426815e-06, + "loss": 0.0, + "num_input_tokens_seen": 67700944, + "step": 116700 + }, + { + "epoch": 17.38233541852845, + "grad_norm": 0.0001844376529334113, + "learning_rate": 2.5643302926634804e-06, + "loss": 0.0, + "num_input_tokens_seen": 67704240, + "step": 116705 + }, + { + "epoch": 17.383080131069406, + "grad_norm": 8.481649274472147e-05, + "learning_rate": 2.562896956312122e-06, + "loss": 0.0, + "num_input_tokens_seen": 67706864, + "step": 116710 + }, + { + "epoch": 17.383824843610366, + "grad_norm": 3.806979293585755e-06, + "learning_rate": 2.5614639990128113e-06, + "loss": 0.0, + "num_input_tokens_seen": 67709744, + "step": 116715 + }, + { + "epoch": 17.384569556151327, + "grad_norm": 1.6902211427805014e-05, + "learning_rate": 2.560031420789763e-06, + "loss": 0.0, + "num_input_tokens_seen": 67712560, + "step": 116720 + }, + { + "epoch": 17.385314268692284, + "grad_norm": 0.0001375512802042067, + "learning_rate": 2.558599221667174e-06, + "loss": 0.0, + "num_input_tokens_seen": 67715472, + "step": 116725 + }, + { + "epoch": 17.386058981233244, + "grad_norm": 6.730700260959566e-05, + "learning_rate": 2.557167401669247e-06, + "loss": 0.0, + "num_input_tokens_seen": 67718544, + "step": 116730 + }, + { + "epoch": 17.386803693774205, + "grad_norm": 5.06293554281001e-06, + "learning_rate": 2.555735960820169e-06, + "loss": 0.0, + "num_input_tokens_seen": 67721520, + "step": 116735 + }, + { + "epoch": 17.38754840631516, + "grad_norm": 2.6255356715410016e-05, + "learning_rate": 2.554304899144111e-06, + "loss": 0.0, + "num_input_tokens_seen": 67724304, + "step": 116740 + }, + { + "epoch": 17.388293118856122, + "grad_norm": 2.6072132186527597e-06, + "learning_rate": 2.552874216665269e-06, + "loss": 0.0, + "num_input_tokens_seen": 67727248, + "step": 116745 + }, + { + "epoch": 17.38903783139708, + "grad_norm": 0.00020498040248639882, + "learning_rate": 2.5514439134077945e-06, + "loss": 0.0, + "num_input_tokens_seen": 67730064, + "step": 116750 + }, + { + "epoch": 17.38978254393804, + "grad_norm": 2.0196819605189376e-05, + "learning_rate": 2.5500139893958663e-06, + "loss": 0.0, + "num_input_tokens_seen": 67732784, + "step": 116755 + }, + { + "epoch": 17.390527256479, + "grad_norm": 2.903495442296844e-06, + "learning_rate": 2.5485844446536316e-06, + "loss": 0.0, + "num_input_tokens_seen": 67735824, + "step": 116760 + }, + { + "epoch": 17.391271969019957, + "grad_norm": 1.130032615037635e-05, + "learning_rate": 2.5471552792052463e-06, + "loss": 0.0, + "num_input_tokens_seen": 67738576, + "step": 116765 + }, + { + "epoch": 17.392016681560918, + "grad_norm": 1.3875889635528438e-05, + "learning_rate": 2.545726493074849e-06, + "loss": 0.0, + "num_input_tokens_seen": 67741456, + "step": 116770 + }, + { + "epoch": 17.39276139410188, + "grad_norm": 1.645917109271977e-05, + "learning_rate": 2.544298086286584e-06, + "loss": 0.0, + "num_input_tokens_seen": 67744336, + "step": 116775 + }, + { + "epoch": 17.393506106642835, + "grad_norm": 1.4414231372938957e-05, + "learning_rate": 2.54287005886458e-06, + "loss": 0.0, + "num_input_tokens_seen": 67746960, + "step": 116780 + }, + { + "epoch": 17.394250819183796, + "grad_norm": 2.606205953270546e-06, + "learning_rate": 2.541442410832959e-06, + "loss": 0.0, + "num_input_tokens_seen": 67749808, + "step": 116785 + }, + { + "epoch": 17.394995531724753, + "grad_norm": 1.6503294318681583e-05, + "learning_rate": 2.540015142215846e-06, + "loss": 0.0, + "num_input_tokens_seen": 67752432, + "step": 116790 + }, + { + "epoch": 17.395740244265713, + "grad_norm": 1.7961208868655376e-05, + "learning_rate": 2.5385882530373438e-06, + "loss": 0.0, + "num_input_tokens_seen": 67755472, + "step": 116795 + }, + { + "epoch": 17.396484956806674, + "grad_norm": 1.0412436495244037e-05, + "learning_rate": 2.53716174332157e-06, + "loss": 0.0, + "num_input_tokens_seen": 67758160, + "step": 116800 + }, + { + "epoch": 17.39722966934763, + "grad_norm": 0.00015343510312959552, + "learning_rate": 2.535735613092613e-06, + "loss": 0.0, + "num_input_tokens_seen": 67760880, + "step": 116805 + }, + { + "epoch": 17.39797438188859, + "grad_norm": 0.00018345998250879347, + "learning_rate": 2.5343098623745787e-06, + "loss": 0.0, + "num_input_tokens_seen": 67763600, + "step": 116810 + }, + { + "epoch": 17.39871909442955, + "grad_norm": 2.4993441911647096e-06, + "learning_rate": 2.532884491191542e-06, + "loss": 0.0, + "num_input_tokens_seen": 67766448, + "step": 116815 + }, + { + "epoch": 17.39946380697051, + "grad_norm": 0.00011188316420884803, + "learning_rate": 2.5314594995675845e-06, + "loss": 0.0, + "num_input_tokens_seen": 67769392, + "step": 116820 + }, + { + "epoch": 17.40020851951147, + "grad_norm": 6.672136805718765e-05, + "learning_rate": 2.530034887526789e-06, + "loss": 0.0, + "num_input_tokens_seen": 67772016, + "step": 116825 + }, + { + "epoch": 17.400953232052427, + "grad_norm": 6.488033704954432e-06, + "learning_rate": 2.5286106550932164e-06, + "loss": 0.0, + "num_input_tokens_seen": 67774960, + "step": 116830 + }, + { + "epoch": 17.401697944593387, + "grad_norm": 1.4230344277166296e-05, + "learning_rate": 2.527186802290926e-06, + "loss": 0.0, + "num_input_tokens_seen": 67777616, + "step": 116835 + }, + { + "epoch": 17.402442657134348, + "grad_norm": 5.836910077050561e-06, + "learning_rate": 2.525763329143971e-06, + "loss": 0.0, + "num_input_tokens_seen": 67780240, + "step": 116840 + }, + { + "epoch": 17.403187369675305, + "grad_norm": 4.01924171455903e-06, + "learning_rate": 2.5243402356764063e-06, + "loss": 0.0, + "num_input_tokens_seen": 67782896, + "step": 116845 + }, + { + "epoch": 17.403932082216265, + "grad_norm": 1.047625937644625e-05, + "learning_rate": 2.522917521912266e-06, + "loss": 0.0, + "num_input_tokens_seen": 67785712, + "step": 116850 + }, + { + "epoch": 17.404676794757222, + "grad_norm": 6.828866753494367e-05, + "learning_rate": 2.5214951878755944e-06, + "loss": 0.0, + "num_input_tokens_seen": 67788464, + "step": 116855 + }, + { + "epoch": 17.405421507298183, + "grad_norm": 6.878263775433879e-06, + "learning_rate": 2.520073233590414e-06, + "loss": 0.0, + "num_input_tokens_seen": 67791600, + "step": 116860 + }, + { + "epoch": 17.406166219839143, + "grad_norm": 4.37653943663463e-06, + "learning_rate": 2.5186516590807453e-06, + "loss": 0.0, + "num_input_tokens_seen": 67794480, + "step": 116865 + }, + { + "epoch": 17.4069109323801, + "grad_norm": 4.1242501538363285e-06, + "learning_rate": 2.5172304643706123e-06, + "loss": 0.0, + "num_input_tokens_seen": 67797488, + "step": 116870 + }, + { + "epoch": 17.40765564492106, + "grad_norm": 0.001399546512402594, + "learning_rate": 2.515809649484016e-06, + "loss": 0.0, + "num_input_tokens_seen": 67800272, + "step": 116875 + }, + { + "epoch": 17.40840035746202, + "grad_norm": 3.1110403142520227e-06, + "learning_rate": 2.5143892144449676e-06, + "loss": 0.0, + "num_input_tokens_seen": 67802992, + "step": 116880 + }, + { + "epoch": 17.409145070002978, + "grad_norm": 8.55978942126967e-06, + "learning_rate": 2.512969159277459e-06, + "loss": 0.0, + "num_input_tokens_seen": 67805904, + "step": 116885 + }, + { + "epoch": 17.40988978254394, + "grad_norm": 1.035649620462209e-05, + "learning_rate": 2.511549484005485e-06, + "loss": 0.0, + "num_input_tokens_seen": 67808880, + "step": 116890 + }, + { + "epoch": 17.410634495084896, + "grad_norm": 1.8827304302249104e-05, + "learning_rate": 2.5101301886530177e-06, + "loss": 0.0, + "num_input_tokens_seen": 67811792, + "step": 116895 + }, + { + "epoch": 17.411379207625856, + "grad_norm": 5.958776910119923e-06, + "learning_rate": 2.508711273244052e-06, + "loss": 0.0, + "num_input_tokens_seen": 67814864, + "step": 116900 + }, + { + "epoch": 17.412123920166817, + "grad_norm": 0.0002396110794506967, + "learning_rate": 2.5072927378025467e-06, + "loss": 0.0, + "num_input_tokens_seen": 67818064, + "step": 116905 + }, + { + "epoch": 17.412868632707774, + "grad_norm": 8.249739039456472e-05, + "learning_rate": 2.505874582352466e-06, + "loss": 0.0, + "num_input_tokens_seen": 67821232, + "step": 116910 + }, + { + "epoch": 17.413613345248734, + "grad_norm": 0.00018618436297401786, + "learning_rate": 2.5044568069177794e-06, + "loss": 0.0, + "num_input_tokens_seen": 67824336, + "step": 116915 + }, + { + "epoch": 17.414358057789695, + "grad_norm": 0.0028363438323140144, + "learning_rate": 2.503039411522423e-06, + "loss": 0.0, + "num_input_tokens_seen": 67826864, + "step": 116920 + }, + { + "epoch": 17.415102770330652, + "grad_norm": 0.0002488620229996741, + "learning_rate": 2.501622396190359e-06, + "loss": 0.0, + "num_input_tokens_seen": 67829520, + "step": 116925 + }, + { + "epoch": 17.415847482871612, + "grad_norm": 5.313861038303003e-06, + "learning_rate": 2.500205760945512e-06, + "loss": 0.0, + "num_input_tokens_seen": 67832176, + "step": 116930 + }, + { + "epoch": 17.41659219541257, + "grad_norm": 2.4522212243027752e-06, + "learning_rate": 2.4987895058118244e-06, + "loss": 0.0, + "num_input_tokens_seen": 67834864, + "step": 116935 + }, + { + "epoch": 17.41733690795353, + "grad_norm": 1.8192993593402207e-05, + "learning_rate": 2.4973736308132214e-06, + "loss": 0.0, + "num_input_tokens_seen": 67838032, + "step": 116940 + }, + { + "epoch": 17.41808162049449, + "grad_norm": 3.976322477683425e-05, + "learning_rate": 2.4959581359736137e-06, + "loss": 0.0, + "num_input_tokens_seen": 67840752, + "step": 116945 + }, + { + "epoch": 17.418826333035447, + "grad_norm": 6.289118755375966e-05, + "learning_rate": 2.494543021316928e-06, + "loss": 0.0, + "num_input_tokens_seen": 67843824, + "step": 116950 + }, + { + "epoch": 17.419571045576408, + "grad_norm": 3.928826845367439e-05, + "learning_rate": 2.4931282868670634e-06, + "loss": 0.0012, + "num_input_tokens_seen": 67846800, + "step": 116955 + }, + { + "epoch": 17.42031575811737, + "grad_norm": 5.098190285934834e-06, + "learning_rate": 2.4917139326479213e-06, + "loss": 0.0, + "num_input_tokens_seen": 67849712, + "step": 116960 + }, + { + "epoch": 17.421060470658325, + "grad_norm": 0.003428203985095024, + "learning_rate": 2.4902999586833897e-06, + "loss": 0.0, + "num_input_tokens_seen": 67852336, + "step": 116965 + }, + { + "epoch": 17.421805183199286, + "grad_norm": 2.501245035091415e-06, + "learning_rate": 2.48888636499737e-06, + "loss": 0.0, + "num_input_tokens_seen": 67855088, + "step": 116970 + }, + { + "epoch": 17.422549895740243, + "grad_norm": 0.003173442790284753, + "learning_rate": 2.4874731516137283e-06, + "loss": 0.0, + "num_input_tokens_seen": 67857872, + "step": 116975 + }, + { + "epoch": 17.423294608281203, + "grad_norm": 1.173239706986351e-05, + "learning_rate": 2.486060318556352e-06, + "loss": 0.0, + "num_input_tokens_seen": 67860816, + "step": 116980 + }, + { + "epoch": 17.424039320822164, + "grad_norm": 8.249267011706252e-06, + "learning_rate": 2.4846478658491075e-06, + "loss": 0.0, + "num_input_tokens_seen": 67863824, + "step": 116985 + }, + { + "epoch": 17.42478403336312, + "grad_norm": 6.818224210292101e-05, + "learning_rate": 2.483235793515845e-06, + "loss": 0.0, + "num_input_tokens_seen": 67866768, + "step": 116990 + }, + { + "epoch": 17.42552874590408, + "grad_norm": 3.44123545801267e-05, + "learning_rate": 2.4818241015804376e-06, + "loss": 0.0, + "num_input_tokens_seen": 67869840, + "step": 116995 + }, + { + "epoch": 17.42627345844504, + "grad_norm": 7.87263888923917e-06, + "learning_rate": 2.480412790066719e-06, + "loss": 0.0, + "num_input_tokens_seen": 67872464, + "step": 117000 + }, + { + "epoch": 17.427018170986, + "grad_norm": 2.5488518531346926e-06, + "learning_rate": 2.479001858998542e-06, + "loss": 0.0, + "num_input_tokens_seen": 67875824, + "step": 117005 + }, + { + "epoch": 17.42776288352696, + "grad_norm": 0.00031151057919487357, + "learning_rate": 2.4775913083997348e-06, + "loss": 0.0, + "num_input_tokens_seen": 67878832, + "step": 117010 + }, + { + "epoch": 17.428507596067917, + "grad_norm": 0.00032547989394515753, + "learning_rate": 2.476181138294137e-06, + "loss": 0.0, + "num_input_tokens_seen": 67881552, + "step": 117015 + }, + { + "epoch": 17.429252308608877, + "grad_norm": 8.55545476952102e-06, + "learning_rate": 2.474771348705568e-06, + "loss": 0.0, + "num_input_tokens_seen": 67884304, + "step": 117020 + }, + { + "epoch": 17.429997021149838, + "grad_norm": 5.870193945156643e-06, + "learning_rate": 2.4733619396578422e-06, + "loss": 0.0, + "num_input_tokens_seen": 67887024, + "step": 117025 + }, + { + "epoch": 17.430741733690795, + "grad_norm": 4.4859384615847375e-06, + "learning_rate": 2.4719529111747715e-06, + "loss": 0.0, + "num_input_tokens_seen": 67890320, + "step": 117030 + }, + { + "epoch": 17.431486446231755, + "grad_norm": 0.002067526802420616, + "learning_rate": 2.470544263280153e-06, + "loss": 0.0, + "num_input_tokens_seen": 67893168, + "step": 117035 + }, + { + "epoch": 17.432231158772712, + "grad_norm": 3.905144694726914e-05, + "learning_rate": 2.4691359959977985e-06, + "loss": 0.0, + "num_input_tokens_seen": 67895856, + "step": 117040 + }, + { + "epoch": 17.432975871313673, + "grad_norm": 4.64602362626465e-06, + "learning_rate": 2.467728109351486e-06, + "loss": 0.0, + "num_input_tokens_seen": 67898640, + "step": 117045 + }, + { + "epoch": 17.433720583854633, + "grad_norm": 1.744554720062297e-05, + "learning_rate": 2.466320603365013e-06, + "loss": 0.0, + "num_input_tokens_seen": 67901648, + "step": 117050 + }, + { + "epoch": 17.43446529639559, + "grad_norm": 3.062804898945615e-05, + "learning_rate": 2.4649134780621446e-06, + "loss": 0.0, + "num_input_tokens_seen": 67904336, + "step": 117055 + }, + { + "epoch": 17.43521000893655, + "grad_norm": 1.0376394129707478e-05, + "learning_rate": 2.463506733466667e-06, + "loss": 0.0, + "num_input_tokens_seen": 67907312, + "step": 117060 + }, + { + "epoch": 17.43595472147751, + "grad_norm": 7.641217052878346e-06, + "learning_rate": 2.4621003696023354e-06, + "loss": 0.0, + "num_input_tokens_seen": 67910320, + "step": 117065 + }, + { + "epoch": 17.43669943401847, + "grad_norm": 0.00015041259757708758, + "learning_rate": 2.4606943864929064e-06, + "loss": 0.0, + "num_input_tokens_seen": 67913040, + "step": 117070 + }, + { + "epoch": 17.43744414655943, + "grad_norm": 0.00016879137547221035, + "learning_rate": 2.459288784162142e-06, + "loss": 0.0, + "num_input_tokens_seen": 67915984, + "step": 117075 + }, + { + "epoch": 17.438188859100386, + "grad_norm": 6.675082840956748e-05, + "learning_rate": 2.457883562633781e-06, + "loss": 0.0, + "num_input_tokens_seen": 67919120, + "step": 117080 + }, + { + "epoch": 17.438933571641346, + "grad_norm": 1.4510456821881235e-05, + "learning_rate": 2.456478721931571e-06, + "loss": 0.0, + "num_input_tokens_seen": 67922000, + "step": 117085 + }, + { + "epoch": 17.439678284182307, + "grad_norm": 5.520797458302695e-06, + "learning_rate": 2.4550742620792404e-06, + "loss": 0.0001, + "num_input_tokens_seen": 67924784, + "step": 117090 + }, + { + "epoch": 17.440422996723264, + "grad_norm": 3.75856848222611e-06, + "learning_rate": 2.4536701831005177e-06, + "loss": 0.0, + "num_input_tokens_seen": 67927792, + "step": 117095 + }, + { + "epoch": 17.441167709264224, + "grad_norm": 2.930997879957431e-06, + "learning_rate": 2.4522664850191223e-06, + "loss": 0.0001, + "num_input_tokens_seen": 67931056, + "step": 117100 + }, + { + "epoch": 17.441912421805185, + "grad_norm": 3.1292815947381314e-06, + "learning_rate": 2.4508631678587635e-06, + "loss": 0.0, + "num_input_tokens_seen": 67934192, + "step": 117105 + }, + { + "epoch": 17.442657134346142, + "grad_norm": 0.0003717676445376128, + "learning_rate": 2.4494602316431554e-06, + "loss": 0.0, + "num_input_tokens_seen": 67937520, + "step": 117110 + }, + { + "epoch": 17.443401846887102, + "grad_norm": 9.589602996129543e-05, + "learning_rate": 2.4480576763959956e-06, + "loss": 0.0, + "num_input_tokens_seen": 67940592, + "step": 117115 + }, + { + "epoch": 17.44414655942806, + "grad_norm": 2.997695901285624e-06, + "learning_rate": 2.4466555021409848e-06, + "loss": 0.0, + "num_input_tokens_seen": 67943376, + "step": 117120 + }, + { + "epoch": 17.44489127196902, + "grad_norm": 8.918640560295898e-06, + "learning_rate": 2.445253708901801e-06, + "loss": 0.0, + "num_input_tokens_seen": 67946160, + "step": 117125 + }, + { + "epoch": 17.44563598450998, + "grad_norm": 4.132192316319561e-06, + "learning_rate": 2.443852296702137e-06, + "loss": 0.0, + "num_input_tokens_seen": 67948848, + "step": 117130 + }, + { + "epoch": 17.446380697050937, + "grad_norm": 0.000629100133664906, + "learning_rate": 2.4424512655656597e-06, + "loss": 0.0, + "num_input_tokens_seen": 67951536, + "step": 117135 + }, + { + "epoch": 17.447125409591898, + "grad_norm": 6.786518497392535e-06, + "learning_rate": 2.4410506155160467e-06, + "loss": 0.0, + "num_input_tokens_seen": 67954672, + "step": 117140 + }, + { + "epoch": 17.447870122132855, + "grad_norm": 6.783968728996115e-06, + "learning_rate": 2.439650346576955e-06, + "loss": 0.0, + "num_input_tokens_seen": 67957520, + "step": 117145 + }, + { + "epoch": 17.448614834673815, + "grad_norm": 4.03750436817063e-06, + "learning_rate": 2.438250458772037e-06, + "loss": 0.0, + "num_input_tokens_seen": 67960752, + "step": 117150 + }, + { + "epoch": 17.449359547214776, + "grad_norm": 1.6749121641623788e-05, + "learning_rate": 2.436850952124953e-06, + "loss": 0.0, + "num_input_tokens_seen": 67963696, + "step": 117155 + }, + { + "epoch": 17.450104259755733, + "grad_norm": 2.2408879885915667e-05, + "learning_rate": 2.435451826659338e-06, + "loss": 0.0, + "num_input_tokens_seen": 67966512, + "step": 117160 + }, + { + "epoch": 17.450848972296694, + "grad_norm": 8.486591468681581e-06, + "learning_rate": 2.4340530823988292e-06, + "loss": 0.0, + "num_input_tokens_seen": 67969552, + "step": 117165 + }, + { + "epoch": 17.451593684837654, + "grad_norm": 4.294469908927567e-05, + "learning_rate": 2.4326547193670556e-06, + "loss": 0.0478, + "num_input_tokens_seen": 67972400, + "step": 117170 + }, + { + "epoch": 17.45233839737861, + "grad_norm": 3.8759659219067544e-05, + "learning_rate": 2.4312567375876503e-06, + "loss": 0.0, + "num_input_tokens_seen": 67975440, + "step": 117175 + }, + { + "epoch": 17.45308310991957, + "grad_norm": 9.686442353995517e-05, + "learning_rate": 2.4298591370842227e-06, + "loss": 0.0, + "num_input_tokens_seen": 67978160, + "step": 117180 + }, + { + "epoch": 17.45382782246053, + "grad_norm": 0.31652113795280457, + "learning_rate": 2.4284619178803814e-06, + "loss": 0.0009, + "num_input_tokens_seen": 67981168, + "step": 117185 + }, + { + "epoch": 17.45457253500149, + "grad_norm": 0.0014937520027160645, + "learning_rate": 2.427065079999741e-06, + "loss": 0.0, + "num_input_tokens_seen": 67984240, + "step": 117190 + }, + { + "epoch": 17.45531724754245, + "grad_norm": 4.640411862055771e-05, + "learning_rate": 2.4256686234658877e-06, + "loss": 0.0, + "num_input_tokens_seen": 67987056, + "step": 117195 + }, + { + "epoch": 17.456061960083407, + "grad_norm": 4.105544576304965e-06, + "learning_rate": 2.4242725483024252e-06, + "loss": 0.0, + "num_input_tokens_seen": 67990160, + "step": 117200 + }, + { + "epoch": 17.456806672624367, + "grad_norm": 5.145483009982854e-05, + "learning_rate": 2.4228768545329267e-06, + "loss": 0.0227, + "num_input_tokens_seen": 67993488, + "step": 117205 + }, + { + "epoch": 17.457551385165328, + "grad_norm": 736.1730346679688, + "learning_rate": 2.421481542180984e-06, + "loss": 0.0645, + "num_input_tokens_seen": 67996432, + "step": 117210 + }, + { + "epoch": 17.458296097706285, + "grad_norm": 0.00010382964683230966, + "learning_rate": 2.4200866112701643e-06, + "loss": 0.0, + "num_input_tokens_seen": 67999504, + "step": 117215 + }, + { + "epoch": 17.459040810247245, + "grad_norm": 6.231802672118647e-06, + "learning_rate": 2.4186920618240294e-06, + "loss": 0.0, + "num_input_tokens_seen": 68002288, + "step": 117220 + }, + { + "epoch": 17.459785522788202, + "grad_norm": 2.767312935247901e-06, + "learning_rate": 2.417297893866138e-06, + "loss": 0.0, + "num_input_tokens_seen": 68005552, + "step": 117225 + }, + { + "epoch": 17.460530235329163, + "grad_norm": 2.4776323698461056e-05, + "learning_rate": 2.41590410742005e-06, + "loss": 0.0, + "num_input_tokens_seen": 68008304, + "step": 117230 + }, + { + "epoch": 17.461274947870123, + "grad_norm": 8.857828106556553e-06, + "learning_rate": 2.4145107025093095e-06, + "loss": 0.0, + "num_input_tokens_seen": 68011056, + "step": 117235 + }, + { + "epoch": 17.46201966041108, + "grad_norm": 1.6614612832199782e-05, + "learning_rate": 2.4131176791574504e-06, + "loss": 0.0, + "num_input_tokens_seen": 68014160, + "step": 117240 + }, + { + "epoch": 17.46276437295204, + "grad_norm": 2.811613285302883e-06, + "learning_rate": 2.4117250373880184e-06, + "loss": 0.0, + "num_input_tokens_seen": 68017008, + "step": 117245 + }, + { + "epoch": 17.463509085493, + "grad_norm": 1.0098173333972227e-05, + "learning_rate": 2.4103327772245275e-06, + "loss": 0.0, + "num_input_tokens_seen": 68019824, + "step": 117250 + }, + { + "epoch": 17.46425379803396, + "grad_norm": 7.444313268933911e-06, + "learning_rate": 2.408940898690512e-06, + "loss": 0.0, + "num_input_tokens_seen": 68022448, + "step": 117255 + }, + { + "epoch": 17.46499851057492, + "grad_norm": 7.802870095474645e-05, + "learning_rate": 2.407549401809478e-06, + "loss": 0.119, + "num_input_tokens_seen": 68025552, + "step": 117260 + }, + { + "epoch": 17.465743223115876, + "grad_norm": 0.023083264008164406, + "learning_rate": 2.4061582866049285e-06, + "loss": 0.0, + "num_input_tokens_seen": 68028272, + "step": 117265 + }, + { + "epoch": 17.466487935656836, + "grad_norm": 3.193687234670506e-06, + "learning_rate": 2.4047675531003787e-06, + "loss": 0.0, + "num_input_tokens_seen": 68031088, + "step": 117270 + }, + { + "epoch": 17.467232648197797, + "grad_norm": 9.83354857453378e-06, + "learning_rate": 2.4033772013193123e-06, + "loss": 0.0, + "num_input_tokens_seen": 68033968, + "step": 117275 + }, + { + "epoch": 17.467977360738754, + "grad_norm": 5.033500929130241e-05, + "learning_rate": 2.4019872312852244e-06, + "loss": 0.0, + "num_input_tokens_seen": 68037008, + "step": 117280 + }, + { + "epoch": 17.468722073279714, + "grad_norm": 3.4830411550501594e-06, + "learning_rate": 2.400597643021596e-06, + "loss": 0.0, + "num_input_tokens_seen": 68040176, + "step": 117285 + }, + { + "epoch": 17.469466785820675, + "grad_norm": 4.810847531189211e-05, + "learning_rate": 2.3992084365519004e-06, + "loss": 0.0, + "num_input_tokens_seen": 68042992, + "step": 117290 + }, + { + "epoch": 17.470211498361632, + "grad_norm": 2.037771810137201e-05, + "learning_rate": 2.3978196118996043e-06, + "loss": 0.0, + "num_input_tokens_seen": 68046000, + "step": 117295 + }, + { + "epoch": 17.470956210902592, + "grad_norm": 9.280713129555807e-05, + "learning_rate": 2.3964311690881786e-06, + "loss": 0.0, + "num_input_tokens_seen": 68048944, + "step": 117300 + }, + { + "epoch": 17.47170092344355, + "grad_norm": 3.275825292803347e-06, + "learning_rate": 2.3950431081410734e-06, + "loss": 0.0, + "num_input_tokens_seen": 68051792, + "step": 117305 + }, + { + "epoch": 17.47244563598451, + "grad_norm": 4.026315582450479e-05, + "learning_rate": 2.393655429081737e-06, + "loss": 0.0, + "num_input_tokens_seen": 68055088, + "step": 117310 + }, + { + "epoch": 17.47319034852547, + "grad_norm": 0.00014627882046625018, + "learning_rate": 2.3922681319336197e-06, + "loss": 0.0005, + "num_input_tokens_seen": 68057808, + "step": 117315 + }, + { + "epoch": 17.473935061066427, + "grad_norm": 2.2522768631461076e-05, + "learning_rate": 2.3908812167201532e-06, + "loss": 0.0, + "num_input_tokens_seen": 68060560, + "step": 117320 + }, + { + "epoch": 17.474679773607388, + "grad_norm": 3.676432015708997e-06, + "learning_rate": 2.3894946834647713e-06, + "loss": 0.0, + "num_input_tokens_seen": 68063568, + "step": 117325 + }, + { + "epoch": 17.47542448614835, + "grad_norm": 5.8368646023154724e-06, + "learning_rate": 2.3881085321908912e-06, + "loss": 0.0, + "num_input_tokens_seen": 68066416, + "step": 117330 + }, + { + "epoch": 17.476169198689306, + "grad_norm": 0.00028526998357847333, + "learning_rate": 2.3867227629219417e-06, + "loss": 0.0, + "num_input_tokens_seen": 68069552, + "step": 117335 + }, + { + "epoch": 17.476913911230266, + "grad_norm": 0.00014976553211454302, + "learning_rate": 2.385337375681329e-06, + "loss": 0.0, + "num_input_tokens_seen": 68072368, + "step": 117340 + }, + { + "epoch": 17.477658623771223, + "grad_norm": 2.446341795803164e-06, + "learning_rate": 2.383952370492451e-06, + "loss": 0.0, + "num_input_tokens_seen": 68075280, + "step": 117345 + }, + { + "epoch": 17.478403336312184, + "grad_norm": 4.591639299178496e-05, + "learning_rate": 2.3825677473787168e-06, + "loss": 0.0, + "num_input_tokens_seen": 68078160, + "step": 117350 + }, + { + "epoch": 17.479148048853144, + "grad_norm": 0.0002516631502658129, + "learning_rate": 2.381183506363513e-06, + "loss": 0.0, + "num_input_tokens_seen": 68081136, + "step": 117355 + }, + { + "epoch": 17.4798927613941, + "grad_norm": 3.0863007850712165e-05, + "learning_rate": 2.3797996474702267e-06, + "loss": 0.0, + "num_input_tokens_seen": 68083920, + "step": 117360 + }, + { + "epoch": 17.48063747393506, + "grad_norm": 8.818210517347325e-06, + "learning_rate": 2.378416170722228e-06, + "loss": 0.0, + "num_input_tokens_seen": 68087056, + "step": 117365 + }, + { + "epoch": 17.48138218647602, + "grad_norm": 0.00013410262181423604, + "learning_rate": 2.3770330761429012e-06, + "loss": 0.0, + "num_input_tokens_seen": 68090000, + "step": 117370 + }, + { + "epoch": 17.48212689901698, + "grad_norm": 0.0001581459800945595, + "learning_rate": 2.375650363755605e-06, + "loss": 0.0, + "num_input_tokens_seen": 68093168, + "step": 117375 + }, + { + "epoch": 17.48287161155794, + "grad_norm": 7.2379925768473186e-06, + "learning_rate": 2.3742680335837042e-06, + "loss": 0.0, + "num_input_tokens_seen": 68096336, + "step": 117380 + }, + { + "epoch": 17.483616324098897, + "grad_norm": 0.0001887391263153404, + "learning_rate": 2.3728860856505526e-06, + "loss": 0.0, + "num_input_tokens_seen": 68099024, + "step": 117385 + }, + { + "epoch": 17.484361036639857, + "grad_norm": 0.00018695020116865635, + "learning_rate": 2.371504519979484e-06, + "loss": 0.0, + "num_input_tokens_seen": 68101840, + "step": 117390 + }, + { + "epoch": 17.485105749180818, + "grad_norm": 1.92789902939694e-05, + "learning_rate": 2.3701233365938547e-06, + "loss": 0.0, + "num_input_tokens_seen": 68104400, + "step": 117395 + }, + { + "epoch": 17.485850461721775, + "grad_norm": 2.4304042653966462e-06, + "learning_rate": 2.368742535516988e-06, + "loss": 0.0, + "num_input_tokens_seen": 68107088, + "step": 117400 + }, + { + "epoch": 17.486595174262735, + "grad_norm": 1.23347963381093e-05, + "learning_rate": 2.3673621167722202e-06, + "loss": 0.0, + "num_input_tokens_seen": 68109904, + "step": 117405 + }, + { + "epoch": 17.487339886803692, + "grad_norm": 2.6349531253799796e-05, + "learning_rate": 2.3659820803828586e-06, + "loss": 0.0, + "num_input_tokens_seen": 68113008, + "step": 117410 + }, + { + "epoch": 17.488084599344653, + "grad_norm": 2.9028194603597512e-06, + "learning_rate": 2.364602426372234e-06, + "loss": 0.0, + "num_input_tokens_seen": 68116208, + "step": 117415 + }, + { + "epoch": 17.488829311885613, + "grad_norm": 5.961723672953667e-06, + "learning_rate": 2.3632231547636443e-06, + "loss": 0.0, + "num_input_tokens_seen": 68119152, + "step": 117420 + }, + { + "epoch": 17.48957402442657, + "grad_norm": 0.0009311743779107928, + "learning_rate": 2.3618442655803903e-06, + "loss": 0.0, + "num_input_tokens_seen": 68121776, + "step": 117425 + }, + { + "epoch": 17.49031873696753, + "grad_norm": 2.6660770799935563e-06, + "learning_rate": 2.36046575884577e-06, + "loss": 0.0, + "num_input_tokens_seen": 68124720, + "step": 117430 + }, + { + "epoch": 17.49106344950849, + "grad_norm": 0.0003032337117474526, + "learning_rate": 2.359087634583068e-06, + "loss": 0.0, + "num_input_tokens_seen": 68127728, + "step": 117435 + }, + { + "epoch": 17.49180816204945, + "grad_norm": 2.669706873348332e-06, + "learning_rate": 2.357709892815574e-06, + "loss": 0.0, + "num_input_tokens_seen": 68130736, + "step": 117440 + }, + { + "epoch": 17.49255287459041, + "grad_norm": 1.9843409972963855e-05, + "learning_rate": 2.356332533566552e-06, + "loss": 0.0, + "num_input_tokens_seen": 68133424, + "step": 117445 + }, + { + "epoch": 17.493297587131366, + "grad_norm": 7.257421202666592e-06, + "learning_rate": 2.354955556859284e-06, + "loss": 0.0, + "num_input_tokens_seen": 68136528, + "step": 117450 + }, + { + "epoch": 17.494042299672326, + "grad_norm": 3.658966670627706e-05, + "learning_rate": 2.3535789627170205e-06, + "loss": 0.0, + "num_input_tokens_seen": 68139568, + "step": 117455 + }, + { + "epoch": 17.494787012213287, + "grad_norm": 3.688685319502838e-05, + "learning_rate": 2.3522027511630297e-06, + "loss": 0.0, + "num_input_tokens_seen": 68142224, + "step": 117460 + }, + { + "epoch": 17.495531724754244, + "grad_norm": 2.1737025235779583e-05, + "learning_rate": 2.350826922220553e-06, + "loss": 0.0, + "num_input_tokens_seen": 68144816, + "step": 117465 + }, + { + "epoch": 17.496276437295204, + "grad_norm": 4.453216206457e-06, + "learning_rate": 2.349451475912834e-06, + "loss": 0.0, + "num_input_tokens_seen": 68147728, + "step": 117470 + }, + { + "epoch": 17.497021149836165, + "grad_norm": 9.823685104493052e-05, + "learning_rate": 2.3480764122631143e-06, + "loss": 0.0, + "num_input_tokens_seen": 68150640, + "step": 117475 + }, + { + "epoch": 17.497765862377122, + "grad_norm": 4.489185448619537e-05, + "learning_rate": 2.3467017312946175e-06, + "loss": 0.0, + "num_input_tokens_seen": 68153584, + "step": 117480 + }, + { + "epoch": 17.498510574918082, + "grad_norm": 1.4960627595428377e-05, + "learning_rate": 2.345327433030575e-06, + "loss": 0.0, + "num_input_tokens_seen": 68156368, + "step": 117485 + }, + { + "epoch": 17.49925528745904, + "grad_norm": 0.00019100226927548647, + "learning_rate": 2.343953517494202e-06, + "loss": 0.0, + "num_input_tokens_seen": 68159344, + "step": 117490 + }, + { + "epoch": 17.5, + "grad_norm": 7.286477921297774e-05, + "learning_rate": 2.3425799847087066e-06, + "loss": 0.0, + "num_input_tokens_seen": 68162384, + "step": 117495 + }, + { + "epoch": 17.50074471254096, + "grad_norm": 0.00011890831956407055, + "learning_rate": 2.341206834697288e-06, + "loss": 0.0, + "num_input_tokens_seen": 68165328, + "step": 117500 + }, + { + "epoch": 17.501489425081918, + "grad_norm": 6.973677635192871, + "learning_rate": 2.339834067483157e-06, + "loss": 0.0704, + "num_input_tokens_seen": 68168112, + "step": 117505 + }, + { + "epoch": 17.502234137622878, + "grad_norm": 1.0088662747875787e-05, + "learning_rate": 2.338461683089499e-06, + "loss": 0.0, + "num_input_tokens_seen": 68171184, + "step": 117510 + }, + { + "epoch": 17.502978850163835, + "grad_norm": 8.658405022288207e-06, + "learning_rate": 2.337089681539495e-06, + "loss": 0.0, + "num_input_tokens_seen": 68173968, + "step": 117515 + }, + { + "epoch": 17.503723562704796, + "grad_norm": 2.928908543253783e-05, + "learning_rate": 2.335718062856329e-06, + "loss": 0.0, + "num_input_tokens_seen": 68177104, + "step": 117520 + }, + { + "epoch": 17.504468275245756, + "grad_norm": 4.5941917051095515e-05, + "learning_rate": 2.334346827063169e-06, + "loss": 0.0, + "num_input_tokens_seen": 68179952, + "step": 117525 + }, + { + "epoch": 17.505212987786713, + "grad_norm": 2.1467501483130036e-06, + "learning_rate": 2.332975974183185e-06, + "loss": 0.0, + "num_input_tokens_seen": 68182864, + "step": 117530 + }, + { + "epoch": 17.505957700327674, + "grad_norm": 4.11826931667747e-06, + "learning_rate": 2.331605504239534e-06, + "loss": 0.0, + "num_input_tokens_seen": 68185744, + "step": 117535 + }, + { + "epoch": 17.506702412868634, + "grad_norm": 0.0006134203285910189, + "learning_rate": 2.330235417255369e-06, + "loss": 0.0, + "num_input_tokens_seen": 68188560, + "step": 117540 + }, + { + "epoch": 17.50744712540959, + "grad_norm": 0.0012934927362948656, + "learning_rate": 2.3288657132538387e-06, + "loss": 0.0, + "num_input_tokens_seen": 68191504, + "step": 117545 + }, + { + "epoch": 17.50819183795055, + "grad_norm": 4.140122200624319e-06, + "learning_rate": 2.32749639225808e-06, + "loss": 0.0, + "num_input_tokens_seen": 68194640, + "step": 117550 + }, + { + "epoch": 17.50893655049151, + "grad_norm": 1.2977967344340868e-05, + "learning_rate": 2.3261274542912213e-06, + "loss": 0.0, + "num_input_tokens_seen": 68197520, + "step": 117555 + }, + { + "epoch": 17.50968126303247, + "grad_norm": 5.2693144425575156e-06, + "learning_rate": 2.3247588993764e-06, + "loss": 0.0, + "num_input_tokens_seen": 68200752, + "step": 117560 + }, + { + "epoch": 17.51042597557343, + "grad_norm": 1.0140370250155684e-05, + "learning_rate": 2.3233907275367283e-06, + "loss": 0.0, + "num_input_tokens_seen": 68203472, + "step": 117565 + }, + { + "epoch": 17.511170688114387, + "grad_norm": 3.337319867569022e-05, + "learning_rate": 2.3220229387953207e-06, + "loss": 0.0, + "num_input_tokens_seen": 68206224, + "step": 117570 + }, + { + "epoch": 17.511915400655347, + "grad_norm": 0.00014451690367422998, + "learning_rate": 2.3206555331752922e-06, + "loss": 0.0, + "num_input_tokens_seen": 68208976, + "step": 117575 + }, + { + "epoch": 17.512660113196308, + "grad_norm": 2.7946971385972574e-05, + "learning_rate": 2.3192885106997327e-06, + "loss": 0.0, + "num_input_tokens_seen": 68211888, + "step": 117580 + }, + { + "epoch": 17.513404825737265, + "grad_norm": 0.0001054465101333335, + "learning_rate": 2.3179218713917456e-06, + "loss": 0.0, + "num_input_tokens_seen": 68214736, + "step": 117585 + }, + { + "epoch": 17.514149538278225, + "grad_norm": 5.5835182138253e-05, + "learning_rate": 2.3165556152744154e-06, + "loss": 0.0, + "num_input_tokens_seen": 68217584, + "step": 117590 + }, + { + "epoch": 17.514894250819182, + "grad_norm": 0.00023580784909427166, + "learning_rate": 2.3151897423708214e-06, + "loss": 0.0, + "num_input_tokens_seen": 68220432, + "step": 117595 + }, + { + "epoch": 17.515638963360143, + "grad_norm": 4.7717380766698625e-06, + "learning_rate": 2.3138242527040416e-06, + "loss": 0.0, + "num_input_tokens_seen": 68223344, + "step": 117600 + }, + { + "epoch": 17.516383675901103, + "grad_norm": 0.00013331935042515397, + "learning_rate": 2.3124591462971414e-06, + "loss": 0.0, + "num_input_tokens_seen": 68226384, + "step": 117605 + }, + { + "epoch": 17.51712838844206, + "grad_norm": 6.302042311290279e-05, + "learning_rate": 2.3110944231731907e-06, + "loss": 0.0, + "num_input_tokens_seen": 68229136, + "step": 117610 + }, + { + "epoch": 17.51787310098302, + "grad_norm": 4.985087434761226e-05, + "learning_rate": 2.309730083355238e-06, + "loss": 0.0, + "num_input_tokens_seen": 68232336, + "step": 117615 + }, + { + "epoch": 17.51861781352398, + "grad_norm": 6.125746585894376e-05, + "learning_rate": 2.3083661268663376e-06, + "loss": 0.0, + "num_input_tokens_seen": 68235152, + "step": 117620 + }, + { + "epoch": 17.51936252606494, + "grad_norm": 5.564196908380836e-05, + "learning_rate": 2.3070025537295257e-06, + "loss": 0.0, + "num_input_tokens_seen": 68238256, + "step": 117625 + }, + { + "epoch": 17.5201072386059, + "grad_norm": 2.287988354510162e-05, + "learning_rate": 2.3056393639678374e-06, + "loss": 0.0, + "num_input_tokens_seen": 68241264, + "step": 117630 + }, + { + "epoch": 17.520851951146856, + "grad_norm": 2.350542445128667e-06, + "learning_rate": 2.3042765576043124e-06, + "loss": 0.0, + "num_input_tokens_seen": 68244144, + "step": 117635 + }, + { + "epoch": 17.521596663687816, + "grad_norm": 0.0005494356155395508, + "learning_rate": 2.3029141346619624e-06, + "loss": 0.0, + "num_input_tokens_seen": 68247024, + "step": 117640 + }, + { + "epoch": 17.522341376228777, + "grad_norm": 3.5496319469530135e-05, + "learning_rate": 2.301552095163814e-06, + "loss": 0.0, + "num_input_tokens_seen": 68249520, + "step": 117645 + }, + { + "epoch": 17.523086088769734, + "grad_norm": 0.00045821419917047024, + "learning_rate": 2.3001904391328683e-06, + "loss": 0.0, + "num_input_tokens_seen": 68252560, + "step": 117650 + }, + { + "epoch": 17.523830801310694, + "grad_norm": 1.021219213725999e-05, + "learning_rate": 2.2988291665921396e-06, + "loss": 0.0, + "num_input_tokens_seen": 68255568, + "step": 117655 + }, + { + "epoch": 17.52457551385165, + "grad_norm": 4.231512866681442e-05, + "learning_rate": 2.2974682775646157e-06, + "loss": 0.0, + "num_input_tokens_seen": 68258832, + "step": 117660 + }, + { + "epoch": 17.525320226392612, + "grad_norm": 7.2547780291643e-05, + "learning_rate": 2.2961077720732943e-06, + "loss": 0.0, + "num_input_tokens_seen": 68262032, + "step": 117665 + }, + { + "epoch": 17.526064938933573, + "grad_norm": 3.996897703473223e-06, + "learning_rate": 2.2947476501411573e-06, + "loss": 0.0, + "num_input_tokens_seen": 68265232, + "step": 117670 + }, + { + "epoch": 17.52680965147453, + "grad_norm": 5.105992386233993e-06, + "learning_rate": 2.293387911791178e-06, + "loss": 0.0, + "num_input_tokens_seen": 68268048, + "step": 117675 + }, + { + "epoch": 17.52755436401549, + "grad_norm": 0.00016708079783711582, + "learning_rate": 2.2920285570463386e-06, + "loss": 0.0, + "num_input_tokens_seen": 68270896, + "step": 117680 + }, + { + "epoch": 17.52829907655645, + "grad_norm": 1.055729626386892e-05, + "learning_rate": 2.2906695859295946e-06, + "loss": 0.0, + "num_input_tokens_seen": 68273936, + "step": 117685 + }, + { + "epoch": 17.529043789097408, + "grad_norm": 0.00013191261678002775, + "learning_rate": 2.2893109984639086e-06, + "loss": 0.0, + "num_input_tokens_seen": 68276720, + "step": 117690 + }, + { + "epoch": 17.529788501638368, + "grad_norm": 0.0015446977922692895, + "learning_rate": 2.287952794672227e-06, + "loss": 0.0, + "num_input_tokens_seen": 68279600, + "step": 117695 + }, + { + "epoch": 17.530533214179325, + "grad_norm": 2.1906431356910616e-05, + "learning_rate": 2.2865949745775024e-06, + "loss": 0.0, + "num_input_tokens_seen": 68282448, + "step": 117700 + }, + { + "epoch": 17.531277926720286, + "grad_norm": 1.0000314432545565e-05, + "learning_rate": 2.2852375382026727e-06, + "loss": 0.0, + "num_input_tokens_seen": 68285200, + "step": 117705 + }, + { + "epoch": 17.532022639261246, + "grad_norm": 4.42428354290314e-05, + "learning_rate": 2.2838804855706606e-06, + "loss": 0.0674, + "num_input_tokens_seen": 68288080, + "step": 117710 + }, + { + "epoch": 17.532767351802203, + "grad_norm": 3.0337300813698675e-06, + "learning_rate": 2.282523816704407e-06, + "loss": 0.0, + "num_input_tokens_seen": 68290928, + "step": 117715 + }, + { + "epoch": 17.533512064343164, + "grad_norm": 5.923106073169038e-05, + "learning_rate": 2.2811675316268212e-06, + "loss": 0.0019, + "num_input_tokens_seen": 68293584, + "step": 117720 + }, + { + "epoch": 17.534256776884124, + "grad_norm": 9.22039271245012e-06, + "learning_rate": 2.279811630360823e-06, + "loss": 0.002, + "num_input_tokens_seen": 68296688, + "step": 117725 + }, + { + "epoch": 17.53500148942508, + "grad_norm": 2.1601813386951108e-06, + "learning_rate": 2.2784561129293086e-06, + "loss": 0.0, + "num_input_tokens_seen": 68299696, + "step": 117730 + }, + { + "epoch": 17.53574620196604, + "grad_norm": 2.8390988973114872e-06, + "learning_rate": 2.2771009793551927e-06, + "loss": 0.0, + "num_input_tokens_seen": 68302608, + "step": 117735 + }, + { + "epoch": 17.536490914507, + "grad_norm": 4.320620064390823e-05, + "learning_rate": 2.275746229661355e-06, + "loss": 0.0, + "num_input_tokens_seen": 68305584, + "step": 117740 + }, + { + "epoch": 17.53723562704796, + "grad_norm": 7.70598944654921e-06, + "learning_rate": 2.2743918638706952e-06, + "loss": 0.0, + "num_input_tokens_seen": 68308784, + "step": 117745 + }, + { + "epoch": 17.53798033958892, + "grad_norm": 1.640987284190487e-05, + "learning_rate": 2.2730378820060847e-06, + "loss": 0.0, + "num_input_tokens_seen": 68311792, + "step": 117750 + }, + { + "epoch": 17.538725052129877, + "grad_norm": 7.984055991983041e-05, + "learning_rate": 2.271684284090403e-06, + "loss": 0.0, + "num_input_tokens_seen": 68314544, + "step": 117755 + }, + { + "epoch": 17.539469764670837, + "grad_norm": 3.3102267025242327e-06, + "learning_rate": 2.270331070146514e-06, + "loss": 0.0, + "num_input_tokens_seen": 68317360, + "step": 117760 + }, + { + "epoch": 17.540214477211798, + "grad_norm": 4.615153102349723e-06, + "learning_rate": 2.2689782401972783e-06, + "loss": 0.0, + "num_input_tokens_seen": 68320560, + "step": 117765 + }, + { + "epoch": 17.540959189752755, + "grad_norm": 2.05581309273839e-05, + "learning_rate": 2.2676257942655544e-06, + "loss": 0.0, + "num_input_tokens_seen": 68324656, + "step": 117770 + }, + { + "epoch": 17.541703902293715, + "grad_norm": 3.21672014251817e-05, + "learning_rate": 2.266273732374183e-06, + "loss": 0.0, + "num_input_tokens_seen": 68327504, + "step": 117775 + }, + { + "epoch": 17.542448614834672, + "grad_norm": 8.341506327269599e-05, + "learning_rate": 2.2649220545460176e-06, + "loss": 0.0, + "num_input_tokens_seen": 68330512, + "step": 117780 + }, + { + "epoch": 17.543193327375633, + "grad_norm": 0.00025159146753139794, + "learning_rate": 2.2635707608038877e-06, + "loss": 0.0, + "num_input_tokens_seen": 68333392, + "step": 117785 + }, + { + "epoch": 17.543938039916593, + "grad_norm": 2.9123914373485604e-06, + "learning_rate": 2.262219851170616e-06, + "loss": 0.0, + "num_input_tokens_seen": 68336272, + "step": 117790 + }, + { + "epoch": 17.54468275245755, + "grad_norm": 1.1314801668049768e-05, + "learning_rate": 2.260869325669035e-06, + "loss": 0.0, + "num_input_tokens_seen": 68339280, + "step": 117795 + }, + { + "epoch": 17.54542746499851, + "grad_norm": 0.0021630702540278435, + "learning_rate": 2.259519184321951e-06, + "loss": 0.0, + "num_input_tokens_seen": 68342128, + "step": 117800 + }, + { + "epoch": 17.54617217753947, + "grad_norm": 5.367320045479573e-05, + "learning_rate": 2.2581694271521817e-06, + "loss": 0.0, + "num_input_tokens_seen": 68345392, + "step": 117805 + }, + { + "epoch": 17.54691689008043, + "grad_norm": 4.821285983780399e-05, + "learning_rate": 2.2568200541825236e-06, + "loss": 0.0, + "num_input_tokens_seen": 68348880, + "step": 117810 + }, + { + "epoch": 17.54766160262139, + "grad_norm": 1.3451421182253398e-05, + "learning_rate": 2.2554710654357796e-06, + "loss": 0.0, + "num_input_tokens_seen": 68352176, + "step": 117815 + }, + { + "epoch": 17.548406315162346, + "grad_norm": 2.0517786651907954e-06, + "learning_rate": 2.254122460934735e-06, + "loss": 0.0, + "num_input_tokens_seen": 68355440, + "step": 117820 + }, + { + "epoch": 17.549151027703306, + "grad_norm": 3.3059400266211014e-06, + "learning_rate": 2.2527742407021742e-06, + "loss": 0.1035, + "num_input_tokens_seen": 68358288, + "step": 117825 + }, + { + "epoch": 17.549895740244267, + "grad_norm": 1.716055703582242e-05, + "learning_rate": 2.251426404760873e-06, + "loss": 0.0, + "num_input_tokens_seen": 68361136, + "step": 117830 + }, + { + "epoch": 17.550640452785224, + "grad_norm": 0.047674596309661865, + "learning_rate": 2.250078953133597e-06, + "loss": 0.0, + "num_input_tokens_seen": 68364272, + "step": 117835 + }, + { + "epoch": 17.551385165326185, + "grad_norm": 3.080016540479846e-05, + "learning_rate": 2.2487318858431193e-06, + "loss": 0.0, + "num_input_tokens_seen": 68366896, + "step": 117840 + }, + { + "epoch": 17.552129877867145, + "grad_norm": 3.0857770525472006e-06, + "learning_rate": 2.247385202912189e-06, + "loss": 0.0, + "num_input_tokens_seen": 68369552, + "step": 117845 + }, + { + "epoch": 17.552874590408102, + "grad_norm": 6.778152601327747e-05, + "learning_rate": 2.246038904363565e-06, + "loss": 0.0, + "num_input_tokens_seen": 68372304, + "step": 117850 + }, + { + "epoch": 17.553619302949063, + "grad_norm": 7.929990533739328e-05, + "learning_rate": 2.2446929902199847e-06, + "loss": 0.0, + "num_input_tokens_seen": 68375088, + "step": 117855 + }, + { + "epoch": 17.55436401549002, + "grad_norm": 4.064177119289525e-06, + "learning_rate": 2.2433474605041917e-06, + "loss": 0.0, + "num_input_tokens_seen": 68378000, + "step": 117860 + }, + { + "epoch": 17.55510872803098, + "grad_norm": 3.181976353516802e-05, + "learning_rate": 2.242002315238917e-06, + "loss": 0.0, + "num_input_tokens_seen": 68380880, + "step": 117865 + }, + { + "epoch": 17.55585344057194, + "grad_norm": 2.468573256919626e-06, + "learning_rate": 2.240657554446876e-06, + "loss": 0.0, + "num_input_tokens_seen": 68383664, + "step": 117870 + }, + { + "epoch": 17.556598153112898, + "grad_norm": 1.2767111911671236e-05, + "learning_rate": 2.239313178150798e-06, + "loss": 0.0, + "num_input_tokens_seen": 68386320, + "step": 117875 + }, + { + "epoch": 17.557342865653858, + "grad_norm": 4.6787077735643834e-05, + "learning_rate": 2.2379691863733927e-06, + "loss": 0.0, + "num_input_tokens_seen": 68389488, + "step": 117880 + }, + { + "epoch": 17.558087578194815, + "grad_norm": 1.7348456822219305e-06, + "learning_rate": 2.236625579137358e-06, + "loss": 0.0, + "num_input_tokens_seen": 68392464, + "step": 117885 + }, + { + "epoch": 17.558832290735776, + "grad_norm": 1.639089714444708e-05, + "learning_rate": 2.235282356465404e-06, + "loss": 0.0, + "num_input_tokens_seen": 68395280, + "step": 117890 + }, + { + "epoch": 17.559577003276736, + "grad_norm": 0.00017358537297695875, + "learning_rate": 2.233939518380218e-06, + "loss": 0.0, + "num_input_tokens_seen": 68398256, + "step": 117895 + }, + { + "epoch": 17.560321715817693, + "grad_norm": 3.7019065075583057e-06, + "learning_rate": 2.232597064904479e-06, + "loss": 0.0, + "num_input_tokens_seen": 68400976, + "step": 117900 + }, + { + "epoch": 17.561066428358654, + "grad_norm": 1.8594656694403966e-06, + "learning_rate": 2.2312549960608804e-06, + "loss": 0.0, + "num_input_tokens_seen": 68403824, + "step": 117905 + }, + { + "epoch": 17.561811140899614, + "grad_norm": 5.78205026613432e-06, + "learning_rate": 2.2299133118720837e-06, + "loss": 0.0, + "num_input_tokens_seen": 68406448, + "step": 117910 + }, + { + "epoch": 17.56255585344057, + "grad_norm": 1.7809348719310947e-05, + "learning_rate": 2.228572012360758e-06, + "loss": 0.0, + "num_input_tokens_seen": 68409168, + "step": 117915 + }, + { + "epoch": 17.56330056598153, + "grad_norm": 3.851417659461731e-06, + "learning_rate": 2.2272310975495673e-06, + "loss": 0.0, + "num_input_tokens_seen": 68411984, + "step": 117920 + }, + { + "epoch": 17.56404527852249, + "grad_norm": 0.00010011781705543399, + "learning_rate": 2.2258905674611556e-06, + "loss": 0.0, + "num_input_tokens_seen": 68415216, + "step": 117925 + }, + { + "epoch": 17.56478999106345, + "grad_norm": 5.329987743607489e-06, + "learning_rate": 2.224550422118185e-06, + "loss": 0.0, + "num_input_tokens_seen": 68418032, + "step": 117930 + }, + { + "epoch": 17.56553470360441, + "grad_norm": 5.300770681060385e-06, + "learning_rate": 2.223210661543279e-06, + "loss": 0.0, + "num_input_tokens_seen": 68420752, + "step": 117935 + }, + { + "epoch": 17.566279416145367, + "grad_norm": 7.57508405513363e-06, + "learning_rate": 2.221871285759086e-06, + "loss": 0.0, + "num_input_tokens_seen": 68423792, + "step": 117940 + }, + { + "epoch": 17.567024128686327, + "grad_norm": 4.817787612410029e-06, + "learning_rate": 2.2205322947882245e-06, + "loss": 0.0, + "num_input_tokens_seen": 68426992, + "step": 117945 + }, + { + "epoch": 17.567768841227288, + "grad_norm": 2.498791945981793e-05, + "learning_rate": 2.2191936886533206e-06, + "loss": 0.0, + "num_input_tokens_seen": 68429776, + "step": 117950 + }, + { + "epoch": 17.568513553768245, + "grad_norm": 1.4177548109728377e-05, + "learning_rate": 2.2178554673769863e-06, + "loss": 0.0, + "num_input_tokens_seen": 68433072, + "step": 117955 + }, + { + "epoch": 17.569258266309205, + "grad_norm": 0.0024104323238134384, + "learning_rate": 2.216517630981821e-06, + "loss": 0.0, + "num_input_tokens_seen": 68435888, + "step": 117960 + }, + { + "epoch": 17.570002978850162, + "grad_norm": 1.1783587979152799e-05, + "learning_rate": 2.215180179490442e-06, + "loss": 0.0, + "num_input_tokens_seen": 68439088, + "step": 117965 + }, + { + "epoch": 17.570747691391123, + "grad_norm": 1.4723368622071575e-05, + "learning_rate": 2.2138431129254318e-06, + "loss": 0.0, + "num_input_tokens_seen": 68441936, + "step": 117970 + }, + { + "epoch": 17.571492403932083, + "grad_norm": 5.4598758651991375e-06, + "learning_rate": 2.2125064313093857e-06, + "loss": 0.0, + "num_input_tokens_seen": 68444688, + "step": 117975 + }, + { + "epoch": 17.57223711647304, + "grad_norm": 5.1597930905700196e-06, + "learning_rate": 2.2111701346648806e-06, + "loss": 0.0, + "num_input_tokens_seen": 68447536, + "step": 117980 + }, + { + "epoch": 17.572981829014, + "grad_norm": 7.198296225396916e-05, + "learning_rate": 2.209834223014498e-06, + "loss": 0.0, + "num_input_tokens_seen": 68450448, + "step": 117985 + }, + { + "epoch": 17.57372654155496, + "grad_norm": 0.00010765873594209552, + "learning_rate": 2.2084986963808036e-06, + "loss": 0.0, + "num_input_tokens_seen": 68453360, + "step": 117990 + }, + { + "epoch": 17.57447125409592, + "grad_norm": 1.3769667930318974e-05, + "learning_rate": 2.2071635547863565e-06, + "loss": 0.0, + "num_input_tokens_seen": 68456272, + "step": 117995 + }, + { + "epoch": 17.57521596663688, + "grad_norm": 1.650124613661319e-05, + "learning_rate": 2.2058287982537175e-06, + "loss": 0.0, + "num_input_tokens_seen": 68459184, + "step": 118000 + }, + { + "epoch": 17.575960679177836, + "grad_norm": 2.4135297280736268e-05, + "learning_rate": 2.2044944268054315e-06, + "loss": 0.0, + "num_input_tokens_seen": 68461744, + "step": 118005 + }, + { + "epoch": 17.576705391718797, + "grad_norm": 1.7729860701365396e-05, + "learning_rate": 2.203160440464047e-06, + "loss": 0.0, + "num_input_tokens_seen": 68464336, + "step": 118010 + }, + { + "epoch": 17.577450104259757, + "grad_norm": 2.972560878333752e-06, + "learning_rate": 2.2018268392520998e-06, + "loss": 0.0, + "num_input_tokens_seen": 68467504, + "step": 118015 + }, + { + "epoch": 17.578194816800714, + "grad_norm": 0.0003963906201533973, + "learning_rate": 2.2004936231921153e-06, + "loss": 0.0, + "num_input_tokens_seen": 68470320, + "step": 118020 + }, + { + "epoch": 17.578939529341675, + "grad_norm": 3.6393169011716964e-06, + "learning_rate": 2.1991607923066176e-06, + "loss": 0.0, + "num_input_tokens_seen": 68472976, + "step": 118025 + }, + { + "epoch": 17.57968424188263, + "grad_norm": 8.67641210788861e-06, + "learning_rate": 2.197828346618122e-06, + "loss": 0.0, + "num_input_tokens_seen": 68475696, + "step": 118030 + }, + { + "epoch": 17.580428954423592, + "grad_norm": 2.440284333715681e-06, + "learning_rate": 2.196496286149144e-06, + "loss": 0.0, + "num_input_tokens_seen": 68478384, + "step": 118035 + }, + { + "epoch": 17.581173666964553, + "grad_norm": 2.007657167268917e-05, + "learning_rate": 2.195164610922182e-06, + "loss": 0.0, + "num_input_tokens_seen": 68481232, + "step": 118040 + }, + { + "epoch": 17.58191837950551, + "grad_norm": 1.1915607501578052e-05, + "learning_rate": 2.1938333209597373e-06, + "loss": 0.0, + "num_input_tokens_seen": 68484496, + "step": 118045 + }, + { + "epoch": 17.58266309204647, + "grad_norm": 0.0006639143102802336, + "learning_rate": 2.1925024162842978e-06, + "loss": 0.0001, + "num_input_tokens_seen": 68487376, + "step": 118050 + }, + { + "epoch": 17.58340780458743, + "grad_norm": 0.0011266600340604782, + "learning_rate": 2.1911718969183535e-06, + "loss": 0.0, + "num_input_tokens_seen": 68490544, + "step": 118055 + }, + { + "epoch": 17.584152517128388, + "grad_norm": 6.525914068333805e-05, + "learning_rate": 2.189841762884376e-06, + "loss": 0.0, + "num_input_tokens_seen": 68493232, + "step": 118060 + }, + { + "epoch": 17.584897229669348, + "grad_norm": 1.3553252756537404e-05, + "learning_rate": 2.188512014204841e-06, + "loss": 0.0, + "num_input_tokens_seen": 68495728, + "step": 118065 + }, + { + "epoch": 17.585641942210305, + "grad_norm": 1.2859472008130979e-05, + "learning_rate": 2.1871826509022086e-06, + "loss": 0.0, + "num_input_tokens_seen": 68498544, + "step": 118070 + }, + { + "epoch": 17.586386654751266, + "grad_norm": 2.1933712559984997e-05, + "learning_rate": 2.1858536729989388e-06, + "loss": 0.0, + "num_input_tokens_seen": 68501456, + "step": 118075 + }, + { + "epoch": 17.587131367292226, + "grad_norm": 1.2149977010267321e-05, + "learning_rate": 2.1845250805174854e-06, + "loss": 0.0, + "num_input_tokens_seen": 68504176, + "step": 118080 + }, + { + "epoch": 17.587876079833183, + "grad_norm": 5.279106972011505e-06, + "learning_rate": 2.183196873480295e-06, + "loss": 0.0, + "num_input_tokens_seen": 68507024, + "step": 118085 + }, + { + "epoch": 17.588620792374144, + "grad_norm": 0.0024097422137856483, + "learning_rate": 2.1818690519098018e-06, + "loss": 0.0, + "num_input_tokens_seen": 68510096, + "step": 118090 + }, + { + "epoch": 17.589365504915104, + "grad_norm": 5.8994643040932715e-05, + "learning_rate": 2.1805416158284355e-06, + "loss": 0.0, + "num_input_tokens_seen": 68513168, + "step": 118095 + }, + { + "epoch": 17.59011021745606, + "grad_norm": 8.221957978093997e-05, + "learning_rate": 2.1792145652586305e-06, + "loss": 0.0, + "num_input_tokens_seen": 68516016, + "step": 118100 + }, + { + "epoch": 17.590854929997022, + "grad_norm": 9.675711226009298e-06, + "learning_rate": 2.177887900222797e-06, + "loss": 0.0, + "num_input_tokens_seen": 68519152, + "step": 118105 + }, + { + "epoch": 17.59159964253798, + "grad_norm": 9.718986802909058e-06, + "learning_rate": 2.176561620743356e-06, + "loss": 0.0, + "num_input_tokens_seen": 68522192, + "step": 118110 + }, + { + "epoch": 17.59234435507894, + "grad_norm": 0.0002256254811072722, + "learning_rate": 2.1752357268427086e-06, + "loss": 0.0, + "num_input_tokens_seen": 68525168, + "step": 118115 + }, + { + "epoch": 17.5930890676199, + "grad_norm": 3.374423386048875e-06, + "learning_rate": 2.173910218543254e-06, + "loss": 0.0, + "num_input_tokens_seen": 68528240, + "step": 118120 + }, + { + "epoch": 17.593833780160857, + "grad_norm": 8.438350778305903e-05, + "learning_rate": 2.172585095867391e-06, + "loss": 0.0, + "num_input_tokens_seen": 68530896, + "step": 118125 + }, + { + "epoch": 17.594578492701817, + "grad_norm": 2.5064509827643633e-06, + "learning_rate": 2.1712603588374956e-06, + "loss": 0.0, + "num_input_tokens_seen": 68533936, + "step": 118130 + }, + { + "epoch": 17.595323205242778, + "grad_norm": 1.6715745005058125e-05, + "learning_rate": 2.169936007475959e-06, + "loss": 0.0, + "num_input_tokens_seen": 68536848, + "step": 118135 + }, + { + "epoch": 17.596067917783735, + "grad_norm": 5.382896415540017e-05, + "learning_rate": 2.1686120418051457e-06, + "loss": 0.0, + "num_input_tokens_seen": 68539824, + "step": 118140 + }, + { + "epoch": 17.596812630324695, + "grad_norm": 9.308874723501503e-06, + "learning_rate": 2.167288461847433e-06, + "loss": 0.0, + "num_input_tokens_seen": 68542800, + "step": 118145 + }, + { + "epoch": 17.597557342865652, + "grad_norm": 5.960137423244305e-05, + "learning_rate": 2.165965267625175e-06, + "loss": 0.0, + "num_input_tokens_seen": 68545904, + "step": 118150 + }, + { + "epoch": 17.598302055406613, + "grad_norm": 3.565820952644572e-05, + "learning_rate": 2.164642459160726e-06, + "loss": 0.0, + "num_input_tokens_seen": 68548720, + "step": 118155 + }, + { + "epoch": 17.599046767947573, + "grad_norm": 0.0002453602210152894, + "learning_rate": 2.163320036476435e-06, + "loss": 0.0, + "num_input_tokens_seen": 68551696, + "step": 118160 + }, + { + "epoch": 17.59979148048853, + "grad_norm": 1.0776469935080968e-05, + "learning_rate": 2.1619979995946365e-06, + "loss": 0.0, + "num_input_tokens_seen": 68554896, + "step": 118165 + }, + { + "epoch": 17.60053619302949, + "grad_norm": 3.7841284211026505e-05, + "learning_rate": 2.160676348537674e-06, + "loss": 0.0, + "num_input_tokens_seen": 68557904, + "step": 118170 + }, + { + "epoch": 17.601280905570448, + "grad_norm": 6.6466918724472634e-06, + "learning_rate": 2.159355083327866e-06, + "loss": 0.0, + "num_input_tokens_seen": 68560656, + "step": 118175 + }, + { + "epoch": 17.60202561811141, + "grad_norm": 3.111294518021168e-06, + "learning_rate": 2.158034203987547e-06, + "loss": 0.0, + "num_input_tokens_seen": 68563408, + "step": 118180 + }, + { + "epoch": 17.60277033065237, + "grad_norm": 0.0001750214141793549, + "learning_rate": 2.1567137105390183e-06, + "loss": 0.0, + "num_input_tokens_seen": 68566416, + "step": 118185 + }, + { + "epoch": 17.603515043193326, + "grad_norm": 0.0006719452212564647, + "learning_rate": 2.155393603004602e-06, + "loss": 0.0, + "num_input_tokens_seen": 68569232, + "step": 118190 + }, + { + "epoch": 17.604259755734287, + "grad_norm": 5.555686129810056e-06, + "learning_rate": 2.15407388140659e-06, + "loss": 0.0, + "num_input_tokens_seen": 68571888, + "step": 118195 + }, + { + "epoch": 17.605004468275247, + "grad_norm": 3.426371768000536e-05, + "learning_rate": 2.1527545457672743e-06, + "loss": 0.0, + "num_input_tokens_seen": 68574960, + "step": 118200 + }, + { + "epoch": 17.605749180816204, + "grad_norm": 4.494592940318398e-05, + "learning_rate": 2.1514355961089583e-06, + "loss": 0.0, + "num_input_tokens_seen": 68577968, + "step": 118205 + }, + { + "epoch": 17.606493893357165, + "grad_norm": 4.94719461130444e-06, + "learning_rate": 2.1501170324539134e-06, + "loss": 0.0, + "num_input_tokens_seen": 68580720, + "step": 118210 + }, + { + "epoch": 17.60723860589812, + "grad_norm": 7.805275345162954e-06, + "learning_rate": 2.1487988548244133e-06, + "loss": 0.0, + "num_input_tokens_seen": 68583664, + "step": 118215 + }, + { + "epoch": 17.607983318439082, + "grad_norm": 0.005124448798596859, + "learning_rate": 2.1474810632427344e-06, + "loss": 0.0, + "num_input_tokens_seen": 68586640, + "step": 118220 + }, + { + "epoch": 17.608728030980043, + "grad_norm": 4.121048732486088e-06, + "learning_rate": 2.1461636577311373e-06, + "loss": 0.0, + "num_input_tokens_seen": 68589520, + "step": 118225 + }, + { + "epoch": 17.609472743521, + "grad_norm": 3.4068027161993086e-05, + "learning_rate": 2.1448466383118786e-06, + "loss": 0.0, + "num_input_tokens_seen": 68592272, + "step": 118230 + }, + { + "epoch": 17.61021745606196, + "grad_norm": 0.0018797846278175712, + "learning_rate": 2.143530005007202e-06, + "loss": 0.0, + "num_input_tokens_seen": 68595312, + "step": 118235 + }, + { + "epoch": 17.61096216860292, + "grad_norm": 5.262968898023246e-06, + "learning_rate": 2.142213757839362e-06, + "loss": 0.0, + "num_input_tokens_seen": 68598032, + "step": 118240 + }, + { + "epoch": 17.611706881143878, + "grad_norm": 2.895861143770162e-06, + "learning_rate": 2.140897896830582e-06, + "loss": 0.0, + "num_input_tokens_seen": 68601232, + "step": 118245 + }, + { + "epoch": 17.61245159368484, + "grad_norm": 3.2357331747334683e-06, + "learning_rate": 2.139582422003106e-06, + "loss": 0.0, + "num_input_tokens_seen": 68604272, + "step": 118250 + }, + { + "epoch": 17.613196306225795, + "grad_norm": 3.513890987960622e-05, + "learning_rate": 2.138267333379146e-06, + "loss": 0.0, + "num_input_tokens_seen": 68607440, + "step": 118255 + }, + { + "epoch": 17.613941018766756, + "grad_norm": 6.6633015194383916e-06, + "learning_rate": 2.1369526309809263e-06, + "loss": 0.0004, + "num_input_tokens_seen": 68610288, + "step": 118260 + }, + { + "epoch": 17.614685731307716, + "grad_norm": 5.971370228508022e-06, + "learning_rate": 2.1356383148306515e-06, + "loss": 0.0, + "num_input_tokens_seen": 68612816, + "step": 118265 + }, + { + "epoch": 17.615430443848673, + "grad_norm": 2.2690423975291196e-06, + "learning_rate": 2.134324384950531e-06, + "loss": 0.0, + "num_input_tokens_seen": 68615632, + "step": 118270 + }, + { + "epoch": 17.616175156389634, + "grad_norm": 3.912460670107976e-06, + "learning_rate": 2.1330108413627647e-06, + "loss": 0.0, + "num_input_tokens_seen": 68618416, + "step": 118275 + }, + { + "epoch": 17.616919868930594, + "grad_norm": 2.4483272227371344e-06, + "learning_rate": 2.131697684089537e-06, + "loss": 0.0, + "num_input_tokens_seen": 68621456, + "step": 118280 + }, + { + "epoch": 17.61766458147155, + "grad_norm": 3.2041893973655533e-06, + "learning_rate": 2.1303849131530357e-06, + "loss": 0.0, + "num_input_tokens_seen": 68624368, + "step": 118285 + }, + { + "epoch": 17.618409294012512, + "grad_norm": 0.00024382746778428555, + "learning_rate": 2.129072528575432e-06, + "loss": 0.0, + "num_input_tokens_seen": 68627216, + "step": 118290 + }, + { + "epoch": 17.61915400655347, + "grad_norm": 1.0016434316639788e-05, + "learning_rate": 2.1277605303789057e-06, + "loss": 0.0, + "num_input_tokens_seen": 68629936, + "step": 118295 + }, + { + "epoch": 17.61989871909443, + "grad_norm": 5.089012483949773e-05, + "learning_rate": 2.126448918585619e-06, + "loss": 0.0, + "num_input_tokens_seen": 68633072, + "step": 118300 + }, + { + "epoch": 17.62064343163539, + "grad_norm": 4.8153601710509975e-06, + "learning_rate": 2.12513769321773e-06, + "loss": 0.0, + "num_input_tokens_seen": 68635824, + "step": 118305 + }, + { + "epoch": 17.621388144176347, + "grad_norm": 9.349182073492557e-05, + "learning_rate": 2.123826854297395e-06, + "loss": 0.0, + "num_input_tokens_seen": 68638672, + "step": 118310 + }, + { + "epoch": 17.622132856717307, + "grad_norm": 3.529622517817188e-06, + "learning_rate": 2.1225164018467468e-06, + "loss": 0.0, + "num_input_tokens_seen": 68641648, + "step": 118315 + }, + { + "epoch": 17.622877569258268, + "grad_norm": 5.617386250378331e-06, + "learning_rate": 2.1212063358879374e-06, + "loss": 0.0, + "num_input_tokens_seen": 68644304, + "step": 118320 + }, + { + "epoch": 17.623622281799225, + "grad_norm": 1.6214955394389108e-05, + "learning_rate": 2.1198966564430935e-06, + "loss": 0.0056, + "num_input_tokens_seen": 68647088, + "step": 118325 + }, + { + "epoch": 17.624366994340185, + "grad_norm": 4.549784534901846e-06, + "learning_rate": 2.1185873635343413e-06, + "loss": 0.0, + "num_input_tokens_seen": 68650000, + "step": 118330 + }, + { + "epoch": 17.625111706881142, + "grad_norm": 1.3522461813408881e-05, + "learning_rate": 2.1172784571837965e-06, + "loss": 0.0, + "num_input_tokens_seen": 68652816, + "step": 118335 + }, + { + "epoch": 17.625856419422103, + "grad_norm": 0.00013797271822113544, + "learning_rate": 2.115969937413581e-06, + "loss": 0.0045, + "num_input_tokens_seen": 68655632, + "step": 118340 + }, + { + "epoch": 17.626601131963064, + "grad_norm": 9.457986379857175e-06, + "learning_rate": 2.1146618042457935e-06, + "loss": 0.0, + "num_input_tokens_seen": 68658672, + "step": 118345 + }, + { + "epoch": 17.62734584450402, + "grad_norm": 0.0003422972222324461, + "learning_rate": 2.113354057702535e-06, + "loss": 0.0, + "num_input_tokens_seen": 68661776, + "step": 118350 + }, + { + "epoch": 17.62809055704498, + "grad_norm": 1.3317157936398871e-05, + "learning_rate": 2.1120466978059e-06, + "loss": 0.0, + "num_input_tokens_seen": 68664464, + "step": 118355 + }, + { + "epoch": 17.62883526958594, + "grad_norm": 8.901819455786608e-06, + "learning_rate": 2.1107397245779705e-06, + "loss": 0.0, + "num_input_tokens_seen": 68667472, + "step": 118360 + }, + { + "epoch": 17.6295799821269, + "grad_norm": 3.117535015917383e-06, + "learning_rate": 2.109433138040834e-06, + "loss": 0.0, + "num_input_tokens_seen": 68670160, + "step": 118365 + }, + { + "epoch": 17.63032469466786, + "grad_norm": 1.0080157153424807e-05, + "learning_rate": 2.1081269382165536e-06, + "loss": 0.0, + "num_input_tokens_seen": 68673072, + "step": 118370 + }, + { + "epoch": 17.631069407208816, + "grad_norm": 3.4950915051013e-06, + "learning_rate": 2.1068211251272063e-06, + "loss": 0.0, + "num_input_tokens_seen": 68676016, + "step": 118375 + }, + { + "epoch": 17.631814119749777, + "grad_norm": 0.0036803847178816795, + "learning_rate": 2.105515698794844e-06, + "loss": 0.0, + "num_input_tokens_seen": 68678896, + "step": 118380 + }, + { + "epoch": 17.632558832290737, + "grad_norm": 0.0007203866844065487, + "learning_rate": 2.104210659241532e-06, + "loss": 0.0, + "num_input_tokens_seen": 68681744, + "step": 118385 + }, + { + "epoch": 17.633303544831694, + "grad_norm": 7.205427664302988e-06, + "learning_rate": 2.102906006489308e-06, + "loss": 0.0, + "num_input_tokens_seen": 68684240, + "step": 118390 + }, + { + "epoch": 17.634048257372655, + "grad_norm": 6.516572466352955e-05, + "learning_rate": 2.1016017405602135e-06, + "loss": 0.0, + "num_input_tokens_seen": 68687184, + "step": 118395 + }, + { + "epoch": 17.63479296991361, + "grad_norm": 3.1193007998808753e-06, + "learning_rate": 2.100297861476286e-06, + "loss": 0.0, + "num_input_tokens_seen": 68689680, + "step": 118400 + }, + { + "epoch": 17.635537682454572, + "grad_norm": 3.641552393673919e-05, + "learning_rate": 2.0989943692595495e-06, + "loss": 0.0, + "num_input_tokens_seen": 68692368, + "step": 118405 + }, + { + "epoch": 17.636282394995533, + "grad_norm": 3.5655073588714004e-05, + "learning_rate": 2.0976912639320336e-06, + "loss": 0.0, + "num_input_tokens_seen": 68695152, + "step": 118410 + }, + { + "epoch": 17.63702710753649, + "grad_norm": 6.099064194131643e-06, + "learning_rate": 2.096388545515743e-06, + "loss": 0.0, + "num_input_tokens_seen": 68697840, + "step": 118415 + }, + { + "epoch": 17.63777182007745, + "grad_norm": 2.9813600121997297e-06, + "learning_rate": 2.095086214032693e-06, + "loss": 0.0, + "num_input_tokens_seen": 68700912, + "step": 118420 + }, + { + "epoch": 17.63851653261841, + "grad_norm": 1.338307174592046e-05, + "learning_rate": 2.0937842695048754e-06, + "loss": 0.0, + "num_input_tokens_seen": 68703664, + "step": 118425 + }, + { + "epoch": 17.639261245159368, + "grad_norm": 3.3244364203710575e-06, + "learning_rate": 2.0924827119542965e-06, + "loss": 0.0, + "num_input_tokens_seen": 68706704, + "step": 118430 + }, + { + "epoch": 17.64000595770033, + "grad_norm": 6.569985271198675e-05, + "learning_rate": 2.0911815414029423e-06, + "loss": 0.0, + "num_input_tokens_seen": 68709456, + "step": 118435 + }, + { + "epoch": 17.640750670241285, + "grad_norm": 1.6601534298388287e-05, + "learning_rate": 2.089880757872786e-06, + "loss": 0.0, + "num_input_tokens_seen": 68712176, + "step": 118440 + }, + { + "epoch": 17.641495382782246, + "grad_norm": 3.242658931412734e-05, + "learning_rate": 2.088580361385814e-06, + "loss": 0.0, + "num_input_tokens_seen": 68714992, + "step": 118445 + }, + { + "epoch": 17.642240095323206, + "grad_norm": 1.5026987966848537e-05, + "learning_rate": 2.0872803519639856e-06, + "loss": 0.0, + "num_input_tokens_seen": 68717936, + "step": 118450 + }, + { + "epoch": 17.642984807864163, + "grad_norm": 3.1810125165065983e-06, + "learning_rate": 2.085980729629275e-06, + "loss": 0.0, + "num_input_tokens_seen": 68720688, + "step": 118455 + }, + { + "epoch": 17.643729520405124, + "grad_norm": 8.621505548944697e-05, + "learning_rate": 2.084681494403623e-06, + "loss": 0.0, + "num_input_tokens_seen": 68723600, + "step": 118460 + }, + { + "epoch": 17.644474232946084, + "grad_norm": 8.6495265350095e-06, + "learning_rate": 2.0833826463089957e-06, + "loss": 0.0, + "num_input_tokens_seen": 68726416, + "step": 118465 + }, + { + "epoch": 17.64521894548704, + "grad_norm": 8.681976396474056e-06, + "learning_rate": 2.0820841853673252e-06, + "loss": 0.0, + "num_input_tokens_seen": 68729072, + "step": 118470 + }, + { + "epoch": 17.645963658028002, + "grad_norm": 1.8354436406298191e-06, + "learning_rate": 2.080786111600544e-06, + "loss": 0.0, + "num_input_tokens_seen": 68731920, + "step": 118475 + }, + { + "epoch": 17.64670837056896, + "grad_norm": 6.491688964160858e-06, + "learning_rate": 2.079488425030593e-06, + "loss": 0.0, + "num_input_tokens_seen": 68734992, + "step": 118480 + }, + { + "epoch": 17.64745308310992, + "grad_norm": 1.8123273548553698e-05, + "learning_rate": 2.0781911256793905e-06, + "loss": 0.0, + "num_input_tokens_seen": 68738128, + "step": 118485 + }, + { + "epoch": 17.64819779565088, + "grad_norm": 5.769736162619665e-06, + "learning_rate": 2.076894213568853e-06, + "loss": 0.0, + "num_input_tokens_seen": 68740944, + "step": 118490 + }, + { + "epoch": 17.648942508191837, + "grad_norm": 1.1469188393675722e-05, + "learning_rate": 2.0755976887208815e-06, + "loss": 0.0, + "num_input_tokens_seen": 68743728, + "step": 118495 + }, + { + "epoch": 17.649687220732797, + "grad_norm": 7.556452146673109e-06, + "learning_rate": 2.0743015511573952e-06, + "loss": 0.0012, + "num_input_tokens_seen": 68746928, + "step": 118500 + }, + { + "epoch": 17.650431933273758, + "grad_norm": 1.7059304809663445e-05, + "learning_rate": 2.0730058009002793e-06, + "loss": 0.0002, + "num_input_tokens_seen": 68749840, + "step": 118505 + }, + { + "epoch": 17.651176645814715, + "grad_norm": 5.314570444170386e-06, + "learning_rate": 2.0717104379714304e-06, + "loss": 0.0, + "num_input_tokens_seen": 68752784, + "step": 118510 + }, + { + "epoch": 17.651921358355676, + "grad_norm": 6.635325462411856e-06, + "learning_rate": 2.070415462392733e-06, + "loss": 0.0, + "num_input_tokens_seen": 68755792, + "step": 118515 + }, + { + "epoch": 17.652666070896633, + "grad_norm": 5.644327302434249e-06, + "learning_rate": 2.0691208741860562e-06, + "loss": 0.0, + "num_input_tokens_seen": 68759024, + "step": 118520 + }, + { + "epoch": 17.653410783437593, + "grad_norm": 2.6118957975995727e-05, + "learning_rate": 2.06782667337328e-06, + "loss": 0.0, + "num_input_tokens_seen": 68762032, + "step": 118525 + }, + { + "epoch": 17.654155495978554, + "grad_norm": 1.3925183338869829e-05, + "learning_rate": 2.0665328599762613e-06, + "loss": 0.0, + "num_input_tokens_seen": 68764848, + "step": 118530 + }, + { + "epoch": 17.65490020851951, + "grad_norm": 2.9077659746690188e-06, + "learning_rate": 2.0652394340168664e-06, + "loss": 0.0, + "num_input_tokens_seen": 68767856, + "step": 118535 + }, + { + "epoch": 17.65564492106047, + "grad_norm": 8.428667570115067e-06, + "learning_rate": 2.0639463955169414e-06, + "loss": 0.0, + "num_input_tokens_seen": 68770608, + "step": 118540 + }, + { + "epoch": 17.656389633601428, + "grad_norm": 4.261359663360054e-06, + "learning_rate": 2.0626537444983274e-06, + "loss": 0.0, + "num_input_tokens_seen": 68773584, + "step": 118545 + }, + { + "epoch": 17.65713434614239, + "grad_norm": 433.9204406738281, + "learning_rate": 2.061361480982868e-06, + "loss": 0.0425, + "num_input_tokens_seen": 68776336, + "step": 118550 + }, + { + "epoch": 17.65787905868335, + "grad_norm": 6.9079146669537295e-06, + "learning_rate": 2.060069604992393e-06, + "loss": 0.0, + "num_input_tokens_seen": 68779472, + "step": 118555 + }, + { + "epoch": 17.658623771224306, + "grad_norm": 1.3401660908129998e-05, + "learning_rate": 2.058778116548729e-06, + "loss": 0.0, + "num_input_tokens_seen": 68782320, + "step": 118560 + }, + { + "epoch": 17.659368483765267, + "grad_norm": 2.2712722056894563e-05, + "learning_rate": 2.057487015673684e-06, + "loss": 0.0, + "num_input_tokens_seen": 68785232, + "step": 118565 + }, + { + "epoch": 17.660113196306227, + "grad_norm": 6.893998488521902e-06, + "learning_rate": 2.0561963023890853e-06, + "loss": 0.0, + "num_input_tokens_seen": 68787792, + "step": 118570 + }, + { + "epoch": 17.660857908847184, + "grad_norm": 0.007939211092889309, + "learning_rate": 2.0549059767167255e-06, + "loss": 0.0, + "num_input_tokens_seen": 68790864, + "step": 118575 + }, + { + "epoch": 17.661602621388145, + "grad_norm": 2.093988223350607e-05, + "learning_rate": 2.053616038678416e-06, + "loss": 0.0, + "num_input_tokens_seen": 68793744, + "step": 118580 + }, + { + "epoch": 17.6623473339291, + "grad_norm": 0.35445040464401245, + "learning_rate": 2.0523264882959357e-06, + "loss": 0.0008, + "num_input_tokens_seen": 68796560, + "step": 118585 + }, + { + "epoch": 17.663092046470062, + "grad_norm": 3.189592098351568e-05, + "learning_rate": 2.051037325591079e-06, + "loss": 0.0, + "num_input_tokens_seen": 68799440, + "step": 118590 + }, + { + "epoch": 17.663836759011023, + "grad_norm": 1.2738737495965324e-05, + "learning_rate": 2.0497485505856256e-06, + "loss": 0.0, + "num_input_tokens_seen": 68802256, + "step": 118595 + }, + { + "epoch": 17.66458147155198, + "grad_norm": 0.00025619074585847557, + "learning_rate": 2.0484601633013383e-06, + "loss": 0.0, + "num_input_tokens_seen": 68805424, + "step": 118600 + }, + { + "epoch": 17.66532618409294, + "grad_norm": 1.6696571037755348e-05, + "learning_rate": 2.0471721637599944e-06, + "loss": 0.0, + "num_input_tokens_seen": 68808304, + "step": 118605 + }, + { + "epoch": 17.6660708966339, + "grad_norm": 5.568698270508321e-06, + "learning_rate": 2.0458845519833487e-06, + "loss": 0.0, + "num_input_tokens_seen": 68811312, + "step": 118610 + }, + { + "epoch": 17.666815609174858, + "grad_norm": 1.5929350411170162e-05, + "learning_rate": 2.044597327993153e-06, + "loss": 0.0, + "num_input_tokens_seen": 68814096, + "step": 118615 + }, + { + "epoch": 17.66756032171582, + "grad_norm": 0.0006028783973306417, + "learning_rate": 2.0433104918111514e-06, + "loss": 0.0, + "num_input_tokens_seen": 68816816, + "step": 118620 + }, + { + "epoch": 17.668305034256775, + "grad_norm": 0.0012219126801937819, + "learning_rate": 2.0420240434590925e-06, + "loss": 0.0, + "num_input_tokens_seen": 68819696, + "step": 118625 + }, + { + "epoch": 17.669049746797736, + "grad_norm": 0.002414203714579344, + "learning_rate": 2.0407379829587013e-06, + "loss": 0.0, + "num_input_tokens_seen": 68822576, + "step": 118630 + }, + { + "epoch": 17.669794459338696, + "grad_norm": 2.7164419407199603e-06, + "learning_rate": 2.039452310331705e-06, + "loss": 0.0, + "num_input_tokens_seen": 68825296, + "step": 118635 + }, + { + "epoch": 17.670539171879653, + "grad_norm": 3.826856118394062e-05, + "learning_rate": 2.0381670255998297e-06, + "loss": 0.0, + "num_input_tokens_seen": 68828432, + "step": 118640 + }, + { + "epoch": 17.671283884420614, + "grad_norm": 2.3165857783169486e-06, + "learning_rate": 2.0368821287847785e-06, + "loss": 0.0, + "num_input_tokens_seen": 68831088, + "step": 118645 + }, + { + "epoch": 17.672028596961574, + "grad_norm": 2.7279482310405e-05, + "learning_rate": 2.035597619908272e-06, + "loss": 0.0, + "num_input_tokens_seen": 68833904, + "step": 118650 + }, + { + "epoch": 17.67277330950253, + "grad_norm": 1.6034777218010277e-05, + "learning_rate": 2.0343134989919995e-06, + "loss": 0.0, + "num_input_tokens_seen": 68836784, + "step": 118655 + }, + { + "epoch": 17.673518022043492, + "grad_norm": 4.267268013791181e-05, + "learning_rate": 2.033029766057662e-06, + "loss": 0.0, + "num_input_tokens_seen": 68839600, + "step": 118660 + }, + { + "epoch": 17.67426273458445, + "grad_norm": 0.0008175554685294628, + "learning_rate": 2.03174642112694e-06, + "loss": 0.0, + "num_input_tokens_seen": 68842160, + "step": 118665 + }, + { + "epoch": 17.67500744712541, + "grad_norm": 1.4633922546636313e-05, + "learning_rate": 2.0304634642215215e-06, + "loss": 0.0, + "num_input_tokens_seen": 68845488, + "step": 118670 + }, + { + "epoch": 17.67575215966637, + "grad_norm": 0.00023814148153178394, + "learning_rate": 2.029180895363081e-06, + "loss": 0.0, + "num_input_tokens_seen": 68848560, + "step": 118675 + }, + { + "epoch": 17.676496872207327, + "grad_norm": 4.845385774387978e-05, + "learning_rate": 2.0278987145732786e-06, + "loss": 0.0, + "num_input_tokens_seen": 68851408, + "step": 118680 + }, + { + "epoch": 17.677241584748288, + "grad_norm": 2.7218623017688515e-06, + "learning_rate": 2.0266169218737836e-06, + "loss": 0.0, + "num_input_tokens_seen": 68854000, + "step": 118685 + }, + { + "epoch": 17.677986297289245, + "grad_norm": 5.541827704291791e-06, + "learning_rate": 2.0253355172862394e-06, + "loss": 0.1159, + "num_input_tokens_seen": 68856848, + "step": 118690 + }, + { + "epoch": 17.678731009830205, + "grad_norm": 0.0032562450505793095, + "learning_rate": 2.0240545008323064e-06, + "loss": 0.0, + "num_input_tokens_seen": 68859920, + "step": 118695 + }, + { + "epoch": 17.679475722371166, + "grad_norm": 4.139193606533809e-06, + "learning_rate": 2.0227738725336176e-06, + "loss": 0.0, + "num_input_tokens_seen": 68863248, + "step": 118700 + }, + { + "epoch": 17.680220434912123, + "grad_norm": 1.749915463733487e-05, + "learning_rate": 2.0214936324118137e-06, + "loss": 0.0, + "num_input_tokens_seen": 68865872, + "step": 118705 + }, + { + "epoch": 17.680965147453083, + "grad_norm": 7.499827006540727e-06, + "learning_rate": 2.0202137804885196e-06, + "loss": 0.0, + "num_input_tokens_seen": 68868976, + "step": 118710 + }, + { + "epoch": 17.681709859994044, + "grad_norm": 1.99421319848625e-05, + "learning_rate": 2.018934316785359e-06, + "loss": 0.0, + "num_input_tokens_seen": 68871952, + "step": 118715 + }, + { + "epoch": 17.682454572535, + "grad_norm": 1.3022999155509751e-05, + "learning_rate": 2.017655241323946e-06, + "loss": 0.0, + "num_input_tokens_seen": 68874832, + "step": 118720 + }, + { + "epoch": 17.68319928507596, + "grad_norm": 0.0025871379766613245, + "learning_rate": 2.016376554125887e-06, + "loss": 0.0, + "num_input_tokens_seen": 68877648, + "step": 118725 + }, + { + "epoch": 17.683943997616918, + "grad_norm": 1.7695747374091297e-06, + "learning_rate": 2.0150982552127913e-06, + "loss": 0.0, + "num_input_tokens_seen": 68880560, + "step": 118730 + }, + { + "epoch": 17.68468871015788, + "grad_norm": 4.464576250029495e-06, + "learning_rate": 2.0138203446062433e-06, + "loss": 0.0, + "num_input_tokens_seen": 68883280, + "step": 118735 + }, + { + "epoch": 17.68543342269884, + "grad_norm": 5.133550803293474e-05, + "learning_rate": 2.0125428223278453e-06, + "loss": 0.0, + "num_input_tokens_seen": 68886128, + "step": 118740 + }, + { + "epoch": 17.686178135239796, + "grad_norm": 1.1987546713498887e-05, + "learning_rate": 2.011265688399172e-06, + "loss": 0.0, + "num_input_tokens_seen": 68888944, + "step": 118745 + }, + { + "epoch": 17.686922847780757, + "grad_norm": 9.166375093627721e-06, + "learning_rate": 2.009988942841798e-06, + "loss": 0.0, + "num_input_tokens_seen": 68892112, + "step": 118750 + }, + { + "epoch": 17.687667560321717, + "grad_norm": 7.3041446739807725e-06, + "learning_rate": 2.008712585677297e-06, + "loss": 0.0, + "num_input_tokens_seen": 68894928, + "step": 118755 + }, + { + "epoch": 17.688412272862674, + "grad_norm": 0.0003165742673445493, + "learning_rate": 2.007436616927225e-06, + "loss": 0.0, + "num_input_tokens_seen": 68898032, + "step": 118760 + }, + { + "epoch": 17.689156985403635, + "grad_norm": 5.676859473169316e-06, + "learning_rate": 2.006161036613147e-06, + "loss": 0.0, + "num_input_tokens_seen": 68901136, + "step": 118765 + }, + { + "epoch": 17.68990169794459, + "grad_norm": 6.607187970075756e-05, + "learning_rate": 2.0048858447566045e-06, + "loss": 0.0004, + "num_input_tokens_seen": 68903664, + "step": 118770 + }, + { + "epoch": 17.690646410485552, + "grad_norm": 2.505789598217234e-05, + "learning_rate": 2.003611041379147e-06, + "loss": 0.0, + "num_input_tokens_seen": 68906544, + "step": 118775 + }, + { + "epoch": 17.691391123026513, + "grad_norm": 4.305778475099942e-06, + "learning_rate": 2.0023366265023074e-06, + "loss": 0.0, + "num_input_tokens_seen": 68909744, + "step": 118780 + }, + { + "epoch": 17.69213583556747, + "grad_norm": 1.2699938451987691e-05, + "learning_rate": 2.0010626001476184e-06, + "loss": 0.0, + "num_input_tokens_seen": 68912688, + "step": 118785 + }, + { + "epoch": 17.69288054810843, + "grad_norm": 3.872272372973384e-06, + "learning_rate": 1.999788962336599e-06, + "loss": 0.0247, + "num_input_tokens_seen": 68915472, + "step": 118790 + }, + { + "epoch": 17.69362526064939, + "grad_norm": 3.709177690325305e-05, + "learning_rate": 1.9985157130907707e-06, + "loss": 0.0, + "num_input_tokens_seen": 68918416, + "step": 118795 + }, + { + "epoch": 17.694369973190348, + "grad_norm": 1.4446705790760461e-05, + "learning_rate": 1.997242852431644e-06, + "loss": 0.0, + "num_input_tokens_seen": 68921296, + "step": 118800 + }, + { + "epoch": 17.69511468573131, + "grad_norm": 4.967148470313987e-06, + "learning_rate": 1.9959703803807156e-06, + "loss": 0.0, + "num_input_tokens_seen": 68924208, + "step": 118805 + }, + { + "epoch": 17.695859398272265, + "grad_norm": 0.0025877910666167736, + "learning_rate": 1.994698296959491e-06, + "loss": 0.0, + "num_input_tokens_seen": 68927312, + "step": 118810 + }, + { + "epoch": 17.696604110813226, + "grad_norm": 2.1156854927539825e-05, + "learning_rate": 1.9934266021894575e-06, + "loss": 0.0, + "num_input_tokens_seen": 68930384, + "step": 118815 + }, + { + "epoch": 17.697348823354186, + "grad_norm": 1.2157965102232993e-05, + "learning_rate": 1.9921552960920994e-06, + "loss": 0.0, + "num_input_tokens_seen": 68933392, + "step": 118820 + }, + { + "epoch": 17.698093535895143, + "grad_norm": 0.017977531999349594, + "learning_rate": 1.990884378688887e-06, + "loss": 0.0, + "num_input_tokens_seen": 68936464, + "step": 118825 + }, + { + "epoch": 17.698838248436104, + "grad_norm": 3.5050852602580562e-06, + "learning_rate": 1.989613850001304e-06, + "loss": 0.0, + "num_input_tokens_seen": 68939248, + "step": 118830 + }, + { + "epoch": 17.699582960977065, + "grad_norm": 2.6415809770696796e-05, + "learning_rate": 1.988343710050808e-06, + "loss": 0.0, + "num_input_tokens_seen": 68942416, + "step": 118835 + }, + { + "epoch": 17.70032767351802, + "grad_norm": 1.2340615285211243e-05, + "learning_rate": 1.987073958858851e-06, + "loss": 0.0, + "num_input_tokens_seen": 68945392, + "step": 118840 + }, + { + "epoch": 17.701072386058982, + "grad_norm": 1.9719942429219373e-05, + "learning_rate": 1.985804596446897e-06, + "loss": 0.0, + "num_input_tokens_seen": 68948272, + "step": 118845 + }, + { + "epoch": 17.70181709859994, + "grad_norm": 2.2055778572394047e-06, + "learning_rate": 1.984535622836378e-06, + "loss": 0.0, + "num_input_tokens_seen": 68951024, + "step": 118850 + }, + { + "epoch": 17.7025618111409, + "grad_norm": 4.408863787830342e-06, + "learning_rate": 1.983267038048742e-06, + "loss": 0.0, + "num_input_tokens_seen": 68954096, + "step": 118855 + }, + { + "epoch": 17.70330652368186, + "grad_norm": 3.875608854286838e-06, + "learning_rate": 1.981998842105412e-06, + "loss": 0.0, + "num_input_tokens_seen": 68957136, + "step": 118860 + }, + { + "epoch": 17.704051236222817, + "grad_norm": 3.09811935039761e-06, + "learning_rate": 1.980731035027822e-06, + "loss": 0.0, + "num_input_tokens_seen": 68959792, + "step": 118865 + }, + { + "epoch": 17.704795948763778, + "grad_norm": 8.316378625750076e-06, + "learning_rate": 1.979463616837385e-06, + "loss": 0.0, + "num_input_tokens_seen": 68962768, + "step": 118870 + }, + { + "epoch": 17.705540661304738, + "grad_norm": 0.0015277289785444736, + "learning_rate": 1.9781965875555087e-06, + "loss": 0.0, + "num_input_tokens_seen": 68965648, + "step": 118875 + }, + { + "epoch": 17.706285373845695, + "grad_norm": 0.00039953869418241084, + "learning_rate": 1.976929947203607e-06, + "loss": 0.0, + "num_input_tokens_seen": 68968656, + "step": 118880 + }, + { + "epoch": 17.707030086386656, + "grad_norm": 4.5141282498661894e-06, + "learning_rate": 1.9756636958030733e-06, + "loss": 0.0, + "num_input_tokens_seen": 68971472, + "step": 118885 + }, + { + "epoch": 17.707774798927613, + "grad_norm": 1.8134273886971641e-06, + "learning_rate": 1.9743978333753023e-06, + "loss": 0.0, + "num_input_tokens_seen": 68975312, + "step": 118890 + }, + { + "epoch": 17.708519511468573, + "grad_norm": 0.00022679225367028266, + "learning_rate": 1.9731323599416736e-06, + "loss": 0.0, + "num_input_tokens_seen": 68978256, + "step": 118895 + }, + { + "epoch": 17.709264224009534, + "grad_norm": 6.382977881003171e-05, + "learning_rate": 1.9718672755235728e-06, + "loss": 0.0, + "num_input_tokens_seen": 68981040, + "step": 118900 + }, + { + "epoch": 17.71000893655049, + "grad_norm": 43.6408576965332, + "learning_rate": 1.9706025801423666e-06, + "loss": 0.0247, + "num_input_tokens_seen": 68984144, + "step": 118905 + }, + { + "epoch": 17.71075364909145, + "grad_norm": 4.038651240989566e-06, + "learning_rate": 1.969338273819429e-06, + "loss": 0.0, + "num_input_tokens_seen": 68986960, + "step": 118910 + }, + { + "epoch": 17.711498361632408, + "grad_norm": 0.0024215872399508953, + "learning_rate": 1.9680743565761107e-06, + "loss": 0.0, + "num_input_tokens_seen": 68989584, + "step": 118915 + }, + { + "epoch": 17.71224307417337, + "grad_norm": 0.00014681140601169318, + "learning_rate": 1.9668108284337654e-06, + "loss": 0.0, + "num_input_tokens_seen": 68992464, + "step": 118920 + }, + { + "epoch": 17.71298778671433, + "grad_norm": 4.162779532634886e-06, + "learning_rate": 1.9655476894137465e-06, + "loss": 0.0, + "num_input_tokens_seen": 68995184, + "step": 118925 + }, + { + "epoch": 17.713732499255286, + "grad_norm": 2.9491909572243458e-06, + "learning_rate": 1.9642849395373836e-06, + "loss": 0.0, + "num_input_tokens_seen": 68997936, + "step": 118930 + }, + { + "epoch": 17.714477211796247, + "grad_norm": 7.930894753371831e-06, + "learning_rate": 1.963022578826018e-06, + "loss": 0.0, + "num_input_tokens_seen": 69000752, + "step": 118935 + }, + { + "epoch": 17.715221924337207, + "grad_norm": 3.3289882139797555e-06, + "learning_rate": 1.961760607300972e-06, + "loss": 0.0, + "num_input_tokens_seen": 69003504, + "step": 118940 + }, + { + "epoch": 17.715966636878164, + "grad_norm": 3.5755871067522094e-05, + "learning_rate": 1.960499024983564e-06, + "loss": 0.0, + "num_input_tokens_seen": 69006384, + "step": 118945 + }, + { + "epoch": 17.716711349419125, + "grad_norm": 2.6442171474627685e-06, + "learning_rate": 1.9592378318951054e-06, + "loss": 0.0, + "num_input_tokens_seen": 69009232, + "step": 118950 + }, + { + "epoch": 17.717456061960082, + "grad_norm": 2.815673042277922e-06, + "learning_rate": 1.957977028056912e-06, + "loss": 0.0, + "num_input_tokens_seen": 69011856, + "step": 118955 + }, + { + "epoch": 17.718200774501042, + "grad_norm": 6.162531008158112e-06, + "learning_rate": 1.9567166134902752e-06, + "loss": 0.0, + "num_input_tokens_seen": 69014576, + "step": 118960 + }, + { + "epoch": 17.718945487042003, + "grad_norm": 1.3790935554425232e-05, + "learning_rate": 1.955456588216489e-06, + "loss": 0.0, + "num_input_tokens_seen": 69017296, + "step": 118965 + }, + { + "epoch": 17.71969019958296, + "grad_norm": 1.0106709851243068e-05, + "learning_rate": 1.9541969522568456e-06, + "loss": 0.0, + "num_input_tokens_seen": 69020176, + "step": 118970 + }, + { + "epoch": 17.72043491212392, + "grad_norm": 1.230402722285362e-05, + "learning_rate": 1.9529377056326183e-06, + "loss": 0.0, + "num_input_tokens_seen": 69022992, + "step": 118975 + }, + { + "epoch": 17.72117962466488, + "grad_norm": 4.191237621853361e-06, + "learning_rate": 1.951678848365088e-06, + "loss": 0.0, + "num_input_tokens_seen": 69026064, + "step": 118980 + }, + { + "epoch": 17.721924337205838, + "grad_norm": 4.2564297473290935e-06, + "learning_rate": 1.950420380475515e-06, + "loss": 0.0, + "num_input_tokens_seen": 69029072, + "step": 118985 + }, + { + "epoch": 17.7226690497468, + "grad_norm": 2.513782419555355e-06, + "learning_rate": 1.949162301985166e-06, + "loss": 0.0, + "num_input_tokens_seen": 69032080, + "step": 118990 + }, + { + "epoch": 17.723413762287755, + "grad_norm": 2.555452738306485e-06, + "learning_rate": 1.947904612915294e-06, + "loss": 0.0, + "num_input_tokens_seen": 69034928, + "step": 118995 + }, + { + "epoch": 17.724158474828716, + "grad_norm": 3.09592633129796e-06, + "learning_rate": 1.9466473132871392e-06, + "loss": 0.0, + "num_input_tokens_seen": 69037904, + "step": 119000 + }, + { + "epoch": 17.724903187369677, + "grad_norm": 1.3611738722829614e-05, + "learning_rate": 1.945390403121952e-06, + "loss": 0.0, + "num_input_tokens_seen": 69040848, + "step": 119005 + }, + { + "epoch": 17.725647899910633, + "grad_norm": 8.124022315314505e-06, + "learning_rate": 1.944133882440963e-06, + "loss": 0.0, + "num_input_tokens_seen": 69043696, + "step": 119010 + }, + { + "epoch": 17.726392612451594, + "grad_norm": 9.529750241199508e-05, + "learning_rate": 1.9428777512653957e-06, + "loss": 0.0, + "num_input_tokens_seen": 69046512, + "step": 119015 + }, + { + "epoch": 17.727137324992555, + "grad_norm": 1.6573734683333896e-05, + "learning_rate": 1.941622009616473e-06, + "loss": 0.0, + "num_input_tokens_seen": 69049232, + "step": 119020 + }, + { + "epoch": 17.72788203753351, + "grad_norm": 6.860224675619975e-05, + "learning_rate": 1.9403666575154163e-06, + "loss": 0.0, + "num_input_tokens_seen": 69052016, + "step": 119025 + }, + { + "epoch": 17.728626750074472, + "grad_norm": 2.4021121134865098e-05, + "learning_rate": 1.9391116949834227e-06, + "loss": 0.0, + "num_input_tokens_seen": 69054864, + "step": 119030 + }, + { + "epoch": 17.72937146261543, + "grad_norm": 2.893749751819996e-06, + "learning_rate": 1.937857122041703e-06, + "loss": 0.0, + "num_input_tokens_seen": 69058000, + "step": 119035 + }, + { + "epoch": 17.73011617515639, + "grad_norm": 6.945423592696898e-06, + "learning_rate": 1.936602938711449e-06, + "loss": 0.0, + "num_input_tokens_seen": 69060944, + "step": 119040 + }, + { + "epoch": 17.73086088769735, + "grad_norm": 2.8945629310328513e-05, + "learning_rate": 1.93534914501384e-06, + "loss": 0.0, + "num_input_tokens_seen": 69063984, + "step": 119045 + }, + { + "epoch": 17.731605600238307, + "grad_norm": 4.97411638207268e-05, + "learning_rate": 1.934095740970074e-06, + "loss": 0.0413, + "num_input_tokens_seen": 69066672, + "step": 119050 + }, + { + "epoch": 17.732350312779268, + "grad_norm": 2.7758966098190285e-05, + "learning_rate": 1.932842726601311e-06, + "loss": 0.0, + "num_input_tokens_seen": 69069424, + "step": 119055 + }, + { + "epoch": 17.733095025320225, + "grad_norm": 2.527350488890079e-06, + "learning_rate": 1.931590101928729e-06, + "loss": 0.0009, + "num_input_tokens_seen": 69072144, + "step": 119060 + }, + { + "epoch": 17.733839737861185, + "grad_norm": 7.33814931663801e-06, + "learning_rate": 1.9303378669734834e-06, + "loss": 0.0, + "num_input_tokens_seen": 69075088, + "step": 119065 + }, + { + "epoch": 17.734584450402146, + "grad_norm": 1.2213215995870996e-05, + "learning_rate": 1.9290860217567374e-06, + "loss": 0.0, + "num_input_tokens_seen": 69077904, + "step": 119070 + }, + { + "epoch": 17.735329162943103, + "grad_norm": 3.968659802922048e-05, + "learning_rate": 1.9278345662996356e-06, + "loss": 0.0, + "num_input_tokens_seen": 69080816, + "step": 119075 + }, + { + "epoch": 17.736073875484063, + "grad_norm": 3.547141750459559e-05, + "learning_rate": 1.926583500623316e-06, + "loss": 0.0, + "num_input_tokens_seen": 69083632, + "step": 119080 + }, + { + "epoch": 17.736818588025024, + "grad_norm": 7.044984522508457e-05, + "learning_rate": 1.925332824748921e-06, + "loss": 0.0, + "num_input_tokens_seen": 69086672, + "step": 119085 + }, + { + "epoch": 17.73756330056598, + "grad_norm": 2.144392055924982e-05, + "learning_rate": 1.9240825386975692e-06, + "loss": 0.0, + "num_input_tokens_seen": 69089808, + "step": 119090 + }, + { + "epoch": 17.73830801310694, + "grad_norm": 4.681189238908701e-06, + "learning_rate": 1.9228326424903966e-06, + "loss": 0.0, + "num_input_tokens_seen": 69092624, + "step": 119095 + }, + { + "epoch": 17.7390527256479, + "grad_norm": 2.4766234218986938e-06, + "learning_rate": 1.9215831361485054e-06, + "loss": 0.0, + "num_input_tokens_seen": 69095632, + "step": 119100 + }, + { + "epoch": 17.73979743818886, + "grad_norm": 1.9009125026059337e-05, + "learning_rate": 1.920334019693015e-06, + "loss": 0.0, + "num_input_tokens_seen": 69098256, + "step": 119105 + }, + { + "epoch": 17.74054215072982, + "grad_norm": 1.1016131793439854e-05, + "learning_rate": 1.9190852931450204e-06, + "loss": 0.0, + "num_input_tokens_seen": 69101424, + "step": 119110 + }, + { + "epoch": 17.741286863270776, + "grad_norm": 2.718496943998616e-06, + "learning_rate": 1.917836956525626e-06, + "loss": 0.0, + "num_input_tokens_seen": 69104368, + "step": 119115 + }, + { + "epoch": 17.742031575811737, + "grad_norm": 2.4266191758215427e-06, + "learning_rate": 1.916589009855918e-06, + "loss": 0.0, + "num_input_tokens_seen": 69107440, + "step": 119120 + }, + { + "epoch": 17.742776288352697, + "grad_norm": 5.867395157110877e-05, + "learning_rate": 1.9153414531569713e-06, + "loss": 0.0, + "num_input_tokens_seen": 69110256, + "step": 119125 + }, + { + "epoch": 17.743521000893654, + "grad_norm": 0.0007180788088589907, + "learning_rate": 1.9140942864498747e-06, + "loss": 0.0079, + "num_input_tokens_seen": 69112944, + "step": 119130 + }, + { + "epoch": 17.744265713434615, + "grad_norm": 6.193840090418234e-05, + "learning_rate": 1.912847509755686e-06, + "loss": 0.0, + "num_input_tokens_seen": 69115568, + "step": 119135 + }, + { + "epoch": 17.745010425975572, + "grad_norm": 3.862311586999567e-06, + "learning_rate": 1.911601123095477e-06, + "loss": 0.0, + "num_input_tokens_seen": 69118480, + "step": 119140 + }, + { + "epoch": 17.745755138516532, + "grad_norm": 5.473945930134505e-05, + "learning_rate": 1.910355126490304e-06, + "loss": 0.0, + "num_input_tokens_seen": 69121328, + "step": 119145 + }, + { + "epoch": 17.746499851057493, + "grad_norm": 4.419173365022289e-06, + "learning_rate": 1.909109519961211e-06, + "loss": 0.0, + "num_input_tokens_seen": 69124048, + "step": 119150 + }, + { + "epoch": 17.74724456359845, + "grad_norm": 3.6432536489883205e-06, + "learning_rate": 1.907864303529247e-06, + "loss": 0.0, + "num_input_tokens_seen": 69126832, + "step": 119155 + }, + { + "epoch": 17.74798927613941, + "grad_norm": 0.0001544324477436021, + "learning_rate": 1.9066194772154379e-06, + "loss": 0.0, + "num_input_tokens_seen": 69129840, + "step": 119160 + }, + { + "epoch": 17.74873398868037, + "grad_norm": 6.6815300669986755e-06, + "learning_rate": 1.90537504104083e-06, + "loss": 0.0, + "num_input_tokens_seen": 69132656, + "step": 119165 + }, + { + "epoch": 17.749478701221328, + "grad_norm": 3.4423601391608827e-06, + "learning_rate": 1.9041309950264319e-06, + "loss": 0.0, + "num_input_tokens_seen": 69135888, + "step": 119170 + }, + { + "epoch": 17.75022341376229, + "grad_norm": 3.5088610275124665e-06, + "learning_rate": 1.902887339193271e-06, + "loss": 0.0, + "num_input_tokens_seen": 69138672, + "step": 119175 + }, + { + "epoch": 17.750968126303245, + "grad_norm": 6.489207862614421e-06, + "learning_rate": 1.9016440735623503e-06, + "loss": 0.0, + "num_input_tokens_seen": 69141200, + "step": 119180 + }, + { + "epoch": 17.751712838844206, + "grad_norm": 1.675290332059376e-05, + "learning_rate": 1.9004011981546804e-06, + "loss": 0.0, + "num_input_tokens_seen": 69144304, + "step": 119185 + }, + { + "epoch": 17.752457551385167, + "grad_norm": 0.00013907365791965276, + "learning_rate": 1.8991587129912531e-06, + "loss": 0.0, + "num_input_tokens_seen": 69147376, + "step": 119190 + }, + { + "epoch": 17.753202263926124, + "grad_norm": 1.7499158275313675e-05, + "learning_rate": 1.8979166180930625e-06, + "loss": 0.0, + "num_input_tokens_seen": 69150224, + "step": 119195 + }, + { + "epoch": 17.753946976467084, + "grad_norm": 5.6081526054185815e-06, + "learning_rate": 1.896674913481089e-06, + "loss": 0.0, + "num_input_tokens_seen": 69153008, + "step": 119200 + }, + { + "epoch": 17.75469168900804, + "grad_norm": 3.944761374441441e-06, + "learning_rate": 1.8954335991763107e-06, + "loss": 0.0, + "num_input_tokens_seen": 69156176, + "step": 119205 + }, + { + "epoch": 17.755436401549, + "grad_norm": 0.0001790378737496212, + "learning_rate": 1.8941926751997018e-06, + "loss": 0.0, + "num_input_tokens_seen": 69158864, + "step": 119210 + }, + { + "epoch": 17.756181114089962, + "grad_norm": 4.560951128951274e-05, + "learning_rate": 1.8929521415722267e-06, + "loss": 0.0, + "num_input_tokens_seen": 69161776, + "step": 119215 + }, + { + "epoch": 17.75692582663092, + "grad_norm": 0.00028136096079833806, + "learning_rate": 1.8917119983148378e-06, + "loss": 0.0, + "num_input_tokens_seen": 69164944, + "step": 119220 + }, + { + "epoch": 17.75767053917188, + "grad_norm": 6.254618347156793e-05, + "learning_rate": 1.8904722454484825e-06, + "loss": 0.0, + "num_input_tokens_seen": 69167728, + "step": 119225 + }, + { + "epoch": 17.75841525171284, + "grad_norm": 0.00029618976986967027, + "learning_rate": 1.8892328829941186e-06, + "loss": 0.0, + "num_input_tokens_seen": 69170576, + "step": 119230 + }, + { + "epoch": 17.759159964253797, + "grad_norm": 1.6179050362552516e-05, + "learning_rate": 1.8879939109726713e-06, + "loss": 0.0, + "num_input_tokens_seen": 69173296, + "step": 119235 + }, + { + "epoch": 17.759904676794758, + "grad_norm": 2.424587910354603e-05, + "learning_rate": 1.8867553294050795e-06, + "loss": 0.0, + "num_input_tokens_seen": 69176304, + "step": 119240 + }, + { + "epoch": 17.76064938933572, + "grad_norm": 2.5172284949803725e-06, + "learning_rate": 1.8855171383122677e-06, + "loss": 0.0, + "num_input_tokens_seen": 69179152, + "step": 119245 + }, + { + "epoch": 17.761394101876675, + "grad_norm": 1.562012403155677e-05, + "learning_rate": 1.8842793377151446e-06, + "loss": 0.0, + "num_input_tokens_seen": 69182032, + "step": 119250 + }, + { + "epoch": 17.762138814417636, + "grad_norm": 2.2860303943161853e-06, + "learning_rate": 1.8830419276346352e-06, + "loss": 0.0, + "num_input_tokens_seen": 69184976, + "step": 119255 + }, + { + "epoch": 17.762883526958593, + "grad_norm": 0.0003298632218502462, + "learning_rate": 1.8818049080916305e-06, + "loss": 0.0, + "num_input_tokens_seen": 69187856, + "step": 119260 + }, + { + "epoch": 17.763628239499553, + "grad_norm": 2.5742633624759037e-06, + "learning_rate": 1.8805682791070422e-06, + "loss": 0.0, + "num_input_tokens_seen": 69190768, + "step": 119265 + }, + { + "epoch": 17.764372952040514, + "grad_norm": 4.6947357077442575e-06, + "learning_rate": 1.8793320407017534e-06, + "loss": 0.0, + "num_input_tokens_seen": 69193776, + "step": 119270 + }, + { + "epoch": 17.76511766458147, + "grad_norm": 1.9669757875817595e-06, + "learning_rate": 1.8780961928966528e-06, + "loss": 0.0, + "num_input_tokens_seen": 69196752, + "step": 119275 + }, + { + "epoch": 17.76586237712243, + "grad_norm": 4.354582870291779e-06, + "learning_rate": 1.8768607357126128e-06, + "loss": 0.0, + "num_input_tokens_seen": 69199824, + "step": 119280 + }, + { + "epoch": 17.76660708966339, + "grad_norm": 5.841087840963155e-06, + "learning_rate": 1.875625669170511e-06, + "loss": 0.0, + "num_input_tokens_seen": 69202608, + "step": 119285 + }, + { + "epoch": 17.76735180220435, + "grad_norm": 3.105282303295098e-05, + "learning_rate": 1.874390993291214e-06, + "loss": 0.0, + "num_input_tokens_seen": 69205392, + "step": 119290 + }, + { + "epoch": 17.76809651474531, + "grad_norm": 6.38720939605264e-06, + "learning_rate": 1.8731567080955692e-06, + "loss": 0.0, + "num_input_tokens_seen": 69208240, + "step": 119295 + }, + { + "epoch": 17.768841227286266, + "grad_norm": 1.9972330846940167e-06, + "learning_rate": 1.871922813604443e-06, + "loss": 0.0, + "num_input_tokens_seen": 69211056, + "step": 119300 + }, + { + "epoch": 17.769585939827227, + "grad_norm": 2.236924274257035e-06, + "learning_rate": 1.870689309838672e-06, + "loss": 0.0, + "num_input_tokens_seen": 69213872, + "step": 119305 + }, + { + "epoch": 17.770330652368187, + "grad_norm": 6.234441570995841e-06, + "learning_rate": 1.8694561968191e-06, + "loss": 0.0, + "num_input_tokens_seen": 69216816, + "step": 119310 + }, + { + "epoch": 17.771075364909144, + "grad_norm": 2.6037375846499344e-06, + "learning_rate": 1.8682234745665522e-06, + "loss": 0.0, + "num_input_tokens_seen": 69219824, + "step": 119315 + }, + { + "epoch": 17.771820077450105, + "grad_norm": 3.735169002538896e-06, + "learning_rate": 1.866991143101865e-06, + "loss": 0.0, + "num_input_tokens_seen": 69222992, + "step": 119320 + }, + { + "epoch": 17.772564789991062, + "grad_norm": 0.00014380364154931158, + "learning_rate": 1.8657592024458491e-06, + "loss": 0.0, + "num_input_tokens_seen": 69225840, + "step": 119325 + }, + { + "epoch": 17.773309502532022, + "grad_norm": 4.849652668781346e-06, + "learning_rate": 1.8645276526193162e-06, + "loss": 0.0, + "num_input_tokens_seen": 69228656, + "step": 119330 + }, + { + "epoch": 17.774054215072983, + "grad_norm": 3.996636223746464e-06, + "learning_rate": 1.8632964936430768e-06, + "loss": 0.0, + "num_input_tokens_seen": 69231312, + "step": 119335 + }, + { + "epoch": 17.77479892761394, + "grad_norm": 1.521480044175405e-05, + "learning_rate": 1.8620657255379314e-06, + "loss": 0.0, + "num_input_tokens_seen": 69234576, + "step": 119340 + }, + { + "epoch": 17.7755436401549, + "grad_norm": 3.863547681248747e-05, + "learning_rate": 1.860835348324666e-06, + "loss": 0.0, + "num_input_tokens_seen": 69237520, + "step": 119345 + }, + { + "epoch": 17.77628835269586, + "grad_norm": 5.013810550735798e-06, + "learning_rate": 1.8596053620240667e-06, + "loss": 0.0, + "num_input_tokens_seen": 69240528, + "step": 119350 + }, + { + "epoch": 17.777033065236818, + "grad_norm": 1.947333203133894e-06, + "learning_rate": 1.8583757666569196e-06, + "loss": 0.0, + "num_input_tokens_seen": 69243760, + "step": 119355 + }, + { + "epoch": 17.77777777777778, + "grad_norm": 0.00022512429859489202, + "learning_rate": 1.8571465622439943e-06, + "loss": 0.0, + "num_input_tokens_seen": 69246704, + "step": 119360 + }, + { + "epoch": 17.778522490318736, + "grad_norm": 2.5967367037083022e-05, + "learning_rate": 1.8559177488060547e-06, + "loss": 0.0, + "num_input_tokens_seen": 69249584, + "step": 119365 + }, + { + "epoch": 17.779267202859696, + "grad_norm": 3.0212695492082275e-06, + "learning_rate": 1.854689326363862e-06, + "loss": 0.0, + "num_input_tokens_seen": 69252272, + "step": 119370 + }, + { + "epoch": 17.780011915400657, + "grad_norm": 8.44029345898889e-05, + "learning_rate": 1.8534612949381691e-06, + "loss": 0.0, + "num_input_tokens_seen": 69254928, + "step": 119375 + }, + { + "epoch": 17.780756627941614, + "grad_norm": 0.8093740940093994, + "learning_rate": 1.8522336545497232e-06, + "loss": 0.0001, + "num_input_tokens_seen": 69257840, + "step": 119380 + }, + { + "epoch": 17.781501340482574, + "grad_norm": 2.271666562592145e-05, + "learning_rate": 1.8510064052192604e-06, + "loss": 0.0, + "num_input_tokens_seen": 69260688, + "step": 119385 + }, + { + "epoch": 17.782246053023535, + "grad_norm": 3.163985093124211e-05, + "learning_rate": 1.8497795469675227e-06, + "loss": 0.0, + "num_input_tokens_seen": 69263504, + "step": 119390 + }, + { + "epoch": 17.78299076556449, + "grad_norm": 4.009820258943364e-06, + "learning_rate": 1.848553079815224e-06, + "loss": 0.0, + "num_input_tokens_seen": 69266416, + "step": 119395 + }, + { + "epoch": 17.783735478105452, + "grad_norm": 2.1294429188856157e-06, + "learning_rate": 1.8473270037830975e-06, + "loss": 0.0, + "num_input_tokens_seen": 69269488, + "step": 119400 + }, + { + "epoch": 17.78448019064641, + "grad_norm": 7.19477884558728e-06, + "learning_rate": 1.8461013188918492e-06, + "loss": 0.0, + "num_input_tokens_seen": 69272272, + "step": 119405 + }, + { + "epoch": 17.78522490318737, + "grad_norm": 1.6373745893361047e-05, + "learning_rate": 1.8448760251621844e-06, + "loss": 0.0, + "num_input_tokens_seen": 69275216, + "step": 119410 + }, + { + "epoch": 17.78596961572833, + "grad_norm": 2.52440145231958e-06, + "learning_rate": 1.843651122614809e-06, + "loss": 0.001, + "num_input_tokens_seen": 69278224, + "step": 119415 + }, + { + "epoch": 17.786714328269287, + "grad_norm": 3.8840378692839295e-05, + "learning_rate": 1.8424266112704064e-06, + "loss": 0.0, + "num_input_tokens_seen": 69281392, + "step": 119420 + }, + { + "epoch": 17.787459040810248, + "grad_norm": 9.865306310530286e-06, + "learning_rate": 1.841202491149674e-06, + "loss": 0.0, + "num_input_tokens_seen": 69284912, + "step": 119425 + }, + { + "epoch": 17.788203753351205, + "grad_norm": 4.323663688410306e-06, + "learning_rate": 1.839978762273284e-06, + "loss": 0.0, + "num_input_tokens_seen": 69287984, + "step": 119430 + }, + { + "epoch": 17.788948465892165, + "grad_norm": 3.8937811041250825e-05, + "learning_rate": 1.838755424661917e-06, + "loss": 0.0, + "num_input_tokens_seen": 69290736, + "step": 119435 + }, + { + "epoch": 17.789693178433126, + "grad_norm": 8.796443580649793e-06, + "learning_rate": 1.8375324783362402e-06, + "loss": 0.0, + "num_input_tokens_seen": 69293712, + "step": 119440 + }, + { + "epoch": 17.790437890974083, + "grad_norm": 0.005006465129554272, + "learning_rate": 1.8363099233169034e-06, + "loss": 0.0, + "num_input_tokens_seen": 69296592, + "step": 119445 + }, + { + "epoch": 17.791182603515043, + "grad_norm": 2.5498731702100486e-05, + "learning_rate": 1.8350877596245735e-06, + "loss": 0.0, + "num_input_tokens_seen": 69299664, + "step": 119450 + }, + { + "epoch": 17.791927316056004, + "grad_norm": 4.339097358752042e-06, + "learning_rate": 1.8338659872798896e-06, + "loss": 0.0, + "num_input_tokens_seen": 69302512, + "step": 119455 + }, + { + "epoch": 17.79267202859696, + "grad_norm": 0.00015423729200847447, + "learning_rate": 1.8326446063034964e-06, + "loss": 0.0535, + "num_input_tokens_seen": 69305168, + "step": 119460 + }, + { + "epoch": 17.79341674113792, + "grad_norm": 0.00010705617751227692, + "learning_rate": 1.8314236167160243e-06, + "loss": 0.004, + "num_input_tokens_seen": 69308304, + "step": 119465 + }, + { + "epoch": 17.79416145367888, + "grad_norm": 5.267202141112648e-06, + "learning_rate": 1.8302030185381042e-06, + "loss": 0.0, + "num_input_tokens_seen": 69310928, + "step": 119470 + }, + { + "epoch": 17.79490616621984, + "grad_norm": 0.00019491031707730144, + "learning_rate": 1.8289828117903584e-06, + "loss": 0.0, + "num_input_tokens_seen": 69313584, + "step": 119475 + }, + { + "epoch": 17.7956508787608, + "grad_norm": 4.6055574784986675e-06, + "learning_rate": 1.8277629964933958e-06, + "loss": 0.0, + "num_input_tokens_seen": 69316464, + "step": 119480 + }, + { + "epoch": 17.796395591301756, + "grad_norm": 0.0027808239683508873, + "learning_rate": 1.8265435726678271e-06, + "loss": 0.0, + "num_input_tokens_seen": 69319600, + "step": 119485 + }, + { + "epoch": 17.797140303842717, + "grad_norm": 9.024854989547748e-06, + "learning_rate": 1.8253245403342472e-06, + "loss": 0.0, + "num_input_tokens_seen": 69322512, + "step": 119490 + }, + { + "epoch": 17.797885016383677, + "grad_norm": 1.517643340775976e-05, + "learning_rate": 1.824105899513262e-06, + "loss": 0.0, + "num_input_tokens_seen": 69325392, + "step": 119495 + }, + { + "epoch": 17.798629728924634, + "grad_norm": 2.2525009626406245e-05, + "learning_rate": 1.8228876502254465e-06, + "loss": 0.0, + "num_input_tokens_seen": 69328560, + "step": 119500 + }, + { + "epoch": 17.799374441465595, + "grad_norm": 3.778581231017597e-05, + "learning_rate": 1.8216697924913928e-06, + "loss": 0.0, + "num_input_tokens_seen": 69331472, + "step": 119505 + }, + { + "epoch": 17.800119154006552, + "grad_norm": 2.049884460575413e-05, + "learning_rate": 1.8204523263316647e-06, + "loss": 0.0, + "num_input_tokens_seen": 69334448, + "step": 119510 + }, + { + "epoch": 17.800863866547513, + "grad_norm": 4.617755985236727e-05, + "learning_rate": 1.8192352517668432e-06, + "loss": 0.0, + "num_input_tokens_seen": 69337456, + "step": 119515 + }, + { + "epoch": 17.801608579088473, + "grad_norm": 1.0799717529152986e-05, + "learning_rate": 1.818018568817481e-06, + "loss": 0.0, + "num_input_tokens_seen": 69340272, + "step": 119520 + }, + { + "epoch": 17.80235329162943, + "grad_norm": 1.3317439879756421e-05, + "learning_rate": 1.8168022775041288e-06, + "loss": 0.0, + "num_input_tokens_seen": 69342928, + "step": 119525 + }, + { + "epoch": 17.80309800417039, + "grad_norm": 1.1498913409013767e-05, + "learning_rate": 1.8155863778473447e-06, + "loss": 0.3906, + "num_input_tokens_seen": 69346224, + "step": 119530 + }, + { + "epoch": 17.80384271671135, + "grad_norm": 1.2966733265784569e-05, + "learning_rate": 1.8143708698676597e-06, + "loss": 0.0, + "num_input_tokens_seen": 69348912, + "step": 119535 + }, + { + "epoch": 17.804587429252308, + "grad_norm": 1.6237689123954624e-05, + "learning_rate": 1.8131557535856214e-06, + "loss": 0.0, + "num_input_tokens_seen": 69351632, + "step": 119540 + }, + { + "epoch": 17.80533214179327, + "grad_norm": 4.014686055597849e-06, + "learning_rate": 1.8119410290217465e-06, + "loss": 0.0, + "num_input_tokens_seen": 69354704, + "step": 119545 + }, + { + "epoch": 17.806076854334226, + "grad_norm": 0.00016815619892440736, + "learning_rate": 1.810726696196563e-06, + "loss": 0.0, + "num_input_tokens_seen": 69357456, + "step": 119550 + }, + { + "epoch": 17.806821566875186, + "grad_norm": 1.179637274617562e-05, + "learning_rate": 1.8095127551305797e-06, + "loss": 0.0, + "num_input_tokens_seen": 69360176, + "step": 119555 + }, + { + "epoch": 17.807566279416147, + "grad_norm": 1.833337410062086e-05, + "learning_rate": 1.8082992058443132e-06, + "loss": 0.0, + "num_input_tokens_seen": 69363376, + "step": 119560 + }, + { + "epoch": 17.808310991957104, + "grad_norm": 3.6288699902797816e-06, + "learning_rate": 1.8070860483582585e-06, + "loss": 0.04, + "num_input_tokens_seen": 69366352, + "step": 119565 + }, + { + "epoch": 17.809055704498064, + "grad_norm": 5.848521686857566e-05, + "learning_rate": 1.8058732826929104e-06, + "loss": 0.0, + "num_input_tokens_seen": 69369296, + "step": 119570 + }, + { + "epoch": 17.80980041703902, + "grad_norm": 5.847942884429358e-05, + "learning_rate": 1.8046609088687633e-06, + "loss": 0.0, + "num_input_tokens_seen": 69371984, + "step": 119575 + }, + { + "epoch": 17.81054512957998, + "grad_norm": 2.126301069438341e-06, + "learning_rate": 1.8034489269062899e-06, + "loss": 0.0, + "num_input_tokens_seen": 69374800, + "step": 119580 + }, + { + "epoch": 17.811289842120942, + "grad_norm": 9.445442628930323e-06, + "learning_rate": 1.8022373368259765e-06, + "loss": 0.0, + "num_input_tokens_seen": 69377808, + "step": 119585 + }, + { + "epoch": 17.8120345546619, + "grad_norm": 1.6520220015081577e-05, + "learning_rate": 1.801026138648282e-06, + "loss": 0.0, + "num_input_tokens_seen": 69380688, + "step": 119590 + }, + { + "epoch": 17.81277926720286, + "grad_norm": 4.3237818317720667e-05, + "learning_rate": 1.7998153323936755e-06, + "loss": 0.0, + "num_input_tokens_seen": 69383760, + "step": 119595 + }, + { + "epoch": 17.81352397974382, + "grad_norm": 2.608186605357332e-06, + "learning_rate": 1.798604918082611e-06, + "loss": 0.0, + "num_input_tokens_seen": 69386512, + "step": 119600 + }, + { + "epoch": 17.814268692284777, + "grad_norm": 6.281728929025121e-06, + "learning_rate": 1.7973948957355352e-06, + "loss": 0.0, + "num_input_tokens_seen": 69389136, + "step": 119605 + }, + { + "epoch": 17.815013404825738, + "grad_norm": 3.5336666769580916e-05, + "learning_rate": 1.796185265372885e-06, + "loss": 0.0, + "num_input_tokens_seen": 69391984, + "step": 119610 + }, + { + "epoch": 17.815758117366695, + "grad_norm": 9.752395271789283e-06, + "learning_rate": 1.7949760270151078e-06, + "loss": 0.0, + "num_input_tokens_seen": 69394928, + "step": 119615 + }, + { + "epoch": 17.816502829907655, + "grad_norm": 5.870249424333451e-06, + "learning_rate": 1.7937671806826262e-06, + "loss": 0.0, + "num_input_tokens_seen": 69397776, + "step": 119620 + }, + { + "epoch": 17.817247542448616, + "grad_norm": 4.965268453815952e-05, + "learning_rate": 1.792558726395857e-06, + "loss": 0.0, + "num_input_tokens_seen": 69400496, + "step": 119625 + }, + { + "epoch": 17.817992254989573, + "grad_norm": 2.0544791823340347e-06, + "learning_rate": 1.791350664175223e-06, + "loss": 0.0, + "num_input_tokens_seen": 69403440, + "step": 119630 + }, + { + "epoch": 17.818736967530533, + "grad_norm": 2.579385181888938e-05, + "learning_rate": 1.7901429940411301e-06, + "loss": 0.0, + "num_input_tokens_seen": 69406448, + "step": 119635 + }, + { + "epoch": 17.819481680071494, + "grad_norm": 3.3409489788027713e-06, + "learning_rate": 1.788935716013987e-06, + "loss": 0.0, + "num_input_tokens_seen": 69409232, + "step": 119640 + }, + { + "epoch": 17.82022639261245, + "grad_norm": 5.837352546222974e-06, + "learning_rate": 1.7877288301141826e-06, + "loss": 0.0, + "num_input_tokens_seen": 69412112, + "step": 119645 + }, + { + "epoch": 17.82097110515341, + "grad_norm": 2.6872196485783206e-06, + "learning_rate": 1.7865223363621037e-06, + "loss": 0.0, + "num_input_tokens_seen": 69414960, + "step": 119650 + }, + { + "epoch": 17.82171581769437, + "grad_norm": 2.809327952491003e-06, + "learning_rate": 1.7853162347781394e-06, + "loss": 0.0, + "num_input_tokens_seen": 69417904, + "step": 119655 + }, + { + "epoch": 17.82246053023533, + "grad_norm": 3.8054672586440574e-06, + "learning_rate": 1.7841105253826596e-06, + "loss": 0.0, + "num_input_tokens_seen": 69420752, + "step": 119660 + }, + { + "epoch": 17.82320524277629, + "grad_norm": 2.724218120420119e-06, + "learning_rate": 1.7829052081960423e-06, + "loss": 0.0, + "num_input_tokens_seen": 69423568, + "step": 119665 + }, + { + "epoch": 17.823949955317246, + "grad_norm": 2.814619392665918e-06, + "learning_rate": 1.7817002832386436e-06, + "loss": 0.0, + "num_input_tokens_seen": 69426352, + "step": 119670 + }, + { + "epoch": 17.824694667858207, + "grad_norm": 2.326421054021921e-05, + "learning_rate": 1.7804957505308224e-06, + "loss": 0.0, + "num_input_tokens_seen": 69429424, + "step": 119675 + }, + { + "epoch": 17.825439380399168, + "grad_norm": 8.210479791159742e-06, + "learning_rate": 1.7792916100929258e-06, + "loss": 0.0, + "num_input_tokens_seen": 69432400, + "step": 119680 + }, + { + "epoch": 17.826184092940125, + "grad_norm": 3.4732647691271268e-06, + "learning_rate": 1.7780878619452905e-06, + "loss": 0.0, + "num_input_tokens_seen": 69435024, + "step": 119685 + }, + { + "epoch": 17.826928805481085, + "grad_norm": 1.3762946764472872e-05, + "learning_rate": 1.7768845061082646e-06, + "loss": 0.0, + "num_input_tokens_seen": 69437776, + "step": 119690 + }, + { + "epoch": 17.827673518022042, + "grad_norm": 1.1666136742860544e-05, + "learning_rate": 1.7756815426021673e-06, + "loss": 0.0, + "num_input_tokens_seen": 69440656, + "step": 119695 + }, + { + "epoch": 17.828418230563003, + "grad_norm": 1.3304278581927065e-05, + "learning_rate": 1.7744789714473325e-06, + "loss": 0.275, + "num_input_tokens_seen": 69443376, + "step": 119700 + }, + { + "epoch": 17.829162943103963, + "grad_norm": 3.728780575329438e-05, + "learning_rate": 1.7732767926640636e-06, + "loss": 0.0, + "num_input_tokens_seen": 69446544, + "step": 119705 + }, + { + "epoch": 17.82990765564492, + "grad_norm": 2.1764210032415576e-05, + "learning_rate": 1.7720750062726831e-06, + "loss": 0.0, + "num_input_tokens_seen": 69449360, + "step": 119710 + }, + { + "epoch": 17.83065236818588, + "grad_norm": 1.0447155545989517e-05, + "learning_rate": 1.7708736122934805e-06, + "loss": 0.0, + "num_input_tokens_seen": 69452272, + "step": 119715 + }, + { + "epoch": 17.83139708072684, + "grad_norm": 3.273594120400958e-05, + "learning_rate": 1.7696726107467643e-06, + "loss": 0.0, + "num_input_tokens_seen": 69455088, + "step": 119720 + }, + { + "epoch": 17.832141793267798, + "grad_norm": 1.7534270000396646e-06, + "learning_rate": 1.768472001652821e-06, + "loss": 0.0, + "num_input_tokens_seen": 69458032, + "step": 119725 + }, + { + "epoch": 17.83288650580876, + "grad_norm": 2.788053734548157e-06, + "learning_rate": 1.7672717850319264e-06, + "loss": 0.0, + "num_input_tokens_seen": 69460720, + "step": 119730 + }, + { + "epoch": 17.833631218349716, + "grad_norm": 0.00010954977187793702, + "learning_rate": 1.766071960904367e-06, + "loss": 0.0052, + "num_input_tokens_seen": 69463536, + "step": 119735 + }, + { + "epoch": 17.834375930890676, + "grad_norm": 3.5105840652249753e-05, + "learning_rate": 1.7648725292904067e-06, + "loss": 0.0, + "num_input_tokens_seen": 69466416, + "step": 119740 + }, + { + "epoch": 17.835120643431637, + "grad_norm": 3.6965984691050835e-06, + "learning_rate": 1.7636734902103102e-06, + "loss": 0.0, + "num_input_tokens_seen": 69469520, + "step": 119745 + }, + { + "epoch": 17.835865355972594, + "grad_norm": 2.1837855456396937e-05, + "learning_rate": 1.7624748436843308e-06, + "loss": 0.0, + "num_input_tokens_seen": 69472464, + "step": 119750 + }, + { + "epoch": 17.836610068513554, + "grad_norm": 5.309785137797007e-06, + "learning_rate": 1.7612765897327244e-06, + "loss": 0.0, + "num_input_tokens_seen": 69475216, + "step": 119755 + }, + { + "epoch": 17.837354781054515, + "grad_norm": 5.0799379096133634e-05, + "learning_rate": 1.7600787283757303e-06, + "loss": 0.0, + "num_input_tokens_seen": 69477904, + "step": 119760 + }, + { + "epoch": 17.83809949359547, + "grad_norm": 11.01166820526123, + "learning_rate": 1.7588812596335824e-06, + "loss": 0.019, + "num_input_tokens_seen": 69480688, + "step": 119765 + }, + { + "epoch": 17.838844206136432, + "grad_norm": 9.862698789220303e-05, + "learning_rate": 1.7576841835265202e-06, + "loss": 0.0, + "num_input_tokens_seen": 69483696, + "step": 119770 + }, + { + "epoch": 17.83958891867739, + "grad_norm": 1.1741388334485237e-05, + "learning_rate": 1.756487500074755e-06, + "loss": 0.0, + "num_input_tokens_seen": 69486256, + "step": 119775 + }, + { + "epoch": 17.84033363121835, + "grad_norm": 7.5992925303580705e-06, + "learning_rate": 1.7552912092985153e-06, + "loss": 0.0, + "num_input_tokens_seen": 69489072, + "step": 119780 + }, + { + "epoch": 17.84107834375931, + "grad_norm": 4.9772629608924035e-06, + "learning_rate": 1.7540953112180014e-06, + "loss": 0.0, + "num_input_tokens_seen": 69491920, + "step": 119785 + }, + { + "epoch": 17.841823056300267, + "grad_norm": 1.4480029904007097e-06, + "learning_rate": 1.752899805853425e-06, + "loss": 0.0118, + "num_input_tokens_seen": 69494992, + "step": 119790 + }, + { + "epoch": 17.842567768841228, + "grad_norm": 5.601858902082313e-06, + "learning_rate": 1.7517046932249758e-06, + "loss": 0.0, + "num_input_tokens_seen": 69497968, + "step": 119795 + }, + { + "epoch": 17.843312481382185, + "grad_norm": 1.5955498383846134e-05, + "learning_rate": 1.7505099733528514e-06, + "loss": 0.0, + "num_input_tokens_seen": 69500944, + "step": 119800 + }, + { + "epoch": 17.844057193923145, + "grad_norm": 1.7951611880562268e-05, + "learning_rate": 1.7493156462572296e-06, + "loss": 0.0, + "num_input_tokens_seen": 69503920, + "step": 119805 + }, + { + "epoch": 17.844801906464106, + "grad_norm": 2.061345185211394e-06, + "learning_rate": 1.7481217119582921e-06, + "loss": 0.0, + "num_input_tokens_seen": 69506864, + "step": 119810 + }, + { + "epoch": 17.845546619005063, + "grad_norm": 5.187495844438672e-06, + "learning_rate": 1.746928170476203e-06, + "loss": 0.0, + "num_input_tokens_seen": 69509936, + "step": 119815 + }, + { + "epoch": 17.846291331546023, + "grad_norm": 3.8540033528988715e-06, + "learning_rate": 1.7457350218311269e-06, + "loss": 0.0, + "num_input_tokens_seen": 69512912, + "step": 119820 + }, + { + "epoch": 17.847036044086984, + "grad_norm": 4.96134316563257e-06, + "learning_rate": 1.7445422660432254e-06, + "loss": 0.0, + "num_input_tokens_seen": 69515792, + "step": 119825 + }, + { + "epoch": 17.84778075662794, + "grad_norm": 3.827547061518999e-06, + "learning_rate": 1.7433499031326434e-06, + "loss": 0.0, + "num_input_tokens_seen": 69518704, + "step": 119830 + }, + { + "epoch": 17.8485254691689, + "grad_norm": 2.2346008336171508e-05, + "learning_rate": 1.7421579331195314e-06, + "loss": 0.0, + "num_input_tokens_seen": 69521776, + "step": 119835 + }, + { + "epoch": 17.84927018170986, + "grad_norm": 2.0655309072026284e-06, + "learning_rate": 1.7409663560240209e-06, + "loss": 0.0, + "num_input_tokens_seen": 69524368, + "step": 119840 + }, + { + "epoch": 17.85001489425082, + "grad_norm": 2.1443190689751646e-06, + "learning_rate": 1.7397751718662452e-06, + "loss": 0.0, + "num_input_tokens_seen": 69527472, + "step": 119845 + }, + { + "epoch": 17.85075960679178, + "grad_norm": 5.2766567932849284e-06, + "learning_rate": 1.7385843806663304e-06, + "loss": 0.0, + "num_input_tokens_seen": 69530640, + "step": 119850 + }, + { + "epoch": 17.851504319332737, + "grad_norm": 2.702115580177633e-06, + "learning_rate": 1.7373939824443853e-06, + "loss": 0.0, + "num_input_tokens_seen": 69533616, + "step": 119855 + }, + { + "epoch": 17.852249031873697, + "grad_norm": 2.845580866051023e-06, + "learning_rate": 1.7362039772205296e-06, + "loss": 0.0, + "num_input_tokens_seen": 69537040, + "step": 119860 + }, + { + "epoch": 17.852993744414658, + "grad_norm": 2.5803154130699113e-05, + "learning_rate": 1.7350143650148587e-06, + "loss": 0.0, + "num_input_tokens_seen": 69540080, + "step": 119865 + }, + { + "epoch": 17.853738456955615, + "grad_norm": 8.768441148276906e-06, + "learning_rate": 1.7338251458474786e-06, + "loss": 0.0, + "num_input_tokens_seen": 69542896, + "step": 119870 + }, + { + "epoch": 17.854483169496575, + "grad_norm": 0.0002914035285357386, + "learning_rate": 1.7326363197384788e-06, + "loss": 0.0, + "num_input_tokens_seen": 69546000, + "step": 119875 + }, + { + "epoch": 17.855227882037532, + "grad_norm": 0.0001300037547480315, + "learning_rate": 1.7314478867079376e-06, + "loss": 0.0, + "num_input_tokens_seen": 69548688, + "step": 119880 + }, + { + "epoch": 17.855972594578493, + "grad_norm": 5.291984052746557e-06, + "learning_rate": 1.7302598467759362e-06, + "loss": 0.0, + "num_input_tokens_seen": 69551664, + "step": 119885 + }, + { + "epoch": 17.856717307119453, + "grad_norm": 7.23499761079438e-05, + "learning_rate": 1.729072199962542e-06, + "loss": 0.0, + "num_input_tokens_seen": 69554640, + "step": 119890 + }, + { + "epoch": 17.85746201966041, + "grad_norm": 6.734529961249791e-06, + "learning_rate": 1.7278849462878223e-06, + "loss": 0.0, + "num_input_tokens_seen": 69557520, + "step": 119895 + }, + { + "epoch": 17.85820673220137, + "grad_norm": 2.1953630493953824e-05, + "learning_rate": 1.7266980857718328e-06, + "loss": 0.0, + "num_input_tokens_seen": 69560624, + "step": 119900 + }, + { + "epoch": 17.85895144474233, + "grad_norm": 2.0973977825633483e-06, + "learning_rate": 1.7255116184346277e-06, + "loss": 0.0, + "num_input_tokens_seen": 69563728, + "step": 119905 + }, + { + "epoch": 17.859696157283288, + "grad_norm": 7.1016238507581875e-06, + "learning_rate": 1.724325544296243e-06, + "loss": 0.0, + "num_input_tokens_seen": 69566704, + "step": 119910 + }, + { + "epoch": 17.86044086982425, + "grad_norm": 1.1589812856982462e-05, + "learning_rate": 1.7231398633767272e-06, + "loss": 0.0328, + "num_input_tokens_seen": 69569296, + "step": 119915 + }, + { + "epoch": 17.861185582365206, + "grad_norm": 3.3627470656938385e-06, + "learning_rate": 1.7219545756961025e-06, + "loss": 0.0, + "num_input_tokens_seen": 69572048, + "step": 119920 + }, + { + "epoch": 17.861930294906166, + "grad_norm": 20.479461669921875, + "learning_rate": 1.7207696812744007e-06, + "loss": 0.1066, + "num_input_tokens_seen": 69574640, + "step": 119925 + }, + { + "epoch": 17.862675007447127, + "grad_norm": 6.64297613184317e-06, + "learning_rate": 1.719585180131636e-06, + "loss": 0.0, + "num_input_tokens_seen": 69577488, + "step": 119930 + }, + { + "epoch": 17.863419719988084, + "grad_norm": 0.001988396281376481, + "learning_rate": 1.7184010722878146e-06, + "loss": 0.0, + "num_input_tokens_seen": 69580496, + "step": 119935 + }, + { + "epoch": 17.864164432529044, + "grad_norm": 5.540053280128632e-06, + "learning_rate": 1.7172173577629459e-06, + "loss": 0.0, + "num_input_tokens_seen": 69583568, + "step": 119940 + }, + { + "epoch": 17.86490914507, + "grad_norm": 2.380231080678641e-06, + "learning_rate": 1.7160340365770272e-06, + "loss": 0.0, + "num_input_tokens_seen": 69586480, + "step": 119945 + }, + { + "epoch": 17.865653857610962, + "grad_norm": 0.0005352807929739356, + "learning_rate": 1.7148511087500485e-06, + "loss": 0.0, + "num_input_tokens_seen": 69589168, + "step": 119950 + }, + { + "epoch": 17.866398570151922, + "grad_norm": 2.337984915357083e-05, + "learning_rate": 1.7136685743019909e-06, + "loss": 0.0, + "num_input_tokens_seen": 69592176, + "step": 119955 + }, + { + "epoch": 17.86714328269288, + "grad_norm": 1.2234643691044766e-05, + "learning_rate": 1.7124864332528412e-06, + "loss": 0.0, + "num_input_tokens_seen": 69595152, + "step": 119960 + }, + { + "epoch": 17.86788799523384, + "grad_norm": 6.543635208799969e-06, + "learning_rate": 1.7113046856225611e-06, + "loss": 0.0, + "num_input_tokens_seen": 69598288, + "step": 119965 + }, + { + "epoch": 17.8686327077748, + "grad_norm": 1.6651478290441446e-05, + "learning_rate": 1.7101233314311181e-06, + "loss": 0.0, + "num_input_tokens_seen": 69601168, + "step": 119970 + }, + { + "epoch": 17.869377420315757, + "grad_norm": 2.737630438787164e-06, + "learning_rate": 1.7089423706984742e-06, + "loss": 0.0, + "num_input_tokens_seen": 69603952, + "step": 119975 + }, + { + "epoch": 17.870122132856718, + "grad_norm": 1.586150574439671e-05, + "learning_rate": 1.7077618034445714e-06, + "loss": 0.0, + "num_input_tokens_seen": 69607152, + "step": 119980 + }, + { + "epoch": 17.870866845397675, + "grad_norm": 4.85440523334546e-06, + "learning_rate": 1.706581629689366e-06, + "loss": 0.0, + "num_input_tokens_seen": 69610800, + "step": 119985 + }, + { + "epoch": 17.871611557938635, + "grad_norm": 2.174513838326675e-06, + "learning_rate": 1.705401849452784e-06, + "loss": 0.0, + "num_input_tokens_seen": 69613584, + "step": 119990 + }, + { + "epoch": 17.872356270479596, + "grad_norm": 3.156101456625038e-06, + "learning_rate": 1.7042224627547676e-06, + "loss": 0.0, + "num_input_tokens_seen": 69616368, + "step": 119995 + }, + { + "epoch": 17.873100983020553, + "grad_norm": 2.130860957549885e-05, + "learning_rate": 1.7030434696152342e-06, + "loss": 0.0, + "num_input_tokens_seen": 69619408, + "step": 120000 + }, + { + "epoch": 17.873845695561513, + "grad_norm": 7.224663931992836e-06, + "learning_rate": 1.701864870054104e-06, + "loss": 0.0, + "num_input_tokens_seen": 69622384, + "step": 120005 + }, + { + "epoch": 17.874590408102474, + "grad_norm": 0.00024369395396206528, + "learning_rate": 1.700686664091286e-06, + "loss": 0.0, + "num_input_tokens_seen": 69625520, + "step": 120010 + }, + { + "epoch": 17.87533512064343, + "grad_norm": 0.00137126084882766, + "learning_rate": 1.6995088517466867e-06, + "loss": 0.0, + "num_input_tokens_seen": 69628528, + "step": 120015 + }, + { + "epoch": 17.87607983318439, + "grad_norm": 3.0436297038249904e-06, + "learning_rate": 1.6983314330402039e-06, + "loss": 0.0, + "num_input_tokens_seen": 69631312, + "step": 120020 + }, + { + "epoch": 17.87682454572535, + "grad_norm": 476.8453674316406, + "learning_rate": 1.6971544079917273e-06, + "loss": 0.2125, + "num_input_tokens_seen": 69634480, + "step": 120025 + }, + { + "epoch": 17.87756925826631, + "grad_norm": 1.4516286682919599e-05, + "learning_rate": 1.6959777766211437e-06, + "loss": 0.0, + "num_input_tokens_seen": 69637296, + "step": 120030 + }, + { + "epoch": 17.87831397080727, + "grad_norm": 9.42415226745652e-06, + "learning_rate": 1.6948015389483291e-06, + "loss": 0.0, + "num_input_tokens_seen": 69640080, + "step": 120035 + }, + { + "epoch": 17.879058683348227, + "grad_norm": 2.4507669877493754e-05, + "learning_rate": 1.6936256949931618e-06, + "loss": 0.0, + "num_input_tokens_seen": 69642960, + "step": 120040 + }, + { + "epoch": 17.879803395889187, + "grad_norm": 7.2823836489988025e-06, + "learning_rate": 1.692450244775498e-06, + "loss": 0.0, + "num_input_tokens_seen": 69645680, + "step": 120045 + }, + { + "epoch": 17.880548108430148, + "grad_norm": 2.623773298182641e-06, + "learning_rate": 1.6912751883151945e-06, + "loss": 0.0, + "num_input_tokens_seen": 69648496, + "step": 120050 + }, + { + "epoch": 17.881292820971105, + "grad_norm": 0.003348453901708126, + "learning_rate": 1.6901005256321128e-06, + "loss": 0.0, + "num_input_tokens_seen": 69651376, + "step": 120055 + }, + { + "epoch": 17.882037533512065, + "grad_norm": 2.7769729058491066e-06, + "learning_rate": 1.6889262567460846e-06, + "loss": 0.0, + "num_input_tokens_seen": 69654352, + "step": 120060 + }, + { + "epoch": 17.882782246053022, + "grad_norm": 4.228356192470528e-05, + "learning_rate": 1.6877523816769603e-06, + "loss": 0.0, + "num_input_tokens_seen": 69657616, + "step": 120065 + }, + { + "epoch": 17.883526958593983, + "grad_norm": 2.304435383848613e-06, + "learning_rate": 1.6865789004445686e-06, + "loss": 0.0, + "num_input_tokens_seen": 69660336, + "step": 120070 + }, + { + "epoch": 17.884271671134943, + "grad_norm": 2.6551340397418244e-06, + "learning_rate": 1.6854058130687272e-06, + "loss": 0.0, + "num_input_tokens_seen": 69663248, + "step": 120075 + }, + { + "epoch": 17.8850163836759, + "grad_norm": 3.1754091196489753e-06, + "learning_rate": 1.684233119569259e-06, + "loss": 0.0, + "num_input_tokens_seen": 69665840, + "step": 120080 + }, + { + "epoch": 17.88576109621686, + "grad_norm": 0.0014238611329346895, + "learning_rate": 1.683060819965976e-06, + "loss": 0.0, + "num_input_tokens_seen": 69668720, + "step": 120085 + }, + { + "epoch": 17.886505808757818, + "grad_norm": 7.989518053364009e-05, + "learning_rate": 1.6818889142786842e-06, + "loss": 0.0, + "num_input_tokens_seen": 69671984, + "step": 120090 + }, + { + "epoch": 17.88725052129878, + "grad_norm": 1.4966916751291137e-05, + "learning_rate": 1.6807174025271737e-06, + "loss": 0.0, + "num_input_tokens_seen": 69675152, + "step": 120095 + }, + { + "epoch": 17.88799523383974, + "grad_norm": 2.7572118597163353e-06, + "learning_rate": 1.6795462847312481e-06, + "loss": 0.0, + "num_input_tokens_seen": 69678032, + "step": 120100 + }, + { + "epoch": 17.888739946380696, + "grad_norm": 0.0009793454082682729, + "learning_rate": 1.6783755609106804e-06, + "loss": 0.0, + "num_input_tokens_seen": 69680880, + "step": 120105 + }, + { + "epoch": 17.889484658921656, + "grad_norm": 0.00013607714208774269, + "learning_rate": 1.6772052310852605e-06, + "loss": 0.0, + "num_input_tokens_seen": 69684048, + "step": 120110 + }, + { + "epoch": 17.890229371462617, + "grad_norm": 2.471342213539174e-06, + "learning_rate": 1.6760352952747472e-06, + "loss": 0.0, + "num_input_tokens_seen": 69687600, + "step": 120115 + }, + { + "epoch": 17.890974084003574, + "grad_norm": 4.07643528888002e-06, + "learning_rate": 1.6748657534989194e-06, + "loss": 0.0, + "num_input_tokens_seen": 69690256, + "step": 120120 + }, + { + "epoch": 17.891718796544534, + "grad_norm": 2.1860822016606107e-05, + "learning_rate": 1.673696605777525e-06, + "loss": 0.0, + "num_input_tokens_seen": 69692976, + "step": 120125 + }, + { + "epoch": 17.89246350908549, + "grad_norm": 2.432906057947548e-06, + "learning_rate": 1.6725278521303178e-06, + "loss": 0.0, + "num_input_tokens_seen": 69695792, + "step": 120130 + }, + { + "epoch": 17.893208221626452, + "grad_norm": 2.106109832311631e-06, + "learning_rate": 1.6713594925770459e-06, + "loss": 0.0, + "num_input_tokens_seen": 69698864, + "step": 120135 + }, + { + "epoch": 17.893952934167412, + "grad_norm": 3.5066946111328434e-06, + "learning_rate": 1.6701915271374436e-06, + "loss": 0.0, + "num_input_tokens_seen": 69701456, + "step": 120140 + }, + { + "epoch": 17.89469764670837, + "grad_norm": 2.8498832762124948e-05, + "learning_rate": 1.6690239558312476e-06, + "loss": 0.0, + "num_input_tokens_seen": 69704752, + "step": 120145 + }, + { + "epoch": 17.89544235924933, + "grad_norm": 1.482200877944706e-05, + "learning_rate": 1.667856778678173e-06, + "loss": 0.0, + "num_input_tokens_seen": 69708048, + "step": 120150 + }, + { + "epoch": 17.89618707179029, + "grad_norm": 2.903389213315677e-05, + "learning_rate": 1.6666899956979483e-06, + "loss": 0.0, + "num_input_tokens_seen": 69710800, + "step": 120155 + }, + { + "epoch": 17.896931784331247, + "grad_norm": 3.3539902233314933e-06, + "learning_rate": 1.665523606910277e-06, + "loss": 0.0, + "num_input_tokens_seen": 69714000, + "step": 120160 + }, + { + "epoch": 17.897676496872208, + "grad_norm": 3.824641680694185e-06, + "learning_rate": 1.6643576123348741e-06, + "loss": 0.0, + "num_input_tokens_seen": 69716720, + "step": 120165 + }, + { + "epoch": 17.898421209413165, + "grad_norm": 2.901630296037183e-06, + "learning_rate": 1.6631920119914296e-06, + "loss": 0.0, + "num_input_tokens_seen": 69719440, + "step": 120170 + }, + { + "epoch": 17.899165921954125, + "grad_norm": 6.602391295018606e-06, + "learning_rate": 1.6620268058996357e-06, + "loss": 0.0, + "num_input_tokens_seen": 69722288, + "step": 120175 + }, + { + "epoch": 17.899910634495086, + "grad_norm": 1.78728842001874e-05, + "learning_rate": 1.6608619940791826e-06, + "loss": 0.0, + "num_input_tokens_seen": 69725264, + "step": 120180 + }, + { + "epoch": 17.900655347036043, + "grad_norm": 8.185745173250325e-06, + "learning_rate": 1.6596975765497403e-06, + "loss": 0.0, + "num_input_tokens_seen": 69728304, + "step": 120185 + }, + { + "epoch": 17.901400059577004, + "grad_norm": 4.710742723545991e-05, + "learning_rate": 1.6585335533309903e-06, + "loss": 0.0, + "num_input_tokens_seen": 69730768, + "step": 120190 + }, + { + "epoch": 17.902144772117964, + "grad_norm": 9.822206266107969e-06, + "learning_rate": 1.6573699244425895e-06, + "loss": 0.0, + "num_input_tokens_seen": 69733744, + "step": 120195 + }, + { + "epoch": 17.90288948465892, + "grad_norm": 7.68182690080721e-06, + "learning_rate": 1.6562066899042023e-06, + "loss": 0.0, + "num_input_tokens_seen": 69736432, + "step": 120200 + }, + { + "epoch": 17.90363419719988, + "grad_norm": 3.644100843303022e-06, + "learning_rate": 1.655043849735477e-06, + "loss": 0.0, + "num_input_tokens_seen": 69739152, + "step": 120205 + }, + { + "epoch": 17.90437890974084, + "grad_norm": 2.011788819800131e-05, + "learning_rate": 1.653881403956062e-06, + "loss": 0.0, + "num_input_tokens_seen": 69742160, + "step": 120210 + }, + { + "epoch": 17.9051236222818, + "grad_norm": 0.00018888873455580324, + "learning_rate": 1.6527193525855911e-06, + "loss": 0.0, + "num_input_tokens_seen": 69744880, + "step": 120215 + }, + { + "epoch": 17.90586833482276, + "grad_norm": 3.6130718399363104e-06, + "learning_rate": 1.6515576956436906e-06, + "loss": 0.0, + "num_input_tokens_seen": 69747728, + "step": 120220 + }, + { + "epoch": 17.906613047363717, + "grad_norm": 6.080184448364889e-06, + "learning_rate": 1.6503964331500004e-06, + "loss": 0.0, + "num_input_tokens_seen": 69751024, + "step": 120225 + }, + { + "epoch": 17.907357759904677, + "grad_norm": 4.525366239249706e-05, + "learning_rate": 1.649235565124127e-06, + "loss": 0.0, + "num_input_tokens_seen": 69753968, + "step": 120230 + }, + { + "epoch": 17.908102472445638, + "grad_norm": 1.3563027096097358e-05, + "learning_rate": 1.648075091585688e-06, + "loss": 0.0, + "num_input_tokens_seen": 69756784, + "step": 120235 + }, + { + "epoch": 17.908847184986595, + "grad_norm": 2.666640284587629e-05, + "learning_rate": 1.6469150125542843e-06, + "loss": 0.0, + "num_input_tokens_seen": 69759792, + "step": 120240 + }, + { + "epoch": 17.909591897527555, + "grad_norm": 8.954090299084783e-05, + "learning_rate": 1.6457553280495168e-06, + "loss": 0.0, + "num_input_tokens_seen": 69762448, + "step": 120245 + }, + { + "epoch": 17.910336610068512, + "grad_norm": 1.5725981938885525e-05, + "learning_rate": 1.6445960380909814e-06, + "loss": 0.0, + "num_input_tokens_seen": 69765200, + "step": 120250 + }, + { + "epoch": 17.911081322609473, + "grad_norm": 1.615851761016529e-05, + "learning_rate": 1.6434371426982508e-06, + "loss": 0.0, + "num_input_tokens_seen": 69768272, + "step": 120255 + }, + { + "epoch": 17.911826035150433, + "grad_norm": 0.0003575743467081338, + "learning_rate": 1.642278641890918e-06, + "loss": 0.0, + "num_input_tokens_seen": 69770960, + "step": 120260 + }, + { + "epoch": 17.91257074769139, + "grad_norm": 2.883666411435115e-06, + "learning_rate": 1.641120535688548e-06, + "loss": 0.0, + "num_input_tokens_seen": 69774000, + "step": 120265 + }, + { + "epoch": 17.91331546023235, + "grad_norm": 3.2477423701493535e-06, + "learning_rate": 1.6399628241106996e-06, + "loss": 0.0, + "num_input_tokens_seen": 69776912, + "step": 120270 + }, + { + "epoch": 17.91406017277331, + "grad_norm": 4.2159081203863025e-06, + "learning_rate": 1.638805507176941e-06, + "loss": 0.0, + "num_input_tokens_seen": 69779920, + "step": 120275 + }, + { + "epoch": 17.91480488531427, + "grad_norm": 3.4633398172445595e-05, + "learning_rate": 1.637648584906823e-06, + "loss": 0.0, + "num_input_tokens_seen": 69783024, + "step": 120280 + }, + { + "epoch": 17.91554959785523, + "grad_norm": 0.000761775707360357, + "learning_rate": 1.6364920573198856e-06, + "loss": 0.0, + "num_input_tokens_seen": 69785936, + "step": 120285 + }, + { + "epoch": 17.916294310396186, + "grad_norm": 3.0843908461974934e-05, + "learning_rate": 1.6353359244356658e-06, + "loss": 0.0, + "num_input_tokens_seen": 69788912, + "step": 120290 + }, + { + "epoch": 17.917039022937146, + "grad_norm": 2.5929270123015158e-05, + "learning_rate": 1.634180186273701e-06, + "loss": 0.0, + "num_input_tokens_seen": 69791664, + "step": 120295 + }, + { + "epoch": 17.917783735478107, + "grad_norm": 2.4254034087789478e-06, + "learning_rate": 1.6330248428535117e-06, + "loss": 0.0, + "num_input_tokens_seen": 69794384, + "step": 120300 + }, + { + "epoch": 17.918528448019064, + "grad_norm": 3.204316044502775e-06, + "learning_rate": 1.6318698941946237e-06, + "loss": 0.0, + "num_input_tokens_seen": 69797680, + "step": 120305 + }, + { + "epoch": 17.919273160560024, + "grad_norm": 3.3612018341955263e-06, + "learning_rate": 1.6307153403165382e-06, + "loss": 0.0, + "num_input_tokens_seen": 69800560, + "step": 120310 + }, + { + "epoch": 17.92001787310098, + "grad_norm": 2.9987334073666716e-06, + "learning_rate": 1.6295611812387673e-06, + "loss": 0.0, + "num_input_tokens_seen": 69803504, + "step": 120315 + }, + { + "epoch": 17.920762585641942, + "grad_norm": 3.662732706288807e-05, + "learning_rate": 1.6284074169808067e-06, + "loss": 0.0, + "num_input_tokens_seen": 69806352, + "step": 120320 + }, + { + "epoch": 17.921507298182902, + "grad_norm": 2.0651965314755216e-05, + "learning_rate": 1.6272540475621518e-06, + "loss": 0.0, + "num_input_tokens_seen": 69809424, + "step": 120325 + }, + { + "epoch": 17.92225201072386, + "grad_norm": 2.1383590137702413e-05, + "learning_rate": 1.6261010730022842e-06, + "loss": 0.0, + "num_input_tokens_seen": 69812464, + "step": 120330 + }, + { + "epoch": 17.92299672326482, + "grad_norm": 4.525452823145315e-06, + "learning_rate": 1.6249484933206853e-06, + "loss": 0.0, + "num_input_tokens_seen": 69815216, + "step": 120335 + }, + { + "epoch": 17.92374143580578, + "grad_norm": 2.456626134517137e-06, + "learning_rate": 1.6237963085368236e-06, + "loss": 0.0, + "num_input_tokens_seen": 69818128, + "step": 120340 + }, + { + "epoch": 17.924486148346737, + "grad_norm": 9.759975000633858e-06, + "learning_rate": 1.6226445186701577e-06, + "loss": 0.0, + "num_input_tokens_seen": 69820880, + "step": 120345 + }, + { + "epoch": 17.925230860887698, + "grad_norm": 6.014166046952596e-06, + "learning_rate": 1.6214931237401588e-06, + "loss": 0.0, + "num_input_tokens_seen": 69824016, + "step": 120350 + }, + { + "epoch": 17.925975573428655, + "grad_norm": 8.671803698234726e-06, + "learning_rate": 1.6203421237662692e-06, + "loss": 0.0, + "num_input_tokens_seen": 69826864, + "step": 120355 + }, + { + "epoch": 17.926720285969616, + "grad_norm": 2.9250888928800123e-06, + "learning_rate": 1.619191518767943e-06, + "loss": 0.0003, + "num_input_tokens_seen": 69829552, + "step": 120360 + }, + { + "epoch": 17.927464998510576, + "grad_norm": 2.176429461542284e-06, + "learning_rate": 1.618041308764609e-06, + "loss": 0.0, + "num_input_tokens_seen": 69832336, + "step": 120365 + }, + { + "epoch": 17.928209711051533, + "grad_norm": 3.0525184229190927e-06, + "learning_rate": 1.6168914937757019e-06, + "loss": 0.0, + "num_input_tokens_seen": 69835120, + "step": 120370 + }, + { + "epoch": 17.928954423592494, + "grad_norm": 6.194361958478112e-06, + "learning_rate": 1.6157420738206503e-06, + "loss": 0.0, + "num_input_tokens_seen": 69838032, + "step": 120375 + }, + { + "epoch": 17.929699136133454, + "grad_norm": 1.234927276527742e-05, + "learning_rate": 1.6145930489188666e-06, + "loss": 0.0, + "num_input_tokens_seen": 69841168, + "step": 120380 + }, + { + "epoch": 17.93044384867441, + "grad_norm": 4.216741672280477e-06, + "learning_rate": 1.6134444190897685e-06, + "loss": 0.0, + "num_input_tokens_seen": 69843824, + "step": 120385 + }, + { + "epoch": 17.93118856121537, + "grad_norm": 6.625918558711419e-06, + "learning_rate": 1.6122961843527546e-06, + "loss": 0.0, + "num_input_tokens_seen": 69846864, + "step": 120390 + }, + { + "epoch": 17.93193327375633, + "grad_norm": 2.8655401820287807e-06, + "learning_rate": 1.6111483447272286e-06, + "loss": 0.0, + "num_input_tokens_seen": 69849552, + "step": 120395 + }, + { + "epoch": 17.93267798629729, + "grad_norm": 3.569646423784434e-06, + "learning_rate": 1.6100009002325806e-06, + "loss": 0.0, + "num_input_tokens_seen": 69853104, + "step": 120400 + }, + { + "epoch": 17.93342269883825, + "grad_norm": 2.01045968424296e-05, + "learning_rate": 1.608853850888195e-06, + "loss": 0.0, + "num_input_tokens_seen": 69855824, + "step": 120405 + }, + { + "epoch": 17.934167411379207, + "grad_norm": 4.32336273661349e-05, + "learning_rate": 1.6077071967134511e-06, + "loss": 0.0, + "num_input_tokens_seen": 69858704, + "step": 120410 + }, + { + "epoch": 17.934912123920167, + "grad_norm": 5.9613776206970215, + "learning_rate": 1.6065609377277136e-06, + "loss": 0.038, + "num_input_tokens_seen": 69861552, + "step": 120415 + }, + { + "epoch": 17.935656836461128, + "grad_norm": 3.466230282356264e-06, + "learning_rate": 1.6054150739503587e-06, + "loss": 0.0, + "num_input_tokens_seen": 69864432, + "step": 120420 + }, + { + "epoch": 17.936401549002085, + "grad_norm": 2.8307071261224337e-05, + "learning_rate": 1.604269605400735e-06, + "loss": 0.0, + "num_input_tokens_seen": 69867440, + "step": 120425 + }, + { + "epoch": 17.937146261543045, + "grad_norm": 3.64489233106724e-06, + "learning_rate": 1.6031245320982018e-06, + "loss": 0.0, + "num_input_tokens_seen": 69870224, + "step": 120430 + }, + { + "epoch": 17.937890974084002, + "grad_norm": 4.019670086563565e-05, + "learning_rate": 1.601979854062094e-06, + "loss": 0.0, + "num_input_tokens_seen": 69873104, + "step": 120435 + }, + { + "epoch": 17.938635686624963, + "grad_norm": 3.181031843269011e-06, + "learning_rate": 1.6008355713117623e-06, + "loss": 0.0, + "num_input_tokens_seen": 69876016, + "step": 120440 + }, + { + "epoch": 17.939380399165923, + "grad_norm": 3.964446477766614e-06, + "learning_rate": 1.599691683866525e-06, + "loss": 0.0, + "num_input_tokens_seen": 69878800, + "step": 120445 + }, + { + "epoch": 17.94012511170688, + "grad_norm": 5.091844286653213e-06, + "learning_rate": 1.5985481917457217e-06, + "loss": 0.0, + "num_input_tokens_seen": 69881680, + "step": 120450 + }, + { + "epoch": 17.94086982424784, + "grad_norm": 4.563436505122809e-06, + "learning_rate": 1.5974050949686597e-06, + "loss": 0.0, + "num_input_tokens_seen": 69884400, + "step": 120455 + }, + { + "epoch": 17.941614536788798, + "grad_norm": 0.000587262911722064, + "learning_rate": 1.5962623935546483e-06, + "loss": 0.0, + "num_input_tokens_seen": 69887056, + "step": 120460 + }, + { + "epoch": 17.94235924932976, + "grad_norm": 6.925432444404578e-06, + "learning_rate": 1.5951200875230055e-06, + "loss": 0.0, + "num_input_tokens_seen": 69889712, + "step": 120465 + }, + { + "epoch": 17.94310396187072, + "grad_norm": 0.00020337056776043028, + "learning_rate": 1.5939781768930185e-06, + "loss": 0.0, + "num_input_tokens_seen": 69892496, + "step": 120470 + }, + { + "epoch": 17.943848674411676, + "grad_norm": 1.0765233128040563e-05, + "learning_rate": 1.59283666168398e-06, + "loss": 0.0, + "num_input_tokens_seen": 69895472, + "step": 120475 + }, + { + "epoch": 17.944593386952636, + "grad_norm": 2.8795063826692058e-06, + "learning_rate": 1.5916955419151725e-06, + "loss": 0.0, + "num_input_tokens_seen": 69898640, + "step": 120480 + }, + { + "epoch": 17.945338099493597, + "grad_norm": 2.7447676984593272e-06, + "learning_rate": 1.5905548176058826e-06, + "loss": 0.0, + "num_input_tokens_seen": 69901456, + "step": 120485 + }, + { + "epoch": 17.946082812034554, + "grad_norm": 2.608909881018917e-06, + "learning_rate": 1.5894144887753786e-06, + "loss": 0.0, + "num_input_tokens_seen": 69904272, + "step": 120490 + }, + { + "epoch": 17.946827524575514, + "grad_norm": 7.232033567561302e-06, + "learning_rate": 1.5882745554429174e-06, + "loss": 0.0, + "num_input_tokens_seen": 69907056, + "step": 120495 + }, + { + "epoch": 17.94757223711647, + "grad_norm": 8.648816583445296e-05, + "learning_rate": 1.5871350176277667e-06, + "loss": 0.0, + "num_input_tokens_seen": 69909808, + "step": 120500 + }, + { + "epoch": 17.948316949657432, + "grad_norm": 1.7033546100719832e-05, + "learning_rate": 1.585995875349172e-06, + "loss": 0.0, + "num_input_tokens_seen": 69912752, + "step": 120505 + }, + { + "epoch": 17.949061662198392, + "grad_norm": 5.701267582480796e-05, + "learning_rate": 1.5848571286263825e-06, + "loss": 0.0, + "num_input_tokens_seen": 69915984, + "step": 120510 + }, + { + "epoch": 17.94980637473935, + "grad_norm": 5.16868349222932e-05, + "learning_rate": 1.5837187774786293e-06, + "loss": 0.0, + "num_input_tokens_seen": 69918864, + "step": 120515 + }, + { + "epoch": 17.95055108728031, + "grad_norm": 3.881106295011705e-06, + "learning_rate": 1.5825808219251532e-06, + "loss": 0.0, + "num_input_tokens_seen": 69921776, + "step": 120520 + }, + { + "epoch": 17.95129579982127, + "grad_norm": 2.0065683656866895e-06, + "learning_rate": 1.5814432619851687e-06, + "loss": 0.0, + "num_input_tokens_seen": 69924624, + "step": 120525 + }, + { + "epoch": 17.952040512362228, + "grad_norm": 0.0004122571262996644, + "learning_rate": 1.5803060976779026e-06, + "loss": 0.0, + "num_input_tokens_seen": 69927440, + "step": 120530 + }, + { + "epoch": 17.952785224903188, + "grad_norm": 0.00018392906349617988, + "learning_rate": 1.5791693290225646e-06, + "loss": 0.0, + "num_input_tokens_seen": 69929904, + "step": 120535 + }, + { + "epoch": 17.953529937444145, + "grad_norm": 2.3342424810834927e-06, + "learning_rate": 1.5780329560383527e-06, + "loss": 0.0, + "num_input_tokens_seen": 69932976, + "step": 120540 + }, + { + "epoch": 17.954274649985106, + "grad_norm": 9.805880836211145e-06, + "learning_rate": 1.5768969787444716e-06, + "loss": 0.0, + "num_input_tokens_seen": 69935792, + "step": 120545 + }, + { + "epoch": 17.955019362526066, + "grad_norm": 1.3123930330039002e-05, + "learning_rate": 1.5757613971601054e-06, + "loss": 0.0, + "num_input_tokens_seen": 69938704, + "step": 120550 + }, + { + "epoch": 17.955764075067023, + "grad_norm": 3.062011819565669e-05, + "learning_rate": 1.5746262113044474e-06, + "loss": 0.0, + "num_input_tokens_seen": 69941456, + "step": 120555 + }, + { + "epoch": 17.956508787607984, + "grad_norm": 2.0207105535519077e-06, + "learning_rate": 1.5734914211966683e-06, + "loss": 0.0, + "num_input_tokens_seen": 69944304, + "step": 120560 + }, + { + "epoch": 17.957253500148944, + "grad_norm": 2.2929584702069405e-06, + "learning_rate": 1.5723570268559445e-06, + "loss": 0.0, + "num_input_tokens_seen": 69947024, + "step": 120565 + }, + { + "epoch": 17.9579982126899, + "grad_norm": 3.209110445823171e-06, + "learning_rate": 1.5712230283014385e-06, + "loss": 0.0, + "num_input_tokens_seen": 69949968, + "step": 120570 + }, + { + "epoch": 17.95874292523086, + "grad_norm": 4.36978871221072e-06, + "learning_rate": 1.570089425552304e-06, + "loss": 0.0, + "num_input_tokens_seen": 69952880, + "step": 120575 + }, + { + "epoch": 17.95948763777182, + "grad_norm": 5.3669004955736455e-06, + "learning_rate": 1.5689562186276986e-06, + "loss": 0.0, + "num_input_tokens_seen": 69955728, + "step": 120580 + }, + { + "epoch": 17.96023235031278, + "grad_norm": 0.00018040930444840342, + "learning_rate": 1.567823407546759e-06, + "loss": 0.0, + "num_input_tokens_seen": 69958768, + "step": 120585 + }, + { + "epoch": 17.96097706285374, + "grad_norm": 4.231426373735303e-06, + "learning_rate": 1.5666909923286315e-06, + "loss": 0.0, + "num_input_tokens_seen": 69961520, + "step": 120590 + }, + { + "epoch": 17.961721775394697, + "grad_norm": 7.538748468505219e-05, + "learning_rate": 1.5655589729924453e-06, + "loss": 0.0, + "num_input_tokens_seen": 69964080, + "step": 120595 + }, + { + "epoch": 17.962466487935657, + "grad_norm": 6.779742398066446e-05, + "learning_rate": 1.564427349557318e-06, + "loss": 0.0, + "num_input_tokens_seen": 69967024, + "step": 120600 + }, + { + "epoch": 17.963211200476614, + "grad_norm": 5.148524905962404e-06, + "learning_rate": 1.5632961220423737e-06, + "loss": 0.0, + "num_input_tokens_seen": 69969712, + "step": 120605 + }, + { + "epoch": 17.963955913017575, + "grad_norm": 2.4078688511508517e-05, + "learning_rate": 1.5621652904667244e-06, + "loss": 0.0, + "num_input_tokens_seen": 69972400, + "step": 120610 + }, + { + "epoch": 17.964700625558535, + "grad_norm": 2.7712503651855513e-05, + "learning_rate": 1.5610348548494692e-06, + "loss": 0.0, + "num_input_tokens_seen": 69975344, + "step": 120615 + }, + { + "epoch": 17.965445338099492, + "grad_norm": 5.306068942445563e-06, + "learning_rate": 1.5599048152097034e-06, + "loss": 0.0, + "num_input_tokens_seen": 69977968, + "step": 120620 + }, + { + "epoch": 17.966190050640453, + "grad_norm": 6.115619726188015e-06, + "learning_rate": 1.558775171566529e-06, + "loss": 0.0, + "num_input_tokens_seen": 69980720, + "step": 120625 + }, + { + "epoch": 17.966934763181413, + "grad_norm": 0.0003782753774430603, + "learning_rate": 1.5576459239390162e-06, + "loss": 0.0, + "num_input_tokens_seen": 69983664, + "step": 120630 + }, + { + "epoch": 17.96767947572237, + "grad_norm": 9.855151802184992e-06, + "learning_rate": 1.5565170723462558e-06, + "loss": 0.0, + "num_input_tokens_seen": 69986160, + "step": 120635 + }, + { + "epoch": 17.96842418826333, + "grad_norm": 1.098082611861173e-05, + "learning_rate": 1.5553886168073073e-06, + "loss": 0.0, + "num_input_tokens_seen": 69989104, + "step": 120640 + }, + { + "epoch": 17.969168900804288, + "grad_norm": 3.0659914500574814e-06, + "learning_rate": 1.5542605573412444e-06, + "loss": 0.0, + "num_input_tokens_seen": 69991888, + "step": 120645 + }, + { + "epoch": 17.96991361334525, + "grad_norm": 0.00020920032693538815, + "learning_rate": 1.5531328939671215e-06, + "loss": 0.0, + "num_input_tokens_seen": 69994800, + "step": 120650 + }, + { + "epoch": 17.97065832588621, + "grad_norm": 2.292483259225264e-06, + "learning_rate": 1.552005626703984e-06, + "loss": 0.0, + "num_input_tokens_seen": 69997808, + "step": 120655 + }, + { + "epoch": 17.971403038427166, + "grad_norm": 4.7085222831810825e-06, + "learning_rate": 1.550878755570881e-06, + "loss": 0.0, + "num_input_tokens_seen": 70000560, + "step": 120660 + }, + { + "epoch": 17.972147750968126, + "grad_norm": 5.178759693080792e-06, + "learning_rate": 1.5497522805868525e-06, + "loss": 0.0046, + "num_input_tokens_seen": 70003344, + "step": 120665 + }, + { + "epoch": 17.972892463509087, + "grad_norm": 9.877172487904318e-06, + "learning_rate": 1.5486262017709225e-06, + "loss": 0.0, + "num_input_tokens_seen": 70006288, + "step": 120670 + }, + { + "epoch": 17.973637176050044, + "grad_norm": 1.607909143785946e-05, + "learning_rate": 1.5475005191421172e-06, + "loss": 0.0, + "num_input_tokens_seen": 70009136, + "step": 120675 + }, + { + "epoch": 17.974381888591004, + "grad_norm": 2.0694210434157867e-06, + "learning_rate": 1.5463752327194548e-06, + "loss": 0.0, + "num_input_tokens_seen": 70011984, + "step": 120680 + }, + { + "epoch": 17.97512660113196, + "grad_norm": 2.7592054721026216e-06, + "learning_rate": 1.545250342521945e-06, + "loss": 0.0, + "num_input_tokens_seen": 70014768, + "step": 120685 + }, + { + "epoch": 17.975871313672922, + "grad_norm": 0.0008789378916844726, + "learning_rate": 1.544125848568595e-06, + "loss": 0.0, + "num_input_tokens_seen": 70017584, + "step": 120690 + }, + { + "epoch": 17.976616026213883, + "grad_norm": 2.5899787488015136e-06, + "learning_rate": 1.5430017508783978e-06, + "loss": 0.0, + "num_input_tokens_seen": 70021232, + "step": 120695 + }, + { + "epoch": 17.97736073875484, + "grad_norm": 1.1602290214796085e-05, + "learning_rate": 1.541878049470344e-06, + "loss": 0.0, + "num_input_tokens_seen": 70024144, + "step": 120700 + }, + { + "epoch": 17.9781054512958, + "grad_norm": 2.3806094759493135e-05, + "learning_rate": 1.5407547443634206e-06, + "loss": 0.0, + "num_input_tokens_seen": 70027536, + "step": 120705 + }, + { + "epoch": 17.97885016383676, + "grad_norm": 2.3391281501972117e-05, + "learning_rate": 1.5396318355765993e-06, + "loss": 0.0, + "num_input_tokens_seen": 70030512, + "step": 120710 + }, + { + "epoch": 17.979594876377718, + "grad_norm": 2.470114668540191e-05, + "learning_rate": 1.538509323128859e-06, + "loss": 0.0, + "num_input_tokens_seen": 70033488, + "step": 120715 + }, + { + "epoch": 17.980339588918678, + "grad_norm": 4.8068286560010165e-06, + "learning_rate": 1.5373872070391536e-06, + "loss": 0.0, + "num_input_tokens_seen": 70036624, + "step": 120720 + }, + { + "epoch": 17.981084301459635, + "grad_norm": 2.9727036690019304e-06, + "learning_rate": 1.5362654873264493e-06, + "loss": 0.0, + "num_input_tokens_seen": 70039440, + "step": 120725 + }, + { + "epoch": 17.981829014000596, + "grad_norm": 6.458687494159676e-06, + "learning_rate": 1.5351441640096941e-06, + "loss": 0.0, + "num_input_tokens_seen": 70042192, + "step": 120730 + }, + { + "epoch": 17.982573726541556, + "grad_norm": 1.145392070611706e-05, + "learning_rate": 1.5340232371078262e-06, + "loss": 0.0, + "num_input_tokens_seen": 70045264, + "step": 120735 + }, + { + "epoch": 17.983318439082513, + "grad_norm": 6.98997291692649e-06, + "learning_rate": 1.5329027066397884e-06, + "loss": 0.0, + "num_input_tokens_seen": 70048016, + "step": 120740 + }, + { + "epoch": 17.984063151623474, + "grad_norm": 1.9803499526460655e-05, + "learning_rate": 1.5317825726245045e-06, + "loss": 0.0, + "num_input_tokens_seen": 70050704, + "step": 120745 + }, + { + "epoch": 17.984807864164434, + "grad_norm": 1.99864552996587e-05, + "learning_rate": 1.5306628350809037e-06, + "loss": 0.0, + "num_input_tokens_seen": 70053424, + "step": 120750 + }, + { + "epoch": 17.98555257670539, + "grad_norm": 2.849343673005933e-06, + "learning_rate": 1.529543494027899e-06, + "loss": 0.0141, + "num_input_tokens_seen": 70056368, + "step": 120755 + }, + { + "epoch": 17.98629728924635, + "grad_norm": 2.899778337450698e-06, + "learning_rate": 1.5284245494844057e-06, + "loss": 0.0, + "num_input_tokens_seen": 70059600, + "step": 120760 + }, + { + "epoch": 17.98704200178731, + "grad_norm": 1.7432776076020673e-05, + "learning_rate": 1.5273060014693224e-06, + "loss": 0.0, + "num_input_tokens_seen": 70062544, + "step": 120765 + }, + { + "epoch": 17.98778671432827, + "grad_norm": 2.8298697998252464e-06, + "learning_rate": 1.526187850001548e-06, + "loss": 0.0, + "num_input_tokens_seen": 70065328, + "step": 120770 + }, + { + "epoch": 17.98853142686923, + "grad_norm": 5.786140900454484e-05, + "learning_rate": 1.5250700950999758e-06, + "loss": 0.0, + "num_input_tokens_seen": 70068304, + "step": 120775 + }, + { + "epoch": 17.989276139410187, + "grad_norm": 0.0002834296028595418, + "learning_rate": 1.5239527367834794e-06, + "loss": 0.0, + "num_input_tokens_seen": 70071184, + "step": 120780 + }, + { + "epoch": 17.990020851951147, + "grad_norm": 6.628256960539147e-05, + "learning_rate": 1.5228357750709465e-06, + "loss": 0.0, + "num_input_tokens_seen": 70074288, + "step": 120785 + }, + { + "epoch": 17.990765564492108, + "grad_norm": 2.262421503473888e-06, + "learning_rate": 1.5217192099812372e-06, + "loss": 0.0, + "num_input_tokens_seen": 70077264, + "step": 120790 + }, + { + "epoch": 17.991510277033065, + "grad_norm": 2.501609742466826e-06, + "learning_rate": 1.5206030415332223e-06, + "loss": 0.0, + "num_input_tokens_seen": 70080720, + "step": 120795 + }, + { + "epoch": 17.992254989574025, + "grad_norm": 3.61054003406025e-06, + "learning_rate": 1.519487269745759e-06, + "loss": 0.0, + "num_input_tokens_seen": 70083600, + "step": 120800 + }, + { + "epoch": 17.992999702114982, + "grad_norm": 5.355208486435004e-06, + "learning_rate": 1.5183718946376907e-06, + "loss": 0.0, + "num_input_tokens_seen": 70086416, + "step": 120805 + }, + { + "epoch": 17.993744414655943, + "grad_norm": 3.032275571968057e-06, + "learning_rate": 1.5172569162278661e-06, + "loss": 0.0, + "num_input_tokens_seen": 70089584, + "step": 120810 + }, + { + "epoch": 17.994489127196903, + "grad_norm": 4.43263479610323e-06, + "learning_rate": 1.5161423345351116e-06, + "loss": 0.0, + "num_input_tokens_seen": 70092528, + "step": 120815 + }, + { + "epoch": 17.99523383973786, + "grad_norm": 2.838531827364932e-06, + "learning_rate": 1.515028149578271e-06, + "loss": 0.0, + "num_input_tokens_seen": 70095696, + "step": 120820 + }, + { + "epoch": 17.99597855227882, + "grad_norm": 0.0001854927686508745, + "learning_rate": 1.5139143613761565e-06, + "loss": 0.0, + "num_input_tokens_seen": 70098480, + "step": 120825 + }, + { + "epoch": 17.996723264819778, + "grad_norm": 4.697355325333774e-05, + "learning_rate": 1.5128009699475948e-06, + "loss": 0.0, + "num_input_tokens_seen": 70101232, + "step": 120830 + }, + { + "epoch": 17.99746797736074, + "grad_norm": 9.006815162138082e-06, + "learning_rate": 1.5116879753113822e-06, + "loss": 0.0, + "num_input_tokens_seen": 70103888, + "step": 120835 + }, + { + "epoch": 17.9982126899017, + "grad_norm": 3.5242205740360077e-06, + "learning_rate": 1.510575377486334e-06, + "loss": 0.0, + "num_input_tokens_seen": 70107280, + "step": 120840 + }, + { + "epoch": 17.998957402442656, + "grad_norm": 0.00016338478599209338, + "learning_rate": 1.5094631764912354e-06, + "loss": 0.0, + "num_input_tokens_seen": 70110128, + "step": 120845 + }, + { + "epoch": 17.999702114983616, + "grad_norm": 0.00021730286243837327, + "learning_rate": 1.5083513723448877e-06, + "loss": 0.0, + "num_input_tokens_seen": 70112976, + "step": 120850 + }, + { + "epoch": 18.0, + "eval_loss": 3.665987491607666, + "eval_runtime": 51.2489, + "eval_samples_per_second": 58.226, + "eval_steps_per_second": 14.556, + "num_input_tokens_seen": 70113640, + "step": 120852 + }, + { + "epoch": 18.000446827524577, + "grad_norm": 1.987319592444692e-06, + "learning_rate": 1.507239965066068e-06, + "loss": 0.0, + "num_input_tokens_seen": 70115656, + "step": 120855 + }, + { + "epoch": 18.001191540065534, + "grad_norm": 2.5834928237600252e-05, + "learning_rate": 1.506128954673547e-06, + "loss": 0.0, + "num_input_tokens_seen": 70118568, + "step": 120860 + }, + { + "epoch": 18.001936252606495, + "grad_norm": 3.1563567972625606e-06, + "learning_rate": 1.505018341186104e-06, + "loss": 0.0, + "num_input_tokens_seen": 70121384, + "step": 120865 + }, + { + "epoch": 18.00268096514745, + "grad_norm": 1.3760130968876183e-05, + "learning_rate": 1.5039081246224967e-06, + "loss": 0.0, + "num_input_tokens_seen": 70124360, + "step": 120870 + }, + { + "epoch": 18.003425677688412, + "grad_norm": 1.1348468433425296e-05, + "learning_rate": 1.502798305001482e-06, + "loss": 0.0, + "num_input_tokens_seen": 70127048, + "step": 120875 + }, + { + "epoch": 18.004170390229373, + "grad_norm": 1.5702558812336065e-05, + "learning_rate": 1.5016888823418035e-06, + "loss": 0.0, + "num_input_tokens_seen": 70130120, + "step": 120880 + }, + { + "epoch": 18.00491510277033, + "grad_norm": 6.326497441477841e-06, + "learning_rate": 1.5005798566622125e-06, + "loss": 0.0, + "num_input_tokens_seen": 70132776, + "step": 120885 + }, + { + "epoch": 18.00565981531129, + "grad_norm": 3.1568929443892557e-06, + "learning_rate": 1.4994712279814415e-06, + "loss": 0.0, + "num_input_tokens_seen": 70135688, + "step": 120890 + }, + { + "epoch": 18.00640452785225, + "grad_norm": 3.698183945743949e-06, + "learning_rate": 1.4983629963182143e-06, + "loss": 0.0, + "num_input_tokens_seen": 70138376, + "step": 120895 + }, + { + "epoch": 18.007149240393208, + "grad_norm": 1.2819487892556936e-05, + "learning_rate": 1.4972551616912633e-06, + "loss": 0.0, + "num_input_tokens_seen": 70141032, + "step": 120900 + }, + { + "epoch": 18.007893952934168, + "grad_norm": 2.9328634809644427e-06, + "learning_rate": 1.4961477241192956e-06, + "loss": 0.0, + "num_input_tokens_seen": 70143976, + "step": 120905 + }, + { + "epoch": 18.008638665475125, + "grad_norm": 3.114187711616978e-05, + "learning_rate": 1.4950406836210267e-06, + "loss": 0.0, + "num_input_tokens_seen": 70147080, + "step": 120910 + }, + { + "epoch": 18.009383378016086, + "grad_norm": 0.0007130679441615939, + "learning_rate": 1.493934040215153e-06, + "loss": 0.0, + "num_input_tokens_seen": 70149992, + "step": 120915 + }, + { + "epoch": 18.010128090557046, + "grad_norm": 1.0423291314509697e-05, + "learning_rate": 1.492827793920376e-06, + "loss": 0.0, + "num_input_tokens_seen": 70152680, + "step": 120920 + }, + { + "epoch": 18.010872803098003, + "grad_norm": 4.411246209201636e-06, + "learning_rate": 1.4917219447553838e-06, + "loss": 0.0, + "num_input_tokens_seen": 70155464, + "step": 120925 + }, + { + "epoch": 18.011617515638964, + "grad_norm": 6.463565568992635e-06, + "learning_rate": 1.49061649273885e-06, + "loss": 0.0, + "num_input_tokens_seen": 70158504, + "step": 120930 + }, + { + "epoch": 18.012362228179924, + "grad_norm": 0.0015270919539034367, + "learning_rate": 1.4895114378894625e-06, + "loss": 0.0, + "num_input_tokens_seen": 70161384, + "step": 120935 + }, + { + "epoch": 18.01310694072088, + "grad_norm": 7.354635272349697e-06, + "learning_rate": 1.4884067802258845e-06, + "loss": 0.0, + "num_input_tokens_seen": 70164232, + "step": 120940 + }, + { + "epoch": 18.013851653261842, + "grad_norm": 2.128581172655686e-06, + "learning_rate": 1.4873025197667756e-06, + "loss": 0.0, + "num_input_tokens_seen": 70167016, + "step": 120945 + }, + { + "epoch": 18.0145963658028, + "grad_norm": 3.1393931294587674e-06, + "learning_rate": 1.4861986565307935e-06, + "loss": 0.0, + "num_input_tokens_seen": 70169960, + "step": 120950 + }, + { + "epoch": 18.01534107834376, + "grad_norm": 3.197972318957909e-06, + "learning_rate": 1.4850951905365868e-06, + "loss": 0.0, + "num_input_tokens_seen": 70172872, + "step": 120955 + }, + { + "epoch": 18.01608579088472, + "grad_norm": 1.103354043152649e-05, + "learning_rate": 1.4839921218027935e-06, + "loss": 0.0, + "num_input_tokens_seen": 70175624, + "step": 120960 + }, + { + "epoch": 18.016830503425677, + "grad_norm": 1.6114958270918578e-05, + "learning_rate": 1.4828894503480601e-06, + "loss": 0.0, + "num_input_tokens_seen": 70178312, + "step": 120965 + }, + { + "epoch": 18.017575215966637, + "grad_norm": 1.6381158275180496e-05, + "learning_rate": 1.4817871761910047e-06, + "loss": 0.004, + "num_input_tokens_seen": 70181256, + "step": 120970 + }, + { + "epoch": 18.018319928507594, + "grad_norm": 2.0773384221683955e-06, + "learning_rate": 1.4806852993502485e-06, + "loss": 0.0, + "num_input_tokens_seen": 70184200, + "step": 120975 + }, + { + "epoch": 18.019064641048555, + "grad_norm": 6.708051387249725e-06, + "learning_rate": 1.4795838198444157e-06, + "loss": 0.0, + "num_input_tokens_seen": 70187112, + "step": 120980 + }, + { + "epoch": 18.019809353589515, + "grad_norm": 2.0015456811961485e-06, + "learning_rate": 1.4784827376921052e-06, + "loss": 0.0, + "num_input_tokens_seen": 70189800, + "step": 120985 + }, + { + "epoch": 18.020554066130472, + "grad_norm": 5.6711364777584095e-06, + "learning_rate": 1.4773820529119243e-06, + "loss": 0.0, + "num_input_tokens_seen": 70192584, + "step": 120990 + }, + { + "epoch": 18.021298778671433, + "grad_norm": 2.860079575839336e-06, + "learning_rate": 1.4762817655224664e-06, + "loss": 0.0, + "num_input_tokens_seen": 70195624, + "step": 120995 + }, + { + "epoch": 18.022043491212393, + "grad_norm": 1.2421719475241844e-05, + "learning_rate": 1.4751818755423225e-06, + "loss": 0.0, + "num_input_tokens_seen": 70198504, + "step": 121000 + }, + { + "epoch": 18.02278820375335, + "grad_norm": 4.1392715502297506e-05, + "learning_rate": 1.4740823829900664e-06, + "loss": 0.0, + "num_input_tokens_seen": 70201800, + "step": 121005 + }, + { + "epoch": 18.02353291629431, + "grad_norm": 3.92256879422348e-06, + "learning_rate": 1.4729832878842803e-06, + "loss": 0.0, + "num_input_tokens_seen": 70204616, + "step": 121010 + }, + { + "epoch": 18.024277628835268, + "grad_norm": 2.686266725504538e-06, + "learning_rate": 1.4718845902435303e-06, + "loss": 0.0, + "num_input_tokens_seen": 70207336, + "step": 121015 + }, + { + "epoch": 18.02502234137623, + "grad_norm": 6.172681423777249e-06, + "learning_rate": 1.4707862900863734e-06, + "loss": 0.0, + "num_input_tokens_seen": 70210440, + "step": 121020 + }, + { + "epoch": 18.02576705391719, + "grad_norm": 1.880824493127875e-05, + "learning_rate": 1.4696883874313727e-06, + "loss": 0.0, + "num_input_tokens_seen": 70213320, + "step": 121025 + }, + { + "epoch": 18.026511766458146, + "grad_norm": 0.00010118292993865907, + "learning_rate": 1.4685908822970663e-06, + "loss": 0.0, + "num_input_tokens_seen": 70216264, + "step": 121030 + }, + { + "epoch": 18.027256478999107, + "grad_norm": 3.7800443806190742e-06, + "learning_rate": 1.4674937747020057e-06, + "loss": 0.0, + "num_input_tokens_seen": 70219208, + "step": 121035 + }, + { + "epoch": 18.028001191540067, + "grad_norm": 0.00012157629680586979, + "learning_rate": 1.4663970646647152e-06, + "loss": 0.0, + "num_input_tokens_seen": 70221992, + "step": 121040 + }, + { + "epoch": 18.028745904081024, + "grad_norm": 1.7093933593059774e-06, + "learning_rate": 1.4653007522037325e-06, + "loss": 0.0, + "num_input_tokens_seen": 70224616, + "step": 121045 + }, + { + "epoch": 18.029490616621985, + "grad_norm": 4.0011750570556615e-06, + "learning_rate": 1.4642048373375712e-06, + "loss": 0.0, + "num_input_tokens_seen": 70227432, + "step": 121050 + }, + { + "epoch": 18.03023532916294, + "grad_norm": 1.478237027185969e-05, + "learning_rate": 1.4631093200847517e-06, + "loss": 0.0, + "num_input_tokens_seen": 70230280, + "step": 121055 + }, + { + "epoch": 18.030980041703902, + "grad_norm": 5.883943231310695e-06, + "learning_rate": 1.4620142004637766e-06, + "loss": 0.0, + "num_input_tokens_seen": 70233224, + "step": 121060 + }, + { + "epoch": 18.031724754244863, + "grad_norm": 0.0009378754184581339, + "learning_rate": 1.4609194784931502e-06, + "loss": 0.0, + "num_input_tokens_seen": 70235944, + "step": 121065 + }, + { + "epoch": 18.03246946678582, + "grad_norm": 2.1976464267936535e-05, + "learning_rate": 1.459825154191366e-06, + "loss": 0.0, + "num_input_tokens_seen": 70238664, + "step": 121070 + }, + { + "epoch": 18.03321417932678, + "grad_norm": 2.752626642177347e-06, + "learning_rate": 1.4587312275769065e-06, + "loss": 0.0, + "num_input_tokens_seen": 70241352, + "step": 121075 + }, + { + "epoch": 18.03395889186774, + "grad_norm": 0.004910280462354422, + "learning_rate": 1.4576376986682598e-06, + "loss": 0.0, + "num_input_tokens_seen": 70244456, + "step": 121080 + }, + { + "epoch": 18.034703604408698, + "grad_norm": 0.00035669165663421154, + "learning_rate": 1.4565445674838919e-06, + "loss": 0.0, + "num_input_tokens_seen": 70247688, + "step": 121085 + }, + { + "epoch": 18.035448316949658, + "grad_norm": 2.643215520947706e-05, + "learning_rate": 1.455451834042279e-06, + "loss": 0.0, + "num_input_tokens_seen": 70250984, + "step": 121090 + }, + { + "epoch": 18.036193029490615, + "grad_norm": 6.854349976492813e-06, + "learning_rate": 1.4543594983618792e-06, + "loss": 0.0, + "num_input_tokens_seen": 70253640, + "step": 121095 + }, + { + "epoch": 18.036937742031576, + "grad_norm": 3.7319844068406383e-06, + "learning_rate": 1.4532675604611412e-06, + "loss": 0.0, + "num_input_tokens_seen": 70256456, + "step": 121100 + }, + { + "epoch": 18.037682454572536, + "grad_norm": 6.959113670745865e-05, + "learning_rate": 1.45217602035852e-06, + "loss": 0.0, + "num_input_tokens_seen": 70259048, + "step": 121105 + }, + { + "epoch": 18.038427167113493, + "grad_norm": 1.68277692864649e-05, + "learning_rate": 1.451084878072448e-06, + "loss": 0.0, + "num_input_tokens_seen": 70262216, + "step": 121110 + }, + { + "epoch": 18.039171879654454, + "grad_norm": 4.726859970105579e-06, + "learning_rate": 1.4499941336213657e-06, + "loss": 0.0, + "num_input_tokens_seen": 70265256, + "step": 121115 + }, + { + "epoch": 18.039916592195414, + "grad_norm": 3.073919060625485e-06, + "learning_rate": 1.448903787023692e-06, + "loss": 0.0, + "num_input_tokens_seen": 70268328, + "step": 121120 + }, + { + "epoch": 18.04066130473637, + "grad_norm": 6.841196591267362e-05, + "learning_rate": 1.4478138382978595e-06, + "loss": 0.0, + "num_input_tokens_seen": 70271240, + "step": 121125 + }, + { + "epoch": 18.041406017277332, + "grad_norm": 1.0884858056670055e-05, + "learning_rate": 1.4467242874622726e-06, + "loss": 0.0, + "num_input_tokens_seen": 70273960, + "step": 121130 + }, + { + "epoch": 18.04215072981829, + "grad_norm": 0.00010098671918967739, + "learning_rate": 1.445635134535342e-06, + "loss": 0.0, + "num_input_tokens_seen": 70276744, + "step": 121135 + }, + { + "epoch": 18.04289544235925, + "grad_norm": 2.9156224172766088e-06, + "learning_rate": 1.4445463795354664e-06, + "loss": 0.0, + "num_input_tokens_seen": 70279848, + "step": 121140 + }, + { + "epoch": 18.04364015490021, + "grad_norm": 2.5178876967402175e-05, + "learning_rate": 1.443458022481034e-06, + "loss": 0.0, + "num_input_tokens_seen": 70282984, + "step": 121145 + }, + { + "epoch": 18.044384867441167, + "grad_norm": 3.864234713546466e-06, + "learning_rate": 1.4423700633904414e-06, + "loss": 0.0, + "num_input_tokens_seen": 70285960, + "step": 121150 + }, + { + "epoch": 18.045129579982127, + "grad_norm": 3.6166331938147778e-06, + "learning_rate": 1.4412825022820598e-06, + "loss": 0.0, + "num_input_tokens_seen": 70288744, + "step": 121155 + }, + { + "epoch": 18.045874292523084, + "grad_norm": 7.179327076300979e-05, + "learning_rate": 1.440195339174269e-06, + "loss": 0.0, + "num_input_tokens_seen": 70291528, + "step": 121160 + }, + { + "epoch": 18.046619005064045, + "grad_norm": 2.2100521164247766e-05, + "learning_rate": 1.4391085740854293e-06, + "loss": 0.0, + "num_input_tokens_seen": 70294600, + "step": 121165 + }, + { + "epoch": 18.047363717605005, + "grad_norm": 0.00012665298709180206, + "learning_rate": 1.4380222070339095e-06, + "loss": 0.0, + "num_input_tokens_seen": 70297512, + "step": 121170 + }, + { + "epoch": 18.048108430145962, + "grad_norm": 8.17674481368158e-06, + "learning_rate": 1.4369362380380558e-06, + "loss": 0.0, + "num_input_tokens_seen": 70300488, + "step": 121175 + }, + { + "epoch": 18.048853142686923, + "grad_norm": 4.9380807467969134e-05, + "learning_rate": 1.435850667116212e-06, + "loss": 0.0, + "num_input_tokens_seen": 70303944, + "step": 121180 + }, + { + "epoch": 18.049597855227884, + "grad_norm": 2.852018496923847e-06, + "learning_rate": 1.4347654942867245e-06, + "loss": 0.0, + "num_input_tokens_seen": 70307272, + "step": 121185 + }, + { + "epoch": 18.05034256776884, + "grad_norm": 4.422503843670711e-05, + "learning_rate": 1.4336807195679203e-06, + "loss": 0.0, + "num_input_tokens_seen": 70310280, + "step": 121190 + }, + { + "epoch": 18.0510872803098, + "grad_norm": 2.503218411220587e-06, + "learning_rate": 1.4325963429781347e-06, + "loss": 0.0, + "num_input_tokens_seen": 70312936, + "step": 121195 + }, + { + "epoch": 18.051831992850758, + "grad_norm": 0.00023865517869126052, + "learning_rate": 1.4315123645356782e-06, + "loss": 0.0, + "num_input_tokens_seen": 70315816, + "step": 121200 + }, + { + "epoch": 18.05257670539172, + "grad_norm": 3.7970419271005085e-06, + "learning_rate": 1.4304287842588665e-06, + "loss": 0.0, + "num_input_tokens_seen": 70318888, + "step": 121205 + }, + { + "epoch": 18.05332141793268, + "grad_norm": 3.7854083529964555e-06, + "learning_rate": 1.4293456021660018e-06, + "loss": 0.0, + "num_input_tokens_seen": 70321928, + "step": 121210 + }, + { + "epoch": 18.054066130473636, + "grad_norm": 0.00018743897089734674, + "learning_rate": 1.4282628182753915e-06, + "loss": 0.0, + "num_input_tokens_seen": 70324744, + "step": 121215 + }, + { + "epoch": 18.054810843014597, + "grad_norm": 2.778046336970874e-06, + "learning_rate": 1.4271804326053239e-06, + "loss": 0.0, + "num_input_tokens_seen": 70327720, + "step": 121220 + }, + { + "epoch": 18.055555555555557, + "grad_norm": 9.402537398273125e-05, + "learning_rate": 1.4260984451740815e-06, + "loss": 0.0, + "num_input_tokens_seen": 70330568, + "step": 121225 + }, + { + "epoch": 18.056300268096514, + "grad_norm": 3.7617242014675867e-06, + "learning_rate": 1.4250168559999499e-06, + "loss": 0.0, + "num_input_tokens_seen": 70333512, + "step": 121230 + }, + { + "epoch": 18.057044980637475, + "grad_norm": 2.6172508569288766e-06, + "learning_rate": 1.423935665101192e-06, + "loss": 0.0, + "num_input_tokens_seen": 70336712, + "step": 121235 + }, + { + "epoch": 18.05778969317843, + "grad_norm": 7.67530727898702e-05, + "learning_rate": 1.422854872496085e-06, + "loss": 0.0, + "num_input_tokens_seen": 70339464, + "step": 121240 + }, + { + "epoch": 18.058534405719392, + "grad_norm": 5.010389031667728e-06, + "learning_rate": 1.421774478202878e-06, + "loss": 0.0, + "num_input_tokens_seen": 70342184, + "step": 121245 + }, + { + "epoch": 18.059279118260353, + "grad_norm": 7.2201132752525155e-06, + "learning_rate": 1.4206944822398316e-06, + "loss": 0.0, + "num_input_tokens_seen": 70345032, + "step": 121250 + }, + { + "epoch": 18.06002383080131, + "grad_norm": 1.7920696336659603e-05, + "learning_rate": 1.4196148846251867e-06, + "loss": 0.0, + "num_input_tokens_seen": 70348040, + "step": 121255 + }, + { + "epoch": 18.06076854334227, + "grad_norm": 3.824781288130907e-06, + "learning_rate": 1.4185356853771787e-06, + "loss": 0.0, + "num_input_tokens_seen": 70350984, + "step": 121260 + }, + { + "epoch": 18.06151325588323, + "grad_norm": 1.0137835488421842e-05, + "learning_rate": 1.4174568845140457e-06, + "loss": 0.0, + "num_input_tokens_seen": 70353832, + "step": 121265 + }, + { + "epoch": 18.062257968424188, + "grad_norm": 2.968737135233823e-06, + "learning_rate": 1.416378482054012e-06, + "loss": 0.0, + "num_input_tokens_seen": 70356904, + "step": 121270 + }, + { + "epoch": 18.06300268096515, + "grad_norm": 2.1983080387144582e-06, + "learning_rate": 1.4153004780152939e-06, + "loss": 0.0, + "num_input_tokens_seen": 70359592, + "step": 121275 + }, + { + "epoch": 18.063747393506105, + "grad_norm": 3.752037446247414e-05, + "learning_rate": 1.414222872416099e-06, + "loss": 0.0, + "num_input_tokens_seen": 70362632, + "step": 121280 + }, + { + "epoch": 18.064492106047066, + "grad_norm": 6.906115231686272e-06, + "learning_rate": 1.4131456652746428e-06, + "loss": 0.0, + "num_input_tokens_seen": 70365544, + "step": 121285 + }, + { + "epoch": 18.065236818588026, + "grad_norm": 2.0169150047877338e-06, + "learning_rate": 1.4120688566091112e-06, + "loss": 0.0, + "num_input_tokens_seen": 70368232, + "step": 121290 + }, + { + "epoch": 18.065981531128983, + "grad_norm": 1.6940450223046355e-05, + "learning_rate": 1.410992446437709e-06, + "loss": 0.0, + "num_input_tokens_seen": 70371080, + "step": 121295 + }, + { + "epoch": 18.066726243669944, + "grad_norm": 0.00013883024803362787, + "learning_rate": 1.4099164347786132e-06, + "loss": 0.0, + "num_input_tokens_seen": 70373672, + "step": 121300 + }, + { + "epoch": 18.0674709562109, + "grad_norm": 3.04688705909939e-06, + "learning_rate": 1.408840821650001e-06, + "loss": 0.0, + "num_input_tokens_seen": 70376264, + "step": 121305 + }, + { + "epoch": 18.06821566875186, + "grad_norm": 2.0371899154270068e-06, + "learning_rate": 1.4077656070700496e-06, + "loss": 0.0, + "num_input_tokens_seen": 70379464, + "step": 121310 + }, + { + "epoch": 18.068960381292822, + "grad_norm": 6.350578132696683e-06, + "learning_rate": 1.4066907910569166e-06, + "loss": 0.0, + "num_input_tokens_seen": 70382600, + "step": 121315 + }, + { + "epoch": 18.06970509383378, + "grad_norm": 4.488251306611346e-06, + "learning_rate": 1.4056163736287682e-06, + "loss": 0.0, + "num_input_tokens_seen": 70385480, + "step": 121320 + }, + { + "epoch": 18.07044980637474, + "grad_norm": 7.70161295804428e-06, + "learning_rate": 1.4045423548037478e-06, + "loss": 0.0, + "num_input_tokens_seen": 70388616, + "step": 121325 + }, + { + "epoch": 18.0711945189157, + "grad_norm": 5.419933131634025e-06, + "learning_rate": 1.4034687346000052e-06, + "loss": 0.0, + "num_input_tokens_seen": 70391496, + "step": 121330 + }, + { + "epoch": 18.071939231456657, + "grad_norm": 4.2987328924937174e-05, + "learning_rate": 1.4023955130356758e-06, + "loss": 0.0, + "num_input_tokens_seen": 70394440, + "step": 121335 + }, + { + "epoch": 18.072683943997617, + "grad_norm": 2.0087674784008414e-05, + "learning_rate": 1.4013226901288868e-06, + "loss": 0.0, + "num_input_tokens_seen": 70397192, + "step": 121340 + }, + { + "epoch": 18.073428656538574, + "grad_norm": 6.897030743857613e-06, + "learning_rate": 1.400250265897768e-06, + "loss": 0.0, + "num_input_tokens_seen": 70400264, + "step": 121345 + }, + { + "epoch": 18.074173369079535, + "grad_norm": 1.0981181731040124e-05, + "learning_rate": 1.3991782403604353e-06, + "loss": 0.0, + "num_input_tokens_seen": 70403144, + "step": 121350 + }, + { + "epoch": 18.074918081620496, + "grad_norm": 5.859719294676324e-06, + "learning_rate": 1.3981066135349995e-06, + "loss": 0.0, + "num_input_tokens_seen": 70406088, + "step": 121355 + }, + { + "epoch": 18.075662794161452, + "grad_norm": 7.0969576881907415e-06, + "learning_rate": 1.397035385439563e-06, + "loss": 0.0, + "num_input_tokens_seen": 70409096, + "step": 121360 + }, + { + "epoch": 18.076407506702413, + "grad_norm": 4.210406586935278e-06, + "learning_rate": 1.3959645560922275e-06, + "loss": 0.0, + "num_input_tokens_seen": 70411848, + "step": 121365 + }, + { + "epoch": 18.077152219243374, + "grad_norm": 1.911051367642358e-05, + "learning_rate": 1.3948941255110787e-06, + "loss": 0.0, + "num_input_tokens_seen": 70414952, + "step": 121370 + }, + { + "epoch": 18.07789693178433, + "grad_norm": 3.9439386455342174e-05, + "learning_rate": 1.393824093714205e-06, + "loss": 0.0001, + "num_input_tokens_seen": 70418216, + "step": 121375 + }, + { + "epoch": 18.07864164432529, + "grad_norm": 4.112901024200255e-06, + "learning_rate": 1.3927544607196807e-06, + "loss": 0.0, + "num_input_tokens_seen": 70420904, + "step": 121380 + }, + { + "epoch": 18.079386356866248, + "grad_norm": 2.7489415970194386e-06, + "learning_rate": 1.3916852265455722e-06, + "loss": 0.0, + "num_input_tokens_seen": 70423752, + "step": 121385 + }, + { + "epoch": 18.08013106940721, + "grad_norm": 3.641335297288606e-06, + "learning_rate": 1.3906163912099506e-06, + "loss": 0.0, + "num_input_tokens_seen": 70426600, + "step": 121390 + }, + { + "epoch": 18.08087578194817, + "grad_norm": 2.735973339440534e-06, + "learning_rate": 1.3895479547308716e-06, + "loss": 0.0, + "num_input_tokens_seen": 70429416, + "step": 121395 + }, + { + "epoch": 18.081620494489126, + "grad_norm": 2.2705829906044528e-05, + "learning_rate": 1.3884799171263841e-06, + "loss": 0.0, + "num_input_tokens_seen": 70432168, + "step": 121400 + }, + { + "epoch": 18.082365207030087, + "grad_norm": 2.41994644056831e-06, + "learning_rate": 1.3874122784145239e-06, + "loss": 0.0, + "num_input_tokens_seen": 70434920, + "step": 121405 + }, + { + "epoch": 18.083109919571047, + "grad_norm": 3.617165248215315e-06, + "learning_rate": 1.3863450386133402e-06, + "loss": 0.0, + "num_input_tokens_seen": 70438088, + "step": 121410 + }, + { + "epoch": 18.083854632112004, + "grad_norm": 2.5512028969387757e-06, + "learning_rate": 1.385278197740858e-06, + "loss": 0.0, + "num_input_tokens_seen": 70440840, + "step": 121415 + }, + { + "epoch": 18.084599344652965, + "grad_norm": 0.0013233049539849162, + "learning_rate": 1.384211755815093e-06, + "loss": 0.0, + "num_input_tokens_seen": 70443720, + "step": 121420 + }, + { + "epoch": 18.08534405719392, + "grad_norm": 7.670125341974199e-05, + "learning_rate": 1.3831457128540753e-06, + "loss": 0.0, + "num_input_tokens_seen": 70446728, + "step": 121425 + }, + { + "epoch": 18.086088769734882, + "grad_norm": 6.780382136639673e-06, + "learning_rate": 1.3820800688758018e-06, + "loss": 0.0, + "num_input_tokens_seen": 70449736, + "step": 121430 + }, + { + "epoch": 18.086833482275843, + "grad_norm": 1.0345248483645264e-05, + "learning_rate": 1.3810148238982857e-06, + "loss": 0.0, + "num_input_tokens_seen": 70452616, + "step": 121435 + }, + { + "epoch": 18.0875781948168, + "grad_norm": 6.717879841744434e-06, + "learning_rate": 1.3799499779395152e-06, + "loss": 0.0, + "num_input_tokens_seen": 70455592, + "step": 121440 + }, + { + "epoch": 18.08832290735776, + "grad_norm": 3.603897312132176e-06, + "learning_rate": 1.3788855310174876e-06, + "loss": 0.0, + "num_input_tokens_seen": 70458696, + "step": 121445 + }, + { + "epoch": 18.08906761989872, + "grad_norm": 1.1024335435649846e-05, + "learning_rate": 1.3778214831501767e-06, + "loss": 0.0, + "num_input_tokens_seen": 70461608, + "step": 121450 + }, + { + "epoch": 18.089812332439678, + "grad_norm": 0.00029429595451802015, + "learning_rate": 1.3767578343555688e-06, + "loss": 0.0, + "num_input_tokens_seen": 70464584, + "step": 121455 + }, + { + "epoch": 18.09055704498064, + "grad_norm": 0.0005782725056633353, + "learning_rate": 1.3756945846516267e-06, + "loss": 0.0, + "num_input_tokens_seen": 70467592, + "step": 121460 + }, + { + "epoch": 18.091301757521595, + "grad_norm": 3.94004655390745e-06, + "learning_rate": 1.3746317340563142e-06, + "loss": 0.0, + "num_input_tokens_seen": 70470568, + "step": 121465 + }, + { + "epoch": 18.092046470062556, + "grad_norm": 1.3441413102555089e-05, + "learning_rate": 1.3735692825875861e-06, + "loss": 0.0, + "num_input_tokens_seen": 70473608, + "step": 121470 + }, + { + "epoch": 18.092791182603516, + "grad_norm": 5.202567081141751e-06, + "learning_rate": 1.3725072302633895e-06, + "loss": 0.0, + "num_input_tokens_seen": 70476552, + "step": 121475 + }, + { + "epoch": 18.093535895144473, + "grad_norm": 1.6055306332418695e-05, + "learning_rate": 1.371445577101671e-06, + "loss": 0.0, + "num_input_tokens_seen": 70479432, + "step": 121480 + }, + { + "epoch": 18.094280607685434, + "grad_norm": 4.933927812089678e-06, + "learning_rate": 1.3703843231203634e-06, + "loss": 0.0, + "num_input_tokens_seen": 70482344, + "step": 121485 + }, + { + "epoch": 18.09502532022639, + "grad_norm": 3.103603603449301e-06, + "learning_rate": 1.3693234683373997e-06, + "loss": 0.0, + "num_input_tokens_seen": 70485032, + "step": 121490 + }, + { + "epoch": 18.09577003276735, + "grad_norm": 7.838664714654442e-06, + "learning_rate": 1.3682630127706958e-06, + "loss": 0.0, + "num_input_tokens_seen": 70487944, + "step": 121495 + }, + { + "epoch": 18.096514745308312, + "grad_norm": 2.8939906769664958e-05, + "learning_rate": 1.3672029564381711e-06, + "loss": 0.0, + "num_input_tokens_seen": 70491080, + "step": 121500 + }, + { + "epoch": 18.09725945784927, + "grad_norm": 8.166670340870041e-06, + "learning_rate": 1.3661432993577333e-06, + "loss": 0.0, + "num_input_tokens_seen": 70493896, + "step": 121505 + }, + { + "epoch": 18.09800417039023, + "grad_norm": 2.3850786874390906e-06, + "learning_rate": 1.365084041547282e-06, + "loss": 0.0, + "num_input_tokens_seen": 70496520, + "step": 121510 + }, + { + "epoch": 18.09874888293119, + "grad_norm": 5.3265055612428114e-05, + "learning_rate": 1.364025183024717e-06, + "loss": 0.0, + "num_input_tokens_seen": 70499592, + "step": 121515 + }, + { + "epoch": 18.099493595472147, + "grad_norm": 2.435750047879992e-06, + "learning_rate": 1.3629667238079208e-06, + "loss": 0.0, + "num_input_tokens_seen": 70502440, + "step": 121520 + }, + { + "epoch": 18.100238308013108, + "grad_norm": 0.0001614114735275507, + "learning_rate": 1.361908663914782e-06, + "loss": 0.0, + "num_input_tokens_seen": 70505384, + "step": 121525 + }, + { + "epoch": 18.100983020554064, + "grad_norm": 2.4690529244253412e-05, + "learning_rate": 1.3608510033631728e-06, + "loss": 0.0, + "num_input_tokens_seen": 70508104, + "step": 121530 + }, + { + "epoch": 18.101727733095025, + "grad_norm": 0.00015795814397279173, + "learning_rate": 1.3597937421709588e-06, + "loss": 0.0, + "num_input_tokens_seen": 70510728, + "step": 121535 + }, + { + "epoch": 18.102472445635986, + "grad_norm": 1.834060071814747e-06, + "learning_rate": 1.358736880356004e-06, + "loss": 0.0, + "num_input_tokens_seen": 70513864, + "step": 121540 + }, + { + "epoch": 18.103217158176943, + "grad_norm": 1.5238650121318642e-05, + "learning_rate": 1.3576804179361552e-06, + "loss": 0.0, + "num_input_tokens_seen": 70516520, + "step": 121545 + }, + { + "epoch": 18.103961870717903, + "grad_norm": 8.219933988584671e-06, + "learning_rate": 1.356624354929273e-06, + "loss": 0.0, + "num_input_tokens_seen": 70519336, + "step": 121550 + }, + { + "epoch": 18.104706583258864, + "grad_norm": 6.78475635140785e-06, + "learning_rate": 1.3555686913531874e-06, + "loss": 0.0, + "num_input_tokens_seen": 70521960, + "step": 121555 + }, + { + "epoch": 18.10545129579982, + "grad_norm": 0.00015001713472884148, + "learning_rate": 1.3545134272257426e-06, + "loss": 0.0, + "num_input_tokens_seen": 70525192, + "step": 121560 + }, + { + "epoch": 18.10619600834078, + "grad_norm": 4.846898718824377e-06, + "learning_rate": 1.3534585625647578e-06, + "loss": 0.0, + "num_input_tokens_seen": 70527976, + "step": 121565 + }, + { + "epoch": 18.106940720881738, + "grad_norm": 0.00012713806063402444, + "learning_rate": 1.3524040973880575e-06, + "loss": 0.0, + "num_input_tokens_seen": 70530888, + "step": 121570 + }, + { + "epoch": 18.1076854334227, + "grad_norm": 3.3760277347028023e-06, + "learning_rate": 1.3513500317134582e-06, + "loss": 0.0, + "num_input_tokens_seen": 70533768, + "step": 121575 + }, + { + "epoch": 18.10843014596366, + "grad_norm": 1.0417592420708388e-05, + "learning_rate": 1.3502963655587619e-06, + "loss": 0.0, + "num_input_tokens_seen": 70536776, + "step": 121580 + }, + { + "epoch": 18.109174858504616, + "grad_norm": 2.969200295410701e-06, + "learning_rate": 1.3492430989417742e-06, + "loss": 0.0, + "num_input_tokens_seen": 70539752, + "step": 121585 + }, + { + "epoch": 18.109919571045577, + "grad_norm": 2.402461632300401e-06, + "learning_rate": 1.3481902318802835e-06, + "loss": 0.0, + "num_input_tokens_seen": 70542824, + "step": 121590 + }, + { + "epoch": 18.110664283586537, + "grad_norm": 9.91037450148724e-05, + "learning_rate": 1.3471377643920841e-06, + "loss": 0.0, + "num_input_tokens_seen": 70545704, + "step": 121595 + }, + { + "epoch": 18.111408996127494, + "grad_norm": 4.707239440904232e-06, + "learning_rate": 1.346085696494953e-06, + "loss": 0.0, + "num_input_tokens_seen": 70548456, + "step": 121600 + }, + { + "epoch": 18.112153708668455, + "grad_norm": 2.4920569558162242e-05, + "learning_rate": 1.3450340282066625e-06, + "loss": 0.0, + "num_input_tokens_seen": 70551560, + "step": 121605 + }, + { + "epoch": 18.11289842120941, + "grad_norm": 4.870630618825089e-06, + "learning_rate": 1.3439827595449761e-06, + "loss": 0.0, + "num_input_tokens_seen": 70554152, + "step": 121610 + }, + { + "epoch": 18.113643133750372, + "grad_norm": 9.081250937015284e-06, + "learning_rate": 1.3429318905276627e-06, + "loss": 0.0, + "num_input_tokens_seen": 70556968, + "step": 121615 + }, + { + "epoch": 18.114387846291333, + "grad_norm": 1.1382668162696064e-05, + "learning_rate": 1.3418814211724695e-06, + "loss": 0.0, + "num_input_tokens_seen": 70559976, + "step": 121620 + }, + { + "epoch": 18.11513255883229, + "grad_norm": 8.90351839188952e-06, + "learning_rate": 1.3408313514971432e-06, + "loss": 0.0, + "num_input_tokens_seen": 70562888, + "step": 121625 + }, + { + "epoch": 18.11587727137325, + "grad_norm": 0.00023768132086843252, + "learning_rate": 1.3397816815194281e-06, + "loss": 0.0, + "num_input_tokens_seen": 70565800, + "step": 121630 + }, + { + "epoch": 18.11662198391421, + "grad_norm": 3.975018898927374e-06, + "learning_rate": 1.3387324112570488e-06, + "loss": 0.0, + "num_input_tokens_seen": 70568776, + "step": 121635 + }, + { + "epoch": 18.117366696455168, + "grad_norm": 0.0001650898193474859, + "learning_rate": 1.3376835407277437e-06, + "loss": 0.0, + "num_input_tokens_seen": 70571528, + "step": 121640 + }, + { + "epoch": 18.11811140899613, + "grad_norm": 4.129708031541668e-06, + "learning_rate": 1.3366350699492214e-06, + "loss": 0.0, + "num_input_tokens_seen": 70574280, + "step": 121645 + }, + { + "epoch": 18.118856121537085, + "grad_norm": 1.1848645044665318e-05, + "learning_rate": 1.3355869989392005e-06, + "loss": 0.0, + "num_input_tokens_seen": 70577032, + "step": 121650 + }, + { + "epoch": 18.119600834078046, + "grad_norm": 3.002271796503919e-06, + "learning_rate": 1.3345393277153896e-06, + "loss": 0.0, + "num_input_tokens_seen": 70579752, + "step": 121655 + }, + { + "epoch": 18.120345546619006, + "grad_norm": 2.229853635071777e-06, + "learning_rate": 1.3334920562954822e-06, + "loss": 0.0, + "num_input_tokens_seen": 70582504, + "step": 121660 + }, + { + "epoch": 18.121090259159963, + "grad_norm": 1.117067313316511e-05, + "learning_rate": 1.332445184697173e-06, + "loss": 0.0, + "num_input_tokens_seen": 70585352, + "step": 121665 + }, + { + "epoch": 18.121834971700924, + "grad_norm": 0.0002613618562463671, + "learning_rate": 1.3313987129381422e-06, + "loss": 0.0, + "num_input_tokens_seen": 70588232, + "step": 121670 + }, + { + "epoch": 18.12257968424188, + "grad_norm": 2.5278554858232383e-06, + "learning_rate": 1.3303526410360811e-06, + "loss": 0.0, + "num_input_tokens_seen": 70591208, + "step": 121675 + }, + { + "epoch": 18.12332439678284, + "grad_norm": 3.3181970593432197e-06, + "learning_rate": 1.3293069690086506e-06, + "loss": 0.0, + "num_input_tokens_seen": 70594184, + "step": 121680 + }, + { + "epoch": 18.124069109323802, + "grad_norm": 3.6395776987774298e-06, + "learning_rate": 1.3282616968735256e-06, + "loss": 0.0, + "num_input_tokens_seen": 70597224, + "step": 121685 + }, + { + "epoch": 18.12481382186476, + "grad_norm": 0.0014796738978475332, + "learning_rate": 1.3272168246483557e-06, + "loss": 0.0, + "num_input_tokens_seen": 70599688, + "step": 121690 + }, + { + "epoch": 18.12555853440572, + "grad_norm": 8.90557021193672e-06, + "learning_rate": 1.3261723523508018e-06, + "loss": 0.0, + "num_input_tokens_seen": 70602984, + "step": 121695 + }, + { + "epoch": 18.12630324694668, + "grad_norm": 0.00012107720976928249, + "learning_rate": 1.3251282799985026e-06, + "loss": 0.0, + "num_input_tokens_seen": 70605768, + "step": 121700 + }, + { + "epoch": 18.127047959487637, + "grad_norm": 4.0103950595948845e-06, + "learning_rate": 1.3240846076090996e-06, + "loss": 0.0, + "num_input_tokens_seen": 70608968, + "step": 121705 + }, + { + "epoch": 18.127792672028598, + "grad_norm": 3.8744465200579725e-06, + "learning_rate": 1.3230413352002259e-06, + "loss": 0.0, + "num_input_tokens_seen": 70611848, + "step": 121710 + }, + { + "epoch": 18.128537384569555, + "grad_norm": 9.693021638668142e-06, + "learning_rate": 1.3219984627895005e-06, + "loss": 0.0, + "num_input_tokens_seen": 70614440, + "step": 121715 + }, + { + "epoch": 18.129282097110515, + "grad_norm": 3.0897942906449316e-06, + "learning_rate": 1.3209559903945483e-06, + "loss": 0.0, + "num_input_tokens_seen": 70617160, + "step": 121720 + }, + { + "epoch": 18.130026809651476, + "grad_norm": 8.710879228601698e-06, + "learning_rate": 1.3199139180329806e-06, + "loss": 0.0, + "num_input_tokens_seen": 70619816, + "step": 121725 + }, + { + "epoch": 18.130771522192433, + "grad_norm": 7.621565146109788e-06, + "learning_rate": 1.3188722457223995e-06, + "loss": 0.0, + "num_input_tokens_seen": 70622504, + "step": 121730 + }, + { + "epoch": 18.131516234733393, + "grad_norm": 1.1679307135636918e-05, + "learning_rate": 1.3178309734803968e-06, + "loss": 0.0, + "num_input_tokens_seen": 70625320, + "step": 121735 + }, + { + "epoch": 18.132260947274354, + "grad_norm": 2.2087322122388287e-06, + "learning_rate": 1.3167901013245775e-06, + "loss": 0.0, + "num_input_tokens_seen": 70628488, + "step": 121740 + }, + { + "epoch": 18.13300565981531, + "grad_norm": 4.264236849849112e-06, + "learning_rate": 1.3157496292725169e-06, + "loss": 0.0, + "num_input_tokens_seen": 70631464, + "step": 121745 + }, + { + "epoch": 18.13375037235627, + "grad_norm": 0.003022548509761691, + "learning_rate": 1.3147095573417923e-06, + "loss": 0.0, + "num_input_tokens_seen": 70634184, + "step": 121750 + }, + { + "epoch": 18.134495084897228, + "grad_norm": 0.0001607620361028239, + "learning_rate": 1.3136698855499812e-06, + "loss": 0.0, + "num_input_tokens_seen": 70637128, + "step": 121755 + }, + { + "epoch": 18.13523979743819, + "grad_norm": 8.954331860877573e-05, + "learning_rate": 1.3126306139146394e-06, + "loss": 0.0, + "num_input_tokens_seen": 70640008, + "step": 121760 + }, + { + "epoch": 18.13598450997915, + "grad_norm": 8.375293873541523e-06, + "learning_rate": 1.3115917424533303e-06, + "loss": 0.0, + "num_input_tokens_seen": 70642920, + "step": 121765 + }, + { + "epoch": 18.136729222520106, + "grad_norm": 4.632486707123462e-06, + "learning_rate": 1.310553271183601e-06, + "loss": 0.0, + "num_input_tokens_seen": 70646216, + "step": 121770 + }, + { + "epoch": 18.137473935061067, + "grad_norm": 3.0660175980301574e-06, + "learning_rate": 1.3095152001230015e-06, + "loss": 0.0, + "num_input_tokens_seen": 70649032, + "step": 121775 + }, + { + "epoch": 18.138218647602027, + "grad_norm": 5.500927727553062e-06, + "learning_rate": 1.308477529289065e-06, + "loss": 0.0, + "num_input_tokens_seen": 70651688, + "step": 121780 + }, + { + "epoch": 18.138963360142984, + "grad_norm": 2.7491830678627593e-06, + "learning_rate": 1.3074402586993162e-06, + "loss": 0.0, + "num_input_tokens_seen": 70654536, + "step": 121785 + }, + { + "epoch": 18.139708072683945, + "grad_norm": 6.675968506897334e-06, + "learning_rate": 1.3064033883712883e-06, + "loss": 0.0, + "num_input_tokens_seen": 70657384, + "step": 121790 + }, + { + "epoch": 18.140452785224902, + "grad_norm": 1.2465729923860636e-05, + "learning_rate": 1.3053669183224954e-06, + "loss": 0.0, + "num_input_tokens_seen": 70660264, + "step": 121795 + }, + { + "epoch": 18.141197497765862, + "grad_norm": 6.0788815972045995e-06, + "learning_rate": 1.3043308485704454e-06, + "loss": 0.0, + "num_input_tokens_seen": 70663016, + "step": 121800 + }, + { + "epoch": 18.141942210306823, + "grad_norm": 2.038597631326411e-05, + "learning_rate": 1.303295179132638e-06, + "loss": 0.0, + "num_input_tokens_seen": 70665736, + "step": 121805 + }, + { + "epoch": 18.14268692284778, + "grad_norm": 4.618938692146912e-05, + "learning_rate": 1.3022599100265791e-06, + "loss": 0.0, + "num_input_tokens_seen": 70668840, + "step": 121810 + }, + { + "epoch": 18.14343163538874, + "grad_norm": 3.1069168926478596e-06, + "learning_rate": 1.3012250412697519e-06, + "loss": 0.0, + "num_input_tokens_seen": 70671848, + "step": 121815 + }, + { + "epoch": 18.1441763479297, + "grad_norm": 1.2435104508767836e-05, + "learning_rate": 1.300190572879642e-06, + "loss": 0.0, + "num_input_tokens_seen": 70674344, + "step": 121820 + }, + { + "epoch": 18.144921060470658, + "grad_norm": 8.592673111706972e-06, + "learning_rate": 1.2991565048737248e-06, + "loss": 0.0, + "num_input_tokens_seen": 70677608, + "step": 121825 + }, + { + "epoch": 18.14566577301162, + "grad_norm": 1.188332953461213e-05, + "learning_rate": 1.2981228372694692e-06, + "loss": 0.0, + "num_input_tokens_seen": 70680360, + "step": 121830 + }, + { + "epoch": 18.146410485552575, + "grad_norm": 5.74384785068105e-06, + "learning_rate": 1.2970895700843394e-06, + "loss": 0.0, + "num_input_tokens_seen": 70683272, + "step": 121835 + }, + { + "epoch": 18.147155198093536, + "grad_norm": 0.00026787430397234857, + "learning_rate": 1.296056703335788e-06, + "loss": 0.0003, + "num_input_tokens_seen": 70686216, + "step": 121840 + }, + { + "epoch": 18.147899910634496, + "grad_norm": 1.4725511618962628e-06, + "learning_rate": 1.2950242370412702e-06, + "loss": 0.0, + "num_input_tokens_seen": 70689032, + "step": 121845 + }, + { + "epoch": 18.148644623175453, + "grad_norm": 4.152292603976093e-06, + "learning_rate": 1.2939921712182223e-06, + "loss": 0.0, + "num_input_tokens_seen": 70692072, + "step": 121850 + }, + { + "epoch": 18.149389335716414, + "grad_norm": 4.955411895934958e-06, + "learning_rate": 1.2929605058840889e-06, + "loss": 0.0, + "num_input_tokens_seen": 70694888, + "step": 121855 + }, + { + "epoch": 18.15013404825737, + "grad_norm": 4.416444426169619e-06, + "learning_rate": 1.2919292410562889e-06, + "loss": 0.0, + "num_input_tokens_seen": 70697896, + "step": 121860 + }, + { + "epoch": 18.15087876079833, + "grad_norm": 0.0012468552449718118, + "learning_rate": 1.2908983767522504e-06, + "loss": 0.0, + "num_input_tokens_seen": 70700680, + "step": 121865 + }, + { + "epoch": 18.151623473339292, + "grad_norm": 4.894749872619286e-05, + "learning_rate": 1.289867912989387e-06, + "loss": 0.0, + "num_input_tokens_seen": 70703304, + "step": 121870 + }, + { + "epoch": 18.15236818588025, + "grad_norm": 0.00013640020915772766, + "learning_rate": 1.2888378497851045e-06, + "loss": 0.0, + "num_input_tokens_seen": 70706472, + "step": 121875 + }, + { + "epoch": 18.15311289842121, + "grad_norm": 0.000679110933560878, + "learning_rate": 1.2878081871568082e-06, + "loss": 0.0, + "num_input_tokens_seen": 70709640, + "step": 121880 + }, + { + "epoch": 18.15385761096217, + "grad_norm": 4.996491497877287e-06, + "learning_rate": 1.2867789251218925e-06, + "loss": 0.0, + "num_input_tokens_seen": 70712360, + "step": 121885 + }, + { + "epoch": 18.154602323503127, + "grad_norm": 5.957974281045608e-06, + "learning_rate": 1.2857500636977466e-06, + "loss": 0.0, + "num_input_tokens_seen": 70715080, + "step": 121890 + }, + { + "epoch": 18.155347036044088, + "grad_norm": 3.548910399331362e-06, + "learning_rate": 1.2847216029017506e-06, + "loss": 0.0, + "num_input_tokens_seen": 70717992, + "step": 121895 + }, + { + "epoch": 18.156091748585045, + "grad_norm": 4.051663836435182e-06, + "learning_rate": 1.2836935427512826e-06, + "loss": 0.0, + "num_input_tokens_seen": 70720744, + "step": 121900 + }, + { + "epoch": 18.156836461126005, + "grad_norm": 3.036643647646997e-06, + "learning_rate": 1.2826658832637062e-06, + "loss": 0.0, + "num_input_tokens_seen": 70723560, + "step": 121905 + }, + { + "epoch": 18.157581173666966, + "grad_norm": 2.2006124709150754e-06, + "learning_rate": 1.2816386244563827e-06, + "loss": 0.0, + "num_input_tokens_seen": 70726760, + "step": 121910 + }, + { + "epoch": 18.158325886207923, + "grad_norm": 5.829472684126813e-06, + "learning_rate": 1.2806117663466704e-06, + "loss": 0.0, + "num_input_tokens_seen": 70729736, + "step": 121915 + }, + { + "epoch": 18.159070598748883, + "grad_norm": 9.992623745347373e-06, + "learning_rate": 1.279585308951914e-06, + "loss": 0.0004, + "num_input_tokens_seen": 70732424, + "step": 121920 + }, + { + "epoch": 18.159815311289844, + "grad_norm": 2.3098302335711196e-05, + "learning_rate": 1.2785592522894573e-06, + "loss": 0.0, + "num_input_tokens_seen": 70735400, + "step": 121925 + }, + { + "epoch": 18.1605600238308, + "grad_norm": 4.968821303918958e-05, + "learning_rate": 1.2775335963766317e-06, + "loss": 0.0, + "num_input_tokens_seen": 70738568, + "step": 121930 + }, + { + "epoch": 18.16130473637176, + "grad_norm": 5.051995231042383e-06, + "learning_rate": 1.2765083412307672e-06, + "loss": 0.0, + "num_input_tokens_seen": 70741672, + "step": 121935 + }, + { + "epoch": 18.162049448912718, + "grad_norm": 7.972133062139619e-06, + "learning_rate": 1.2754834868691834e-06, + "loss": 0.0, + "num_input_tokens_seen": 70744552, + "step": 121940 + }, + { + "epoch": 18.16279416145368, + "grad_norm": 3.143720959997154e-06, + "learning_rate": 1.2744590333091888e-06, + "loss": 0.0, + "num_input_tokens_seen": 70747368, + "step": 121945 + }, + { + "epoch": 18.16353887399464, + "grad_norm": 3.962338269047905e-06, + "learning_rate": 1.2734349805680974e-06, + "loss": 0.0, + "num_input_tokens_seen": 70750344, + "step": 121950 + }, + { + "epoch": 18.164283586535596, + "grad_norm": 3.862576704705134e-06, + "learning_rate": 1.2724113286632061e-06, + "loss": 0.0, + "num_input_tokens_seen": 70753448, + "step": 121955 + }, + { + "epoch": 18.165028299076557, + "grad_norm": 1.6099531876534456e-06, + "learning_rate": 1.2713880776118126e-06, + "loss": 0.0, + "num_input_tokens_seen": 70756328, + "step": 121960 + }, + { + "epoch": 18.165773011617517, + "grad_norm": 7.26455109543167e-05, + "learning_rate": 1.2703652274311973e-06, + "loss": 0.0, + "num_input_tokens_seen": 70759400, + "step": 121965 + }, + { + "epoch": 18.166517724158474, + "grad_norm": 2.5912345336109865e-06, + "learning_rate": 1.2693427781386464e-06, + "loss": 0.0, + "num_input_tokens_seen": 70762216, + "step": 121970 + }, + { + "epoch": 18.167262436699435, + "grad_norm": 2.1269506760290824e-06, + "learning_rate": 1.2683207297514293e-06, + "loss": 0.0, + "num_input_tokens_seen": 70765384, + "step": 121975 + }, + { + "epoch": 18.168007149240392, + "grad_norm": 0.00016477871395181865, + "learning_rate": 1.267299082286816e-06, + "loss": 0.0, + "num_input_tokens_seen": 70768296, + "step": 121980 + }, + { + "epoch": 18.168751861781352, + "grad_norm": 5.489250543178059e-05, + "learning_rate": 1.2662778357620614e-06, + "loss": 0.0, + "num_input_tokens_seen": 70771176, + "step": 121985 + }, + { + "epoch": 18.169496574322313, + "grad_norm": 3.0253622753662057e-05, + "learning_rate": 1.2652569901944244e-06, + "loss": 0.0, + "num_input_tokens_seen": 70774344, + "step": 121990 + }, + { + "epoch": 18.17024128686327, + "grad_norm": 0.00033911215723492205, + "learning_rate": 1.2642365456011467e-06, + "loss": 0.0, + "num_input_tokens_seen": 70777256, + "step": 121995 + }, + { + "epoch": 18.17098599940423, + "grad_norm": 4.293569872970693e-05, + "learning_rate": 1.2632165019994646e-06, + "loss": 0.0, + "num_input_tokens_seen": 70780136, + "step": 122000 + }, + { + "epoch": 18.171730711945187, + "grad_norm": 2.5076242309296504e-06, + "learning_rate": 1.262196859406617e-06, + "loss": 0.0, + "num_input_tokens_seen": 70782952, + "step": 122005 + }, + { + "epoch": 18.172475424486148, + "grad_norm": 2.8950723844900494e-06, + "learning_rate": 1.261177617839826e-06, + "loss": 0.0, + "num_input_tokens_seen": 70785640, + "step": 122010 + }, + { + "epoch": 18.17322013702711, + "grad_norm": 7.120059308363125e-05, + "learning_rate": 1.2601587773163142e-06, + "loss": 0.0, + "num_input_tokens_seen": 70788456, + "step": 122015 + }, + { + "epoch": 18.173964849568065, + "grad_norm": 1.831684585340554e-06, + "learning_rate": 1.259140337853293e-06, + "loss": 0.0, + "num_input_tokens_seen": 70791208, + "step": 122020 + }, + { + "epoch": 18.174709562109026, + "grad_norm": 3.048979488085024e-05, + "learning_rate": 1.258122299467962e-06, + "loss": 0.0, + "num_input_tokens_seen": 70794184, + "step": 122025 + }, + { + "epoch": 18.175454274649987, + "grad_norm": 0.0002650983806233853, + "learning_rate": 1.2571046621775273e-06, + "loss": 0.0, + "num_input_tokens_seen": 70797320, + "step": 122030 + }, + { + "epoch": 18.176198987190944, + "grad_norm": 6.561665486515267e-06, + "learning_rate": 1.2560874259991778e-06, + "loss": 0.0, + "num_input_tokens_seen": 70800680, + "step": 122035 + }, + { + "epoch": 18.176943699731904, + "grad_norm": 5.0776079660863616e-06, + "learning_rate": 1.2550705909500998e-06, + "loss": 0.0, + "num_input_tokens_seen": 70803688, + "step": 122040 + }, + { + "epoch": 18.17768841227286, + "grad_norm": 1.5911007722024806e-05, + "learning_rate": 1.2540541570474684e-06, + "loss": 0.0, + "num_input_tokens_seen": 70806568, + "step": 122045 + }, + { + "epoch": 18.17843312481382, + "grad_norm": 2.9654499940079404e-06, + "learning_rate": 1.2530381243084616e-06, + "loss": 0.0, + "num_input_tokens_seen": 70809544, + "step": 122050 + }, + { + "epoch": 18.179177837354782, + "grad_norm": 2.503630867067841e-06, + "learning_rate": 1.2520224927502405e-06, + "loss": 0.0, + "num_input_tokens_seen": 70812232, + "step": 122055 + }, + { + "epoch": 18.17992254989574, + "grad_norm": 9.650655556470156e-05, + "learning_rate": 1.251007262389961e-06, + "loss": 0.0, + "num_input_tokens_seen": 70815080, + "step": 122060 + }, + { + "epoch": 18.1806672624367, + "grad_norm": 7.5243342507747e-06, + "learning_rate": 1.2499924332447788e-06, + "loss": 0.0, + "num_input_tokens_seen": 70818088, + "step": 122065 + }, + { + "epoch": 18.18141197497766, + "grad_norm": 2.2348176571540534e-06, + "learning_rate": 1.24897800533183e-06, + "loss": 0.0, + "num_input_tokens_seen": 70821224, + "step": 122070 + }, + { + "epoch": 18.182156687518617, + "grad_norm": 0.00036431095213629305, + "learning_rate": 1.247963978668265e-06, + "loss": 0.0, + "num_input_tokens_seen": 70824424, + "step": 122075 + }, + { + "epoch": 18.182901400059578, + "grad_norm": 3.2436919354950078e-06, + "learning_rate": 1.2469503532712008e-06, + "loss": 0.0, + "num_input_tokens_seen": 70827368, + "step": 122080 + }, + { + "epoch": 18.183646112600535, + "grad_norm": 3.5638361168821575e-06, + "learning_rate": 1.2459371291577759e-06, + "loss": 0.0, + "num_input_tokens_seen": 70830120, + "step": 122085 + }, + { + "epoch": 18.184390825141495, + "grad_norm": 6.689814199489774e-06, + "learning_rate": 1.2449243063450967e-06, + "loss": 0.0, + "num_input_tokens_seen": 70833032, + "step": 122090 + }, + { + "epoch": 18.185135537682456, + "grad_norm": 4.0920454011939e-06, + "learning_rate": 1.2439118848502796e-06, + "loss": 0.0, + "num_input_tokens_seen": 70836008, + "step": 122095 + }, + { + "epoch": 18.185880250223413, + "grad_norm": 1.5977422663127072e-05, + "learning_rate": 1.2428998646904277e-06, + "loss": 0.0, + "num_input_tokens_seen": 70838984, + "step": 122100 + }, + { + "epoch": 18.186624962764373, + "grad_norm": 0.0004388465895317495, + "learning_rate": 1.241888245882633e-06, + "loss": 0.0, + "num_input_tokens_seen": 70842152, + "step": 122105 + }, + { + "epoch": 18.187369675305334, + "grad_norm": 0.0001743758184602484, + "learning_rate": 1.240877028443993e-06, + "loss": 0.0, + "num_input_tokens_seen": 70845160, + "step": 122110 + }, + { + "epoch": 18.18811438784629, + "grad_norm": 7.029796961433021e-06, + "learning_rate": 1.2398662123915827e-06, + "loss": 0.0, + "num_input_tokens_seen": 70848360, + "step": 122115 + }, + { + "epoch": 18.18885910038725, + "grad_norm": 4.758203431265429e-06, + "learning_rate": 1.2388557977424915e-06, + "loss": 0.0, + "num_input_tokens_seen": 70851208, + "step": 122120 + }, + { + "epoch": 18.18960381292821, + "grad_norm": 4.78668016512529e-06, + "learning_rate": 1.2378457845137775e-06, + "loss": 0.0, + "num_input_tokens_seen": 70853992, + "step": 122125 + }, + { + "epoch": 18.19034852546917, + "grad_norm": 6.656743062194437e-05, + "learning_rate": 1.236836172722511e-06, + "loss": 0.0, + "num_input_tokens_seen": 70856904, + "step": 122130 + }, + { + "epoch": 18.19109323801013, + "grad_norm": 0.00011729918333003297, + "learning_rate": 1.2358269623857416e-06, + "loss": 0.0, + "num_input_tokens_seen": 70859816, + "step": 122135 + }, + { + "epoch": 18.191837950551086, + "grad_norm": 0.0010898923501372337, + "learning_rate": 1.2348181535205283e-06, + "loss": 0.0, + "num_input_tokens_seen": 70862792, + "step": 122140 + }, + { + "epoch": 18.192582663092047, + "grad_norm": 2.6288141725672176e-06, + "learning_rate": 1.2338097461439047e-06, + "loss": 0.0, + "num_input_tokens_seen": 70865832, + "step": 122145 + }, + { + "epoch": 18.193327375633007, + "grad_norm": 5.1451220315357205e-06, + "learning_rate": 1.2328017402729098e-06, + "loss": 0.0, + "num_input_tokens_seen": 70869704, + "step": 122150 + }, + { + "epoch": 18.194072088173964, + "grad_norm": 1.4985596862970851e-05, + "learning_rate": 1.2317941359245744e-06, + "loss": 0.0, + "num_input_tokens_seen": 70872872, + "step": 122155 + }, + { + "epoch": 18.194816800714925, + "grad_norm": 4.10515212934115e-06, + "learning_rate": 1.2307869331159182e-06, + "loss": 0.0, + "num_input_tokens_seen": 70875816, + "step": 122160 + }, + { + "epoch": 18.195561513255882, + "grad_norm": 3.914188710041344e-06, + "learning_rate": 1.229780131863964e-06, + "loss": 0.0, + "num_input_tokens_seen": 70878568, + "step": 122165 + }, + { + "epoch": 18.196306225796842, + "grad_norm": 6.786635367461713e-06, + "learning_rate": 1.2287737321857118e-06, + "loss": 0.0, + "num_input_tokens_seen": 70881640, + "step": 122170 + }, + { + "epoch": 18.197050938337803, + "grad_norm": 8.172262823791243e-06, + "learning_rate": 1.2277677340981675e-06, + "loss": 0.0, + "num_input_tokens_seen": 70884424, + "step": 122175 + }, + { + "epoch": 18.19779565087876, + "grad_norm": 5.985737061564578e-06, + "learning_rate": 1.2267621376183286e-06, + "loss": 0.0, + "num_input_tokens_seen": 70887112, + "step": 122180 + }, + { + "epoch": 18.19854036341972, + "grad_norm": 1.0682463653211016e-05, + "learning_rate": 1.2257569427631789e-06, + "loss": 0.0, + "num_input_tokens_seen": 70889608, + "step": 122185 + }, + { + "epoch": 18.199285075960677, + "grad_norm": 3.831251888186671e-06, + "learning_rate": 1.2247521495497043e-06, + "loss": 0.0, + "num_input_tokens_seen": 70892488, + "step": 122190 + }, + { + "epoch": 18.200029788501638, + "grad_norm": 6.189294708747184e-06, + "learning_rate": 1.223747757994878e-06, + "loss": 0.0, + "num_input_tokens_seen": 70895272, + "step": 122195 + }, + { + "epoch": 18.2007745010426, + "grad_norm": 5.9483513723535e-06, + "learning_rate": 1.2227437681156695e-06, + "loss": 0.0, + "num_input_tokens_seen": 70898216, + "step": 122200 + }, + { + "epoch": 18.201519213583556, + "grad_norm": 6.4124083110073116e-06, + "learning_rate": 1.2217401799290345e-06, + "loss": 0.0, + "num_input_tokens_seen": 70901160, + "step": 122205 + }, + { + "epoch": 18.202263926124516, + "grad_norm": 0.00052543671336025, + "learning_rate": 1.2207369934519347e-06, + "loss": 0.0, + "num_input_tokens_seen": 70904264, + "step": 122210 + }, + { + "epoch": 18.203008638665477, + "grad_norm": 4.263880782673368e-06, + "learning_rate": 1.2197342087013146e-06, + "loss": 0.0, + "num_input_tokens_seen": 70907848, + "step": 122215 + }, + { + "epoch": 18.203753351206434, + "grad_norm": 3.3614576295804e-06, + "learning_rate": 1.2187318256941166e-06, + "loss": 0.0, + "num_input_tokens_seen": 70910664, + "step": 122220 + }, + { + "epoch": 18.204498063747394, + "grad_norm": 6.371176277752966e-05, + "learning_rate": 1.2177298444472741e-06, + "loss": 0.0, + "num_input_tokens_seen": 70913800, + "step": 122225 + }, + { + "epoch": 18.20524277628835, + "grad_norm": 1.3519298590836115e-05, + "learning_rate": 1.2167282649777123e-06, + "loss": 0.0, + "num_input_tokens_seen": 70916840, + "step": 122230 + }, + { + "epoch": 18.20598748882931, + "grad_norm": 0.0012541186297312379, + "learning_rate": 1.2157270873023596e-06, + "loss": 0.0, + "num_input_tokens_seen": 70919656, + "step": 122235 + }, + { + "epoch": 18.206732201370272, + "grad_norm": 7.299616117961705e-05, + "learning_rate": 1.2147263114381191e-06, + "loss": 0.0, + "num_input_tokens_seen": 70922408, + "step": 122240 + }, + { + "epoch": 18.20747691391123, + "grad_norm": 2.8937673050677404e-05, + "learning_rate": 1.213725937401905e-06, + "loss": 0.0, + "num_input_tokens_seen": 70925096, + "step": 122245 + }, + { + "epoch": 18.20822162645219, + "grad_norm": 2.4592400222900324e-06, + "learning_rate": 1.2127259652106149e-06, + "loss": 0.0, + "num_input_tokens_seen": 70927656, + "step": 122250 + }, + { + "epoch": 18.20896633899315, + "grad_norm": 4.921489562548231e-06, + "learning_rate": 1.2117263948811464e-06, + "loss": 0.0, + "num_input_tokens_seen": 70930792, + "step": 122255 + }, + { + "epoch": 18.209711051534107, + "grad_norm": 1.4361741705215536e-05, + "learning_rate": 1.2107272264303831e-06, + "loss": 0.0, + "num_input_tokens_seen": 70933736, + "step": 122260 + }, + { + "epoch": 18.210455764075068, + "grad_norm": 3.3642059861449525e-05, + "learning_rate": 1.2097284598752034e-06, + "loss": 0.0, + "num_input_tokens_seen": 70936616, + "step": 122265 + }, + { + "epoch": 18.211200476616025, + "grad_norm": 3.0744906780455494e-06, + "learning_rate": 1.2087300952324826e-06, + "loss": 0.0, + "num_input_tokens_seen": 70939432, + "step": 122270 + }, + { + "epoch": 18.211945189156985, + "grad_norm": 3.776980520342477e-05, + "learning_rate": 1.2077321325190849e-06, + "loss": 0.0, + "num_input_tokens_seen": 70942568, + "step": 122275 + }, + { + "epoch": 18.212689901697946, + "grad_norm": 8.52598350320477e-06, + "learning_rate": 1.206734571751872e-06, + "loss": 0.0, + "num_input_tokens_seen": 70945352, + "step": 122280 + }, + { + "epoch": 18.213434614238903, + "grad_norm": 2.216807843069546e-05, + "learning_rate": 1.2057374129476968e-06, + "loss": 0.0, + "num_input_tokens_seen": 70947976, + "step": 122285 + }, + { + "epoch": 18.214179326779863, + "grad_norm": 0.00011089231702499092, + "learning_rate": 1.2047406561234042e-06, + "loss": 0.0, + "num_input_tokens_seen": 70950664, + "step": 122290 + }, + { + "epoch": 18.214924039320824, + "grad_norm": 0.0006185111706145108, + "learning_rate": 1.203744301295831e-06, + "loss": 0.0, + "num_input_tokens_seen": 70953704, + "step": 122295 + }, + { + "epoch": 18.21566875186178, + "grad_norm": 4.533555056696059e-06, + "learning_rate": 1.2027483484818165e-06, + "loss": 0.0, + "num_input_tokens_seen": 70956584, + "step": 122300 + }, + { + "epoch": 18.21641346440274, + "grad_norm": 2.6485402486287057e-05, + "learning_rate": 1.2017527976981828e-06, + "loss": 0.0, + "num_input_tokens_seen": 70959464, + "step": 122305 + }, + { + "epoch": 18.2171581769437, + "grad_norm": 1.9108059859718196e-05, + "learning_rate": 1.200757648961745e-06, + "loss": 0.0, + "num_input_tokens_seen": 70962536, + "step": 122310 + }, + { + "epoch": 18.21790288948466, + "grad_norm": 4.141627505305223e-05, + "learning_rate": 1.1997629022893198e-06, + "loss": 0.0, + "num_input_tokens_seen": 70965128, + "step": 122315 + }, + { + "epoch": 18.21864760202562, + "grad_norm": 3.190422603438492e-06, + "learning_rate": 1.1987685576977131e-06, + "loss": 0.0, + "num_input_tokens_seen": 70967880, + "step": 122320 + }, + { + "epoch": 18.219392314566576, + "grad_norm": 4.811969120055437e-06, + "learning_rate": 1.19777461520372e-06, + "loss": 0.0, + "num_input_tokens_seen": 70970792, + "step": 122325 + }, + { + "epoch": 18.220137027107537, + "grad_norm": 8.54711379361106e-06, + "learning_rate": 1.19678107482413e-06, + "loss": 0.0, + "num_input_tokens_seen": 70973704, + "step": 122330 + }, + { + "epoch": 18.220881739648497, + "grad_norm": 1.4168585948937107e-05, + "learning_rate": 1.1957879365757346e-06, + "loss": 0.0, + "num_input_tokens_seen": 70976584, + "step": 122335 + }, + { + "epoch": 18.221626452189454, + "grad_norm": 0.00020997074898332357, + "learning_rate": 1.1947952004753044e-06, + "loss": 0.0, + "num_input_tokens_seen": 70979496, + "step": 122340 + }, + { + "epoch": 18.222371164730415, + "grad_norm": 1.989889824471902e-06, + "learning_rate": 1.1938028665396173e-06, + "loss": 0.0, + "num_input_tokens_seen": 70982408, + "step": 122345 + }, + { + "epoch": 18.223115877271372, + "grad_norm": 5.680946560460143e-05, + "learning_rate": 1.1928109347854377e-06, + "loss": 0.0, + "num_input_tokens_seen": 70985192, + "step": 122350 + }, + { + "epoch": 18.223860589812332, + "grad_norm": 5.9624189816531725e-06, + "learning_rate": 1.1918194052295162e-06, + "loss": 0.0, + "num_input_tokens_seen": 70988168, + "step": 122355 + }, + { + "epoch": 18.224605302353293, + "grad_norm": 3.238810904804268e-06, + "learning_rate": 1.1908282778886115e-06, + "loss": 0.0, + "num_input_tokens_seen": 70991080, + "step": 122360 + }, + { + "epoch": 18.22535001489425, + "grad_norm": 6.657120593445143e-06, + "learning_rate": 1.1898375527794603e-06, + "loss": 0.0, + "num_input_tokens_seen": 70994024, + "step": 122365 + }, + { + "epoch": 18.22609472743521, + "grad_norm": 6.551151727762772e-06, + "learning_rate": 1.1888472299188102e-06, + "loss": 0.0, + "num_input_tokens_seen": 70997128, + "step": 122370 + }, + { + "epoch": 18.226839439976168, + "grad_norm": 4.430385615705745e-06, + "learning_rate": 1.1878573093233814e-06, + "loss": 0.0, + "num_input_tokens_seen": 71000104, + "step": 122375 + }, + { + "epoch": 18.227584152517128, + "grad_norm": 4.015373178845039e-06, + "learning_rate": 1.1868677910099018e-06, + "loss": 0.0, + "num_input_tokens_seen": 71002856, + "step": 122380 + }, + { + "epoch": 18.22832886505809, + "grad_norm": 9.953095286618918e-06, + "learning_rate": 1.1858786749950919e-06, + "loss": 0.0, + "num_input_tokens_seen": 71005960, + "step": 122385 + }, + { + "epoch": 18.229073577599046, + "grad_norm": 5.265138952381676e-06, + "learning_rate": 1.184889961295657e-06, + "loss": 0.0, + "num_input_tokens_seen": 71008744, + "step": 122390 + }, + { + "epoch": 18.229818290140006, + "grad_norm": 1.533412614662666e-05, + "learning_rate": 1.1839016499283013e-06, + "loss": 0.0, + "num_input_tokens_seen": 71011688, + "step": 122395 + }, + { + "epoch": 18.230563002680967, + "grad_norm": 0.0005993745871819556, + "learning_rate": 1.1829137409097191e-06, + "loss": 0.0, + "num_input_tokens_seen": 71014824, + "step": 122400 + }, + { + "epoch": 18.231307715221924, + "grad_norm": 9.071118256542832e-06, + "learning_rate": 1.1819262342566056e-06, + "loss": 0.0, + "num_input_tokens_seen": 71017992, + "step": 122405 + }, + { + "epoch": 18.232052427762884, + "grad_norm": 3.346432276885025e-05, + "learning_rate": 1.1809391299856365e-06, + "loss": 0.0, + "num_input_tokens_seen": 71020936, + "step": 122410 + }, + { + "epoch": 18.23279714030384, + "grad_norm": 0.00010942426888504997, + "learning_rate": 1.1799524281134983e-06, + "loss": 0.0, + "num_input_tokens_seen": 71023656, + "step": 122415 + }, + { + "epoch": 18.2335418528448, + "grad_norm": 5.009065716876648e-06, + "learning_rate": 1.1789661286568472e-06, + "loss": 0.0, + "num_input_tokens_seen": 71026376, + "step": 122420 + }, + { + "epoch": 18.234286565385762, + "grad_norm": 6.830157508375123e-05, + "learning_rate": 1.1779802316323585e-06, + "loss": 0.0, + "num_input_tokens_seen": 71029032, + "step": 122425 + }, + { + "epoch": 18.23503127792672, + "grad_norm": 3.810129555859021e-06, + "learning_rate": 1.176994737056683e-06, + "loss": 0.0, + "num_input_tokens_seen": 71032072, + "step": 122430 + }, + { + "epoch": 18.23577599046768, + "grad_norm": 0.002194278407841921, + "learning_rate": 1.176009644946463e-06, + "loss": 0.0, + "num_input_tokens_seen": 71034728, + "step": 122435 + }, + { + "epoch": 18.23652070300864, + "grad_norm": 6.827521247032564e-06, + "learning_rate": 1.1750249553183518e-06, + "loss": 0.0, + "num_input_tokens_seen": 71037480, + "step": 122440 + }, + { + "epoch": 18.237265415549597, + "grad_norm": 2.2577139588975115e-06, + "learning_rate": 1.1740406681889748e-06, + "loss": 0.0001, + "num_input_tokens_seen": 71040456, + "step": 122445 + }, + { + "epoch": 18.238010128090558, + "grad_norm": 0.0001016875758068636, + "learning_rate": 1.173056783574969e-06, + "loss": 0.0, + "num_input_tokens_seen": 71043400, + "step": 122450 + }, + { + "epoch": 18.238754840631515, + "grad_norm": 2.6972300474881195e-05, + "learning_rate": 1.1720733014929514e-06, + "loss": 0.0, + "num_input_tokens_seen": 71046376, + "step": 122455 + }, + { + "epoch": 18.239499553172475, + "grad_norm": 1.3151126950106118e-05, + "learning_rate": 1.1710902219595366e-06, + "loss": 0.0, + "num_input_tokens_seen": 71049512, + "step": 122460 + }, + { + "epoch": 18.240244265713436, + "grad_norm": 0.0019760774448513985, + "learning_rate": 1.1701075449913363e-06, + "loss": 0.0, + "num_input_tokens_seen": 71052904, + "step": 122465 + }, + { + "epoch": 18.240988978254393, + "grad_norm": 6.5276653913315386e-06, + "learning_rate": 1.1691252706049456e-06, + "loss": 0.0, + "num_input_tokens_seen": 71055880, + "step": 122470 + }, + { + "epoch": 18.241733690795353, + "grad_norm": 3.829316938208649e-06, + "learning_rate": 1.168143398816965e-06, + "loss": 0.0, + "num_input_tokens_seen": 71058920, + "step": 122475 + }, + { + "epoch": 18.242478403336314, + "grad_norm": 4.711376732302597e-06, + "learning_rate": 1.1671619296439785e-06, + "loss": 0.0, + "num_input_tokens_seen": 71061928, + "step": 122480 + }, + { + "epoch": 18.24322311587727, + "grad_norm": 5.012296242057346e-06, + "learning_rate": 1.16618086310257e-06, + "loss": 0.0, + "num_input_tokens_seen": 71064872, + "step": 122485 + }, + { + "epoch": 18.24396782841823, + "grad_norm": 1.3916880561737344e-05, + "learning_rate": 1.1652001992093097e-06, + "loss": 0.0, + "num_input_tokens_seen": 71067592, + "step": 122490 + }, + { + "epoch": 18.24471254095919, + "grad_norm": 2.4426044546999037e-05, + "learning_rate": 1.1642199379807706e-06, + "loss": 0.0, + "num_input_tokens_seen": 71070280, + "step": 122495 + }, + { + "epoch": 18.24545725350015, + "grad_norm": 3.335553628858179e-06, + "learning_rate": 1.1632400794335084e-06, + "loss": 0.0, + "num_input_tokens_seen": 71073320, + "step": 122500 + }, + { + "epoch": 18.24620196604111, + "grad_norm": 4.035511301481165e-05, + "learning_rate": 1.16226062358408e-06, + "loss": 0.0, + "num_input_tokens_seen": 71076008, + "step": 122505 + }, + { + "epoch": 18.246946678582066, + "grad_norm": 7.87738918006653e-06, + "learning_rate": 1.1612815704490298e-06, + "loss": 0.0, + "num_input_tokens_seen": 71078920, + "step": 122510 + }, + { + "epoch": 18.247691391123027, + "grad_norm": 6.192045930220047e-06, + "learning_rate": 1.1603029200448978e-06, + "loss": 0.0, + "num_input_tokens_seen": 71081640, + "step": 122515 + }, + { + "epoch": 18.248436103663984, + "grad_norm": 4.568410986394156e-06, + "learning_rate": 1.1593246723882206e-06, + "loss": 0.0, + "num_input_tokens_seen": 71084520, + "step": 122520 + }, + { + "epoch": 18.249180816204944, + "grad_norm": 8.46687180455774e-06, + "learning_rate": 1.158346827495524e-06, + "loss": 0.0031, + "num_input_tokens_seen": 71087304, + "step": 122525 + }, + { + "epoch": 18.249925528745905, + "grad_norm": 2.899090986829833e-06, + "learning_rate": 1.1573693853833224e-06, + "loss": 0.0, + "num_input_tokens_seen": 71090248, + "step": 122530 + }, + { + "epoch": 18.250670241286862, + "grad_norm": 3.492856922093779e-05, + "learning_rate": 1.156392346068133e-06, + "loss": 0.0, + "num_input_tokens_seen": 71093128, + "step": 122535 + }, + { + "epoch": 18.251414953827823, + "grad_norm": 3.3706306567182764e-05, + "learning_rate": 1.1554157095664625e-06, + "loss": 0.0, + "num_input_tokens_seen": 71097192, + "step": 122540 + }, + { + "epoch": 18.252159666368783, + "grad_norm": 5.043727924203267e-06, + "learning_rate": 1.1544394758948112e-06, + "loss": 0.0, + "num_input_tokens_seen": 71100072, + "step": 122545 + }, + { + "epoch": 18.25290437890974, + "grad_norm": 0.010491114109754562, + "learning_rate": 1.1534636450696634e-06, + "loss": 0.0, + "num_input_tokens_seen": 71102664, + "step": 122550 + }, + { + "epoch": 18.2536490914507, + "grad_norm": 5.2663426686194725e-06, + "learning_rate": 1.1524882171075168e-06, + "loss": 0.0, + "num_input_tokens_seen": 71105256, + "step": 122555 + }, + { + "epoch": 18.254393803991658, + "grad_norm": 6.0429697441577446e-06, + "learning_rate": 1.151513192024839e-06, + "loss": 0.0, + "num_input_tokens_seen": 71108328, + "step": 122560 + }, + { + "epoch": 18.255138516532618, + "grad_norm": 4.719487606053008e-06, + "learning_rate": 1.150538569838111e-06, + "loss": 0.0, + "num_input_tokens_seen": 71111112, + "step": 122565 + }, + { + "epoch": 18.25588322907358, + "grad_norm": 0.00020673226390499622, + "learning_rate": 1.1495643505637922e-06, + "loss": 0.0, + "num_input_tokens_seen": 71114184, + "step": 122570 + }, + { + "epoch": 18.256627941614536, + "grad_norm": 0.0009666556725278497, + "learning_rate": 1.148590534218344e-06, + "loss": 0.0, + "num_input_tokens_seen": 71117000, + "step": 122575 + }, + { + "epoch": 18.257372654155496, + "grad_norm": 1.4270310202846304e-05, + "learning_rate": 1.1476171208182146e-06, + "loss": 0.0, + "num_input_tokens_seen": 71120200, + "step": 122580 + }, + { + "epoch": 18.258117366696457, + "grad_norm": 0.0002724858350120485, + "learning_rate": 1.1466441103798575e-06, + "loss": 0.0, + "num_input_tokens_seen": 71123144, + "step": 122585 + }, + { + "epoch": 18.258862079237414, + "grad_norm": 7.913046283647418e-05, + "learning_rate": 1.1456715029197012e-06, + "loss": 0.0, + "num_input_tokens_seen": 71125960, + "step": 122590 + }, + { + "epoch": 18.259606791778374, + "grad_norm": 8.984578016679734e-05, + "learning_rate": 1.1446992984541827e-06, + "loss": 0.0, + "num_input_tokens_seen": 71128840, + "step": 122595 + }, + { + "epoch": 18.26035150431933, + "grad_norm": 1.4410842595680151e-05, + "learning_rate": 1.143727496999722e-06, + "loss": 0.0, + "num_input_tokens_seen": 71131624, + "step": 122600 + }, + { + "epoch": 18.26109621686029, + "grad_norm": 0.0016404299531131983, + "learning_rate": 1.1427560985727392e-06, + "loss": 0.0, + "num_input_tokens_seen": 71134600, + "step": 122605 + }, + { + "epoch": 18.261840929401252, + "grad_norm": 4.127216925553512e-06, + "learning_rate": 1.1417851031896438e-06, + "loss": 0.0, + "num_input_tokens_seen": 71137352, + "step": 122610 + }, + { + "epoch": 18.26258564194221, + "grad_norm": 2.544092922107666e-06, + "learning_rate": 1.140814510866839e-06, + "loss": 0.0, + "num_input_tokens_seen": 71140200, + "step": 122615 + }, + { + "epoch": 18.26333035448317, + "grad_norm": 5.672324277838925e-06, + "learning_rate": 1.1398443216207282e-06, + "loss": 0.0, + "num_input_tokens_seen": 71143080, + "step": 122620 + }, + { + "epoch": 18.26407506702413, + "grad_norm": 3.801768798439298e-06, + "learning_rate": 1.138874535467696e-06, + "loss": 0.0, + "num_input_tokens_seen": 71145928, + "step": 122625 + }, + { + "epoch": 18.264819779565087, + "grad_norm": 8.204136065614875e-06, + "learning_rate": 1.1379051524241236e-06, + "loss": 0.0, + "num_input_tokens_seen": 71148744, + "step": 122630 + }, + { + "epoch": 18.265564492106048, + "grad_norm": 7.73272131482372e-06, + "learning_rate": 1.1369361725063948e-06, + "loss": 0.0, + "num_input_tokens_seen": 71151656, + "step": 122635 + }, + { + "epoch": 18.266309204647005, + "grad_norm": 2.4649430997669697e-05, + "learning_rate": 1.1359675957308745e-06, + "loss": 0.0, + "num_input_tokens_seen": 71154696, + "step": 122640 + }, + { + "epoch": 18.267053917187965, + "grad_norm": 4.134422397328308e-06, + "learning_rate": 1.1349994221139276e-06, + "loss": 0.0, + "num_input_tokens_seen": 71157512, + "step": 122645 + }, + { + "epoch": 18.267798629728926, + "grad_norm": 1.070688904292183e-05, + "learning_rate": 1.134031651671913e-06, + "loss": 0.0, + "num_input_tokens_seen": 71160424, + "step": 122650 + }, + { + "epoch": 18.268543342269883, + "grad_norm": 2.878499981306959e-05, + "learning_rate": 1.133064284421176e-06, + "loss": 0.0, + "num_input_tokens_seen": 71163336, + "step": 122655 + }, + { + "epoch": 18.269288054810843, + "grad_norm": 5.471068652695976e-05, + "learning_rate": 1.132097320378056e-06, + "loss": 0.0, + "num_input_tokens_seen": 71166408, + "step": 122660 + }, + { + "epoch": 18.270032767351804, + "grad_norm": 6.264121566346148e-06, + "learning_rate": 1.1311307595588987e-06, + "loss": 0.0, + "num_input_tokens_seen": 71169224, + "step": 122665 + }, + { + "epoch": 18.27077747989276, + "grad_norm": 3.7733868794020964e-06, + "learning_rate": 1.130164601980027e-06, + "loss": 0.0, + "num_input_tokens_seen": 71172136, + "step": 122670 + }, + { + "epoch": 18.27152219243372, + "grad_norm": 4.24944119004067e-06, + "learning_rate": 1.1291988476577614e-06, + "loss": 0.0, + "num_input_tokens_seen": 71174952, + "step": 122675 + }, + { + "epoch": 18.27226690497468, + "grad_norm": 3.3938315482373582e-06, + "learning_rate": 1.1282334966084246e-06, + "loss": 0.0, + "num_input_tokens_seen": 71177992, + "step": 122680 + }, + { + "epoch": 18.27301161751564, + "grad_norm": 4.131301011511823e-06, + "learning_rate": 1.1272685488483148e-06, + "loss": 0.0, + "num_input_tokens_seen": 71180808, + "step": 122685 + }, + { + "epoch": 18.2737563300566, + "grad_norm": 0.0005537191173061728, + "learning_rate": 1.126304004393744e-06, + "loss": 0.0, + "num_input_tokens_seen": 71183976, + "step": 122690 + }, + { + "epoch": 18.274501042597556, + "grad_norm": 2.3215105102281086e-05, + "learning_rate": 1.125339863261002e-06, + "loss": 0.0, + "num_input_tokens_seen": 71187528, + "step": 122695 + }, + { + "epoch": 18.275245755138517, + "grad_norm": 3.3666581202851376e-06, + "learning_rate": 1.1243761254663781e-06, + "loss": 0.0, + "num_input_tokens_seen": 71190984, + "step": 122700 + }, + { + "epoch": 18.275990467679474, + "grad_norm": 2.9849184102204163e-06, + "learning_rate": 1.1234127910261543e-06, + "loss": 0.0, + "num_input_tokens_seen": 71194056, + "step": 122705 + }, + { + "epoch": 18.276735180220435, + "grad_norm": 2.5438157535973005e-06, + "learning_rate": 1.1224498599566009e-06, + "loss": 0.0, + "num_input_tokens_seen": 71196872, + "step": 122710 + }, + { + "epoch": 18.277479892761395, + "grad_norm": 0.001919104834087193, + "learning_rate": 1.1214873322739933e-06, + "loss": 0.0, + "num_input_tokens_seen": 71199720, + "step": 122715 + }, + { + "epoch": 18.278224605302352, + "grad_norm": 4.686986358137801e-05, + "learning_rate": 1.1205252079945882e-06, + "loss": 0.0, + "num_input_tokens_seen": 71202536, + "step": 122720 + }, + { + "epoch": 18.278969317843313, + "grad_norm": 4.6439040488621686e-06, + "learning_rate": 1.1195634871346395e-06, + "loss": 0.0, + "num_input_tokens_seen": 71205512, + "step": 122725 + }, + { + "epoch": 18.279714030384273, + "grad_norm": 2.0452175704122055e-06, + "learning_rate": 1.1186021697103893e-06, + "loss": 0.0, + "num_input_tokens_seen": 71208456, + "step": 122730 + }, + { + "epoch": 18.28045874292523, + "grad_norm": 2.8461563488235697e-06, + "learning_rate": 1.1176412557380888e-06, + "loss": 0.0, + "num_input_tokens_seen": 71211432, + "step": 122735 + }, + { + "epoch": 18.28120345546619, + "grad_norm": 5.635433808492962e-06, + "learning_rate": 1.116680745233961e-06, + "loss": 0.0, + "num_input_tokens_seen": 71214248, + "step": 122740 + }, + { + "epoch": 18.281948168007148, + "grad_norm": 3.392432972759707e-06, + "learning_rate": 1.1157206382142433e-06, + "loss": 0.0, + "num_input_tokens_seen": 71217352, + "step": 122745 + }, + { + "epoch": 18.282692880548108, + "grad_norm": 4.007199549960205e-06, + "learning_rate": 1.1147609346951526e-06, + "loss": 0.0, + "num_input_tokens_seen": 71220008, + "step": 122750 + }, + { + "epoch": 18.28343759308907, + "grad_norm": 3.101976517427829e-06, + "learning_rate": 1.113801634692893e-06, + "loss": 0.0, + "num_input_tokens_seen": 71222792, + "step": 122755 + }, + { + "epoch": 18.284182305630026, + "grad_norm": 7.4953186413040385e-06, + "learning_rate": 1.1128427382236823e-06, + "loss": 0.0, + "num_input_tokens_seen": 71225960, + "step": 122760 + }, + { + "epoch": 18.284927018170986, + "grad_norm": 3.070403408855782e-06, + "learning_rate": 1.1118842453037126e-06, + "loss": 0.0, + "num_input_tokens_seen": 71229448, + "step": 122765 + }, + { + "epoch": 18.285671730711947, + "grad_norm": 7.674126391066238e-05, + "learning_rate": 1.1109261559491823e-06, + "loss": 0.0, + "num_input_tokens_seen": 71232584, + "step": 122770 + }, + { + "epoch": 18.286416443252904, + "grad_norm": 5.902141856495291e-05, + "learning_rate": 1.109968470176273e-06, + "loss": 0.0, + "num_input_tokens_seen": 71235112, + "step": 122775 + }, + { + "epoch": 18.287161155793864, + "grad_norm": 3.7219042496872135e-06, + "learning_rate": 1.109011188001169e-06, + "loss": 0.0, + "num_input_tokens_seen": 71237928, + "step": 122780 + }, + { + "epoch": 18.28790586833482, + "grad_norm": 7.284750608960167e-05, + "learning_rate": 1.1080543094400374e-06, + "loss": 0.0, + "num_input_tokens_seen": 71240968, + "step": 122785 + }, + { + "epoch": 18.28865058087578, + "grad_norm": 4.6984561777208e-05, + "learning_rate": 1.1070978345090494e-06, + "loss": 0.0, + "num_input_tokens_seen": 71243880, + "step": 122790 + }, + { + "epoch": 18.289395293416742, + "grad_norm": 9.112006227951497e-05, + "learning_rate": 1.1061417632243554e-06, + "loss": 0.0, + "num_input_tokens_seen": 71246792, + "step": 122795 + }, + { + "epoch": 18.2901400059577, + "grad_norm": 5.35419621883193e-06, + "learning_rate": 1.105186095602112e-06, + "loss": 0.0, + "num_input_tokens_seen": 71249480, + "step": 122800 + }, + { + "epoch": 18.29088471849866, + "grad_norm": 3.333340282551944e-05, + "learning_rate": 1.1042308316584649e-06, + "loss": 0.0, + "num_input_tokens_seen": 71252072, + "step": 122805 + }, + { + "epoch": 18.29162943103962, + "grad_norm": 2.412549974906142e-06, + "learning_rate": 1.1032759714095481e-06, + "loss": 0.0, + "num_input_tokens_seen": 71255048, + "step": 122810 + }, + { + "epoch": 18.292374143580577, + "grad_norm": 4.985641226085136e-06, + "learning_rate": 1.1023215148714988e-06, + "loss": 0.0, + "num_input_tokens_seen": 71257928, + "step": 122815 + }, + { + "epoch": 18.293118856121538, + "grad_norm": 0.00012333280756138265, + "learning_rate": 1.1013674620604376e-06, + "loss": 0.0, + "num_input_tokens_seen": 71260872, + "step": 122820 + }, + { + "epoch": 18.293863568662495, + "grad_norm": 1.3755684449279215e-05, + "learning_rate": 1.1004138129924874e-06, + "loss": 0.0, + "num_input_tokens_seen": 71263464, + "step": 122825 + }, + { + "epoch": 18.294608281203455, + "grad_norm": 5.097489975014469e-06, + "learning_rate": 1.0994605676837521e-06, + "loss": 0.0, + "num_input_tokens_seen": 71266152, + "step": 122830 + }, + { + "epoch": 18.295352993744416, + "grad_norm": 8.889727723726537e-06, + "learning_rate": 1.0985077261503384e-06, + "loss": 0.0, + "num_input_tokens_seen": 71269064, + "step": 122835 + }, + { + "epoch": 18.296097706285373, + "grad_norm": 9.466692063142546e-06, + "learning_rate": 1.0975552884083473e-06, + "loss": 0.0, + "num_input_tokens_seen": 71271656, + "step": 122840 + }, + { + "epoch": 18.296842418826333, + "grad_norm": 4.011273176729446e-06, + "learning_rate": 1.096603254473863e-06, + "loss": 0.0, + "num_input_tokens_seen": 71274536, + "step": 122845 + }, + { + "epoch": 18.297587131367294, + "grad_norm": 2.120507815561723e-05, + "learning_rate": 1.0956516243629754e-06, + "loss": 0.0, + "num_input_tokens_seen": 71277128, + "step": 122850 + }, + { + "epoch": 18.29833184390825, + "grad_norm": 0.00016277046233881265, + "learning_rate": 1.094700398091758e-06, + "loss": 0.0, + "num_input_tokens_seen": 71280168, + "step": 122855 + }, + { + "epoch": 18.29907655644921, + "grad_norm": 2.9494426598830614e-06, + "learning_rate": 1.093749575676281e-06, + "loss": 0.0, + "num_input_tokens_seen": 71283112, + "step": 122860 + }, + { + "epoch": 18.29982126899017, + "grad_norm": 3.9294714042625856e-06, + "learning_rate": 1.092799157132604e-06, + "loss": 0.0, + "num_input_tokens_seen": 71285992, + "step": 122865 + }, + { + "epoch": 18.30056598153113, + "grad_norm": 1.756886013026815e-05, + "learning_rate": 1.091849142476792e-06, + "loss": 0.0, + "num_input_tokens_seen": 71288808, + "step": 122870 + }, + { + "epoch": 18.30131069407209, + "grad_norm": 2.2125404939288273e-06, + "learning_rate": 1.0908995317248898e-06, + "loss": 0.0, + "num_input_tokens_seen": 71291656, + "step": 122875 + }, + { + "epoch": 18.302055406613047, + "grad_norm": 2.468339289407595e-06, + "learning_rate": 1.0899503248929355e-06, + "loss": 0.0, + "num_input_tokens_seen": 71294600, + "step": 122880 + }, + { + "epoch": 18.302800119154007, + "grad_norm": 3.3598023492231732e-06, + "learning_rate": 1.0890015219969713e-06, + "loss": 0.0, + "num_input_tokens_seen": 71297544, + "step": 122885 + }, + { + "epoch": 18.303544831694964, + "grad_norm": 7.67647179600317e-06, + "learning_rate": 1.0880531230530233e-06, + "loss": 0.0, + "num_input_tokens_seen": 71300520, + "step": 122890 + }, + { + "epoch": 18.304289544235925, + "grad_norm": 4.196985173621215e-06, + "learning_rate": 1.0871051280771178e-06, + "loss": 0.0, + "num_input_tokens_seen": 71303496, + "step": 122895 + }, + { + "epoch": 18.305034256776885, + "grad_norm": 1.678089802226168e-06, + "learning_rate": 1.0861575370852612e-06, + "loss": 0.0, + "num_input_tokens_seen": 71306504, + "step": 122900 + }, + { + "epoch": 18.305778969317842, + "grad_norm": 2.6051181976072257e-06, + "learning_rate": 1.085210350093474e-06, + "loss": 0.0, + "num_input_tokens_seen": 71309384, + "step": 122905 + }, + { + "epoch": 18.306523681858803, + "grad_norm": 1.6670392142259516e-05, + "learning_rate": 1.084263567117752e-06, + "loss": 0.0, + "num_input_tokens_seen": 71312392, + "step": 122910 + }, + { + "epoch": 18.307268394399763, + "grad_norm": 3.86641158911516e-06, + "learning_rate": 1.083317188174085e-06, + "loss": 0.0, + "num_input_tokens_seen": 71315304, + "step": 122915 + }, + { + "epoch": 18.30801310694072, + "grad_norm": 3.7326612982724328e-06, + "learning_rate": 1.0823712132784713e-06, + "loss": 0.0, + "num_input_tokens_seen": 71318184, + "step": 122920 + }, + { + "epoch": 18.30875781948168, + "grad_norm": 3.759582250495441e-05, + "learning_rate": 1.0814256424468872e-06, + "loss": 0.0, + "num_input_tokens_seen": 71320904, + "step": 122925 + }, + { + "epoch": 18.309502532022638, + "grad_norm": 4.264414201315958e-06, + "learning_rate": 1.0804804756953057e-06, + "loss": 0.0, + "num_input_tokens_seen": 71323784, + "step": 122930 + }, + { + "epoch": 18.310247244563598, + "grad_norm": 0.015856506302952766, + "learning_rate": 1.079535713039695e-06, + "loss": 0.0, + "num_input_tokens_seen": 71326728, + "step": 122935 + }, + { + "epoch": 18.31099195710456, + "grad_norm": 3.2234456739388406e-06, + "learning_rate": 1.078591354496017e-06, + "loss": 0.0, + "num_input_tokens_seen": 71329800, + "step": 122940 + }, + { + "epoch": 18.311736669645516, + "grad_norm": 0.00037855433765798807, + "learning_rate": 1.0776474000802255e-06, + "loss": 0.0, + "num_input_tokens_seen": 71332904, + "step": 122945 + }, + { + "epoch": 18.312481382186476, + "grad_norm": 2.146118958989973e-06, + "learning_rate": 1.0767038498082694e-06, + "loss": 0.0, + "num_input_tokens_seen": 71335592, + "step": 122950 + }, + { + "epoch": 18.313226094727437, + "grad_norm": 2.3571392375743017e-06, + "learning_rate": 1.0757607036960853e-06, + "loss": 0.0, + "num_input_tokens_seen": 71338856, + "step": 122955 + }, + { + "epoch": 18.313970807268394, + "grad_norm": 2.7606311050476506e-06, + "learning_rate": 1.0748179617596082e-06, + "loss": 0.0, + "num_input_tokens_seen": 71342120, + "step": 122960 + }, + { + "epoch": 18.314715519809354, + "grad_norm": 1.0509605999686755e-05, + "learning_rate": 1.0738756240147668e-06, + "loss": 0.0, + "num_input_tokens_seen": 71344744, + "step": 122965 + }, + { + "epoch": 18.31546023235031, + "grad_norm": 0.0005565733299590647, + "learning_rate": 1.0729336904774762e-06, + "loss": 0.0, + "num_input_tokens_seen": 71347720, + "step": 122970 + }, + { + "epoch": 18.316204944891272, + "grad_norm": 0.0001305467012571171, + "learning_rate": 1.071992161163654e-06, + "loss": 0.0, + "num_input_tokens_seen": 71350664, + "step": 122975 + }, + { + "epoch": 18.316949657432232, + "grad_norm": 1.672196958679706e-05, + "learning_rate": 1.0710510360892072e-06, + "loss": 0.0, + "num_input_tokens_seen": 71353320, + "step": 122980 + }, + { + "epoch": 18.31769436997319, + "grad_norm": 3.243087121518329e-05, + "learning_rate": 1.0701103152700343e-06, + "loss": 0.0, + "num_input_tokens_seen": 71356296, + "step": 122985 + }, + { + "epoch": 18.31843908251415, + "grad_norm": 7.638411261723377e-06, + "learning_rate": 1.0691699987220194e-06, + "loss": 0.0, + "num_input_tokens_seen": 71358728, + "step": 122990 + }, + { + "epoch": 18.31918379505511, + "grad_norm": 5.674996828020085e-06, + "learning_rate": 1.068230086461061e-06, + "loss": 0.0, + "num_input_tokens_seen": 71361736, + "step": 122995 + }, + { + "epoch": 18.319928507596067, + "grad_norm": 1.8288307046532282e-06, + "learning_rate": 1.067290578503033e-06, + "loss": 0.0, + "num_input_tokens_seen": 71364584, + "step": 123000 + }, + { + "epoch": 18.320673220137028, + "grad_norm": 4.388171419122955e-06, + "learning_rate": 1.0663514748637998e-06, + "loss": 0.0, + "num_input_tokens_seen": 71367560, + "step": 123005 + }, + { + "epoch": 18.321417932677985, + "grad_norm": 6.959448455745587e-06, + "learning_rate": 1.0654127755592381e-06, + "loss": 0.0, + "num_input_tokens_seen": 71370472, + "step": 123010 + }, + { + "epoch": 18.322162645218945, + "grad_norm": 8.119188350974582e-06, + "learning_rate": 1.0644744806051988e-06, + "loss": 0.0, + "num_input_tokens_seen": 71373576, + "step": 123015 + }, + { + "epoch": 18.322907357759906, + "grad_norm": 9.927203791448846e-06, + "learning_rate": 1.0635365900175414e-06, + "loss": 0.0, + "num_input_tokens_seen": 71376264, + "step": 123020 + }, + { + "epoch": 18.323652070300863, + "grad_norm": 3.942346666008234e-06, + "learning_rate": 1.062599103812098e-06, + "loss": 0.0, + "num_input_tokens_seen": 71379592, + "step": 123025 + }, + { + "epoch": 18.324396782841823, + "grad_norm": 7.885922968853265e-06, + "learning_rate": 1.0616620220047197e-06, + "loss": 0.0, + "num_input_tokens_seen": 71382280, + "step": 123030 + }, + { + "epoch": 18.32514149538278, + "grad_norm": 2.5889599783113226e-05, + "learning_rate": 1.0607253446112324e-06, + "loss": 0.0, + "num_input_tokens_seen": 71385160, + "step": 123035 + }, + { + "epoch": 18.32588620792374, + "grad_norm": 2.907751195380115e-06, + "learning_rate": 1.0597890716474545e-06, + "loss": 0.0, + "num_input_tokens_seen": 71387880, + "step": 123040 + }, + { + "epoch": 18.3266309204647, + "grad_norm": 2.132235385943204e-05, + "learning_rate": 1.058853203129212e-06, + "loss": 0.0, + "num_input_tokens_seen": 71390952, + "step": 123045 + }, + { + "epoch": 18.32737563300566, + "grad_norm": 2.9003535928495694e-06, + "learning_rate": 1.0579177390723116e-06, + "loss": 0.0, + "num_input_tokens_seen": 71393960, + "step": 123050 + }, + { + "epoch": 18.32812034554662, + "grad_norm": 2.826535592248547e-06, + "learning_rate": 1.0569826794925602e-06, + "loss": 0.0, + "num_input_tokens_seen": 71397000, + "step": 123055 + }, + { + "epoch": 18.32886505808758, + "grad_norm": 1.3421318726614118e-05, + "learning_rate": 1.0560480244057452e-06, + "loss": 0.0, + "num_input_tokens_seen": 71399944, + "step": 123060 + }, + { + "epoch": 18.329609770628537, + "grad_norm": 5.851388777955435e-05, + "learning_rate": 1.0551137738276678e-06, + "loss": 0.0, + "num_input_tokens_seen": 71402856, + "step": 123065 + }, + { + "epoch": 18.330354483169497, + "grad_norm": 1.2541107025754172e-05, + "learning_rate": 1.0541799277741071e-06, + "loss": 0.0, + "num_input_tokens_seen": 71405640, + "step": 123070 + }, + { + "epoch": 18.331099195710454, + "grad_norm": 1.2734592928609345e-05, + "learning_rate": 1.0532464862608366e-06, + "loss": 0.0, + "num_input_tokens_seen": 71408424, + "step": 123075 + }, + { + "epoch": 18.331843908251415, + "grad_norm": 0.0003145068185403943, + "learning_rate": 1.0523134493036296e-06, + "loss": 0.0, + "num_input_tokens_seen": 71411048, + "step": 123080 + }, + { + "epoch": 18.332588620792375, + "grad_norm": 8.429044100921601e-06, + "learning_rate": 1.051380816918243e-06, + "loss": 0.0, + "num_input_tokens_seen": 71414280, + "step": 123085 + }, + { + "epoch": 18.333333333333332, + "grad_norm": 0.000163630407769233, + "learning_rate": 1.0504485891204452e-06, + "loss": 0.0, + "num_input_tokens_seen": 71417448, + "step": 123090 + }, + { + "epoch": 18.334078045874293, + "grad_norm": 7.129565346986055e-06, + "learning_rate": 1.0495167659259703e-06, + "loss": 0.0, + "num_input_tokens_seen": 71420488, + "step": 123095 + }, + { + "epoch": 18.334822758415253, + "grad_norm": 1.904700297927775e-06, + "learning_rate": 1.0485853473505724e-06, + "loss": 0.0, + "num_input_tokens_seen": 71423144, + "step": 123100 + }, + { + "epoch": 18.33556747095621, + "grad_norm": 5.204786702961428e-06, + "learning_rate": 1.0476543334099781e-06, + "loss": 0.0, + "num_input_tokens_seen": 71425768, + "step": 123105 + }, + { + "epoch": 18.33631218349717, + "grad_norm": 2.2316762624541298e-06, + "learning_rate": 1.0467237241199218e-06, + "loss": 0.0, + "num_input_tokens_seen": 71428552, + "step": 123110 + }, + { + "epoch": 18.337056896038128, + "grad_norm": 2.7002304705092683e-05, + "learning_rate": 1.0457935194961245e-06, + "loss": 0.0, + "num_input_tokens_seen": 71431208, + "step": 123115 + }, + { + "epoch": 18.33780160857909, + "grad_norm": 5.228978352533886e-06, + "learning_rate": 1.044863719554298e-06, + "loss": 0.0, + "num_input_tokens_seen": 71434184, + "step": 123120 + }, + { + "epoch": 18.33854632112005, + "grad_norm": 1.8279893993167207e-05, + "learning_rate": 1.0439343243101558e-06, + "loss": 0.0, + "num_input_tokens_seen": 71437160, + "step": 123125 + }, + { + "epoch": 18.339291033661006, + "grad_norm": 1.1258222002652474e-05, + "learning_rate": 1.04300533377939e-06, + "loss": 0.0, + "num_input_tokens_seen": 71439976, + "step": 123130 + }, + { + "epoch": 18.340035746201966, + "grad_norm": 2.496145862096455e-05, + "learning_rate": 1.0420767479777022e-06, + "loss": 0.0, + "num_input_tokens_seen": 71443336, + "step": 123135 + }, + { + "epoch": 18.340780458742927, + "grad_norm": 0.00012072810204699636, + "learning_rate": 1.0411485669207772e-06, + "loss": 0.0, + "num_input_tokens_seen": 71446280, + "step": 123140 + }, + { + "epoch": 18.341525171283884, + "grad_norm": 1.343472922599176e-05, + "learning_rate": 1.0402207906242966e-06, + "loss": 0.0, + "num_input_tokens_seen": 71449096, + "step": 123145 + }, + { + "epoch": 18.342269883824844, + "grad_norm": 2.1383202692959458e-06, + "learning_rate": 1.0392934191039372e-06, + "loss": 0.0007, + "num_input_tokens_seen": 71451944, + "step": 123150 + }, + { + "epoch": 18.3430145963658, + "grad_norm": 2.39506380239618e-06, + "learning_rate": 1.0383664523753584e-06, + "loss": 0.0, + "num_input_tokens_seen": 71454888, + "step": 123155 + }, + { + "epoch": 18.343759308906762, + "grad_norm": 0.000107658575871028, + "learning_rate": 1.0374398904542283e-06, + "loss": 0.0, + "num_input_tokens_seen": 71457608, + "step": 123160 + }, + { + "epoch": 18.344504021447722, + "grad_norm": 1.3443216630548704e-05, + "learning_rate": 1.0365137333561925e-06, + "loss": 0.0, + "num_input_tokens_seen": 71460392, + "step": 123165 + }, + { + "epoch": 18.34524873398868, + "grad_norm": 5.594171670963988e-05, + "learning_rate": 1.0355879810969054e-06, + "loss": 0.0, + "num_input_tokens_seen": 71463528, + "step": 123170 + }, + { + "epoch": 18.34599344652964, + "grad_norm": 0.00015461257135029882, + "learning_rate": 1.0346626336920019e-06, + "loss": 0.0, + "num_input_tokens_seen": 71466664, + "step": 123175 + }, + { + "epoch": 18.3467381590706, + "grad_norm": 3.915962679457152e-06, + "learning_rate": 1.0337376911571161e-06, + "loss": 0.0, + "num_input_tokens_seen": 71469704, + "step": 123180 + }, + { + "epoch": 18.347482871611557, + "grad_norm": 1.0141402526642196e-05, + "learning_rate": 1.032813153507875e-06, + "loss": 0.0, + "num_input_tokens_seen": 71472456, + "step": 123185 + }, + { + "epoch": 18.348227584152518, + "grad_norm": 4.968419307260774e-05, + "learning_rate": 1.0318890207598963e-06, + "loss": 0.0, + "num_input_tokens_seen": 71475368, + "step": 123190 + }, + { + "epoch": 18.348972296693475, + "grad_norm": 5.737338597100461e-06, + "learning_rate": 1.0309652929287926e-06, + "loss": 0.0, + "num_input_tokens_seen": 71478184, + "step": 123195 + }, + { + "epoch": 18.349717009234435, + "grad_norm": 0.007450154982507229, + "learning_rate": 1.0300419700301684e-06, + "loss": 0.0, + "num_input_tokens_seen": 71481128, + "step": 123200 + }, + { + "epoch": 18.350461721775396, + "grad_norm": 7.474525773432106e-06, + "learning_rate": 1.0291190520796246e-06, + "loss": 0.0, + "num_input_tokens_seen": 71484360, + "step": 123205 + }, + { + "epoch": 18.351206434316353, + "grad_norm": 4.395460564410314e-06, + "learning_rate": 1.028196539092746e-06, + "loss": 0.0, + "num_input_tokens_seen": 71487112, + "step": 123210 + }, + { + "epoch": 18.351951146857314, + "grad_norm": 4.566844836517703e-06, + "learning_rate": 1.027274431085129e-06, + "loss": 0.0, + "num_input_tokens_seen": 71490184, + "step": 123215 + }, + { + "epoch": 18.35269585939827, + "grad_norm": 1.0322404705220833e-05, + "learning_rate": 1.0263527280723411e-06, + "loss": 0.0, + "num_input_tokens_seen": 71493384, + "step": 123220 + }, + { + "epoch": 18.35344057193923, + "grad_norm": 3.054047192563303e-05, + "learning_rate": 1.025431430069962e-06, + "loss": 0.0, + "num_input_tokens_seen": 71496200, + "step": 123225 + }, + { + "epoch": 18.35418528448019, + "grad_norm": 1.7022586689563468e-05, + "learning_rate": 1.0245105370935536e-06, + "loss": 0.0, + "num_input_tokens_seen": 71499144, + "step": 123230 + }, + { + "epoch": 18.35492999702115, + "grad_norm": 2.5037804789462825e-06, + "learning_rate": 1.0235900491586652e-06, + "loss": 0.0, + "num_input_tokens_seen": 71501960, + "step": 123235 + }, + { + "epoch": 18.35567470956211, + "grad_norm": 3.5476377888699062e-06, + "learning_rate": 1.022669966280862e-06, + "loss": 0.0, + "num_input_tokens_seen": 71504680, + "step": 123240 + }, + { + "epoch": 18.35641942210307, + "grad_norm": 2.5564754650986288e-06, + "learning_rate": 1.021750288475673e-06, + "loss": 0.0, + "num_input_tokens_seen": 71507752, + "step": 123245 + }, + { + "epoch": 18.357164134644027, + "grad_norm": 4.950810762238689e-06, + "learning_rate": 1.0208310157586497e-06, + "loss": 0.0, + "num_input_tokens_seen": 71510568, + "step": 123250 + }, + { + "epoch": 18.357908847184987, + "grad_norm": 1.0387440852355212e-05, + "learning_rate": 1.0199121481453106e-06, + "loss": 0.0, + "num_input_tokens_seen": 71513736, + "step": 123255 + }, + { + "epoch": 18.358653559725944, + "grad_norm": 3.6339653888717294e-05, + "learning_rate": 1.0189936856511873e-06, + "loss": 0.0, + "num_input_tokens_seen": 71516744, + "step": 123260 + }, + { + "epoch": 18.359398272266905, + "grad_norm": 2.7369997042114846e-06, + "learning_rate": 1.018075628291787e-06, + "loss": 0.0, + "num_input_tokens_seen": 71519464, + "step": 123265 + }, + { + "epoch": 18.360142984807865, + "grad_norm": 2.0158282495685853e-05, + "learning_rate": 1.0171579760826279e-06, + "loss": 0.0, + "num_input_tokens_seen": 71522504, + "step": 123270 + }, + { + "epoch": 18.360887697348822, + "grad_norm": 1.1173832717759069e-05, + "learning_rate": 1.0162407290392112e-06, + "loss": 0.0, + "num_input_tokens_seen": 71525064, + "step": 123275 + }, + { + "epoch": 18.361632409889783, + "grad_norm": 3.2832263059390243e-06, + "learning_rate": 1.0153238871770277e-06, + "loss": 0.0, + "num_input_tokens_seen": 71527656, + "step": 123280 + }, + { + "epoch": 18.362377122430743, + "grad_norm": 9.203625268128235e-06, + "learning_rate": 1.014407450511573e-06, + "loss": 0.0, + "num_input_tokens_seen": 71530664, + "step": 123285 + }, + { + "epoch": 18.3631218349717, + "grad_norm": 2.40999688685406e-05, + "learning_rate": 1.013491419058324e-06, + "loss": 0.0, + "num_input_tokens_seen": 71533800, + "step": 123290 + }, + { + "epoch": 18.36386654751266, + "grad_norm": 9.3012295110384e-06, + "learning_rate": 1.0125757928327623e-06, + "loss": 0.0, + "num_input_tokens_seen": 71537160, + "step": 123295 + }, + { + "epoch": 18.364611260053618, + "grad_norm": 6.813123491156148e-06, + "learning_rate": 1.011660571850348e-06, + "loss": 0.0, + "num_input_tokens_seen": 71540072, + "step": 123300 + }, + { + "epoch": 18.36535597259458, + "grad_norm": 2.3832068109186366e-05, + "learning_rate": 1.010745756126552e-06, + "loss": 0.0, + "num_input_tokens_seen": 71542728, + "step": 123305 + }, + { + "epoch": 18.36610068513554, + "grad_norm": 15.939346313476562, + "learning_rate": 1.0098313456768233e-06, + "loss": 0.0645, + "num_input_tokens_seen": 71545576, + "step": 123310 + }, + { + "epoch": 18.366845397676496, + "grad_norm": 0.00857155304402113, + "learning_rate": 1.008917340516613e-06, + "loss": 0.0, + "num_input_tokens_seen": 71548872, + "step": 123315 + }, + { + "epoch": 18.367590110217456, + "grad_norm": 6.316686631180346e-05, + "learning_rate": 1.008003740661359e-06, + "loss": 0.0, + "num_input_tokens_seen": 71551848, + "step": 123320 + }, + { + "epoch": 18.368334822758417, + "grad_norm": 2.184225877499557e-06, + "learning_rate": 1.007090546126499e-06, + "loss": 0.0045, + "num_input_tokens_seen": 71554824, + "step": 123325 + }, + { + "epoch": 18.369079535299374, + "grad_norm": 3.5220755307818763e-06, + "learning_rate": 1.0061777569274593e-06, + "loss": 0.0, + "num_input_tokens_seen": 71557544, + "step": 123330 + }, + { + "epoch": 18.369824247840334, + "grad_norm": 3.995529368694406e-06, + "learning_rate": 1.0052653730796558e-06, + "loss": 0.0, + "num_input_tokens_seen": 71560296, + "step": 123335 + }, + { + "epoch": 18.37056896038129, + "grad_norm": 7.569386070827022e-06, + "learning_rate": 1.004353394598509e-06, + "loss": 0.0, + "num_input_tokens_seen": 71563176, + "step": 123340 + }, + { + "epoch": 18.371313672922252, + "grad_norm": 5.375753062253352e-06, + "learning_rate": 1.0034418214994235e-06, + "loss": 0.0, + "num_input_tokens_seen": 71566088, + "step": 123345 + }, + { + "epoch": 18.372058385463212, + "grad_norm": 1.182949654321419e-05, + "learning_rate": 1.0025306537978007e-06, + "loss": 0.0, + "num_input_tokens_seen": 71569128, + "step": 123350 + }, + { + "epoch": 18.37280309800417, + "grad_norm": 2.9101716791046783e-05, + "learning_rate": 1.001619891509034e-06, + "loss": 0.0, + "num_input_tokens_seen": 71571976, + "step": 123355 + }, + { + "epoch": 18.37354781054513, + "grad_norm": 6.6995216911891475e-06, + "learning_rate": 1.000709534648503e-06, + "loss": 0.0, + "num_input_tokens_seen": 71575176, + "step": 123360 + }, + { + "epoch": 18.37429252308609, + "grad_norm": 3.0273254196799826e-06, + "learning_rate": 9.997995832315977e-07, + "loss": 0.0, + "num_input_tokens_seen": 71577832, + "step": 123365 + }, + { + "epoch": 18.375037235627047, + "grad_norm": 1.4023818948771805e-05, + "learning_rate": 9.988900372736808e-07, + "loss": 0.0, + "num_input_tokens_seen": 71580488, + "step": 123370 + }, + { + "epoch": 18.375781948168008, + "grad_norm": 0.0006951075629331172, + "learning_rate": 9.979808967901267e-07, + "loss": 0.0, + "num_input_tokens_seen": 71583144, + "step": 123375 + }, + { + "epoch": 18.376526660708965, + "grad_norm": 5.3863500397710595e-06, + "learning_rate": 9.97072161796292e-07, + "loss": 0.0, + "num_input_tokens_seen": 71586376, + "step": 123380 + }, + { + "epoch": 18.377271373249926, + "grad_norm": 4.505803190113511e-06, + "learning_rate": 9.961638323075284e-07, + "loss": 0.0, + "num_input_tokens_seen": 71589224, + "step": 123385 + }, + { + "epoch": 18.378016085790886, + "grad_norm": 3.6128292322246125e-06, + "learning_rate": 9.952559083391765e-07, + "loss": 0.0, + "num_input_tokens_seen": 71592168, + "step": 123390 + }, + { + "epoch": 18.378760798331843, + "grad_norm": 9.500945452600718e-05, + "learning_rate": 9.943483899065798e-07, + "loss": 0.0, + "num_input_tokens_seen": 71595240, + "step": 123395 + }, + { + "epoch": 18.379505510872804, + "grad_norm": 2.0195725483063143e-06, + "learning_rate": 9.9344127702507e-07, + "loss": 0.0, + "num_input_tokens_seen": 71598088, + "step": 123400 + }, + { + "epoch": 18.38025022341376, + "grad_norm": 5.003883416065946e-06, + "learning_rate": 9.925345697099686e-07, + "loss": 0.0, + "num_input_tokens_seen": 71600808, + "step": 123405 + }, + { + "epoch": 18.38099493595472, + "grad_norm": 6.713210041198181e-06, + "learning_rate": 9.916282679765965e-07, + "loss": 0.0, + "num_input_tokens_seen": 71603816, + "step": 123410 + }, + { + "epoch": 18.38173964849568, + "grad_norm": 6.241717528610025e-06, + "learning_rate": 9.907223718402608e-07, + "loss": 0.0, + "num_input_tokens_seen": 71606696, + "step": 123415 + }, + { + "epoch": 18.38248436103664, + "grad_norm": 2.251761543448083e-06, + "learning_rate": 9.898168813162744e-07, + "loss": 0.0, + "num_input_tokens_seen": 71609416, + "step": 123420 + }, + { + "epoch": 18.3832290735776, + "grad_norm": 2.982087062264327e-05, + "learning_rate": 9.889117964199252e-07, + "loss": 0.0, + "num_input_tokens_seen": 71612136, + "step": 123425 + }, + { + "epoch": 18.38397378611856, + "grad_norm": 2.8188558189867763e-06, + "learning_rate": 9.880071171665089e-07, + "loss": 0.0, + "num_input_tokens_seen": 71614888, + "step": 123430 + }, + { + "epoch": 18.384718498659517, + "grad_norm": 8.990510650619399e-06, + "learning_rate": 9.871028435713081e-07, + "loss": 0.0, + "num_input_tokens_seen": 71617736, + "step": 123435 + }, + { + "epoch": 18.385463211200477, + "grad_norm": 1.7767423514669645e-06, + "learning_rate": 9.861989756495965e-07, + "loss": 0.0, + "num_input_tokens_seen": 71620776, + "step": 123440 + }, + { + "epoch": 18.386207923741434, + "grad_norm": 0.00023312406847253442, + "learning_rate": 9.852955134166481e-07, + "loss": 0.0, + "num_input_tokens_seen": 71623560, + "step": 123445 + }, + { + "epoch": 18.386952636282395, + "grad_norm": 2.8596192350960337e-05, + "learning_rate": 9.843924568877282e-07, + "loss": 0.0, + "num_input_tokens_seen": 71626376, + "step": 123450 + }, + { + "epoch": 18.387697348823355, + "grad_norm": 1.4935060789866839e-05, + "learning_rate": 9.834898060780861e-07, + "loss": 0.0, + "num_input_tokens_seen": 71629128, + "step": 123455 + }, + { + "epoch": 18.388442061364312, + "grad_norm": 4.109990186407231e-06, + "learning_rate": 9.825875610029733e-07, + "loss": 0.0, + "num_input_tokens_seen": 71631816, + "step": 123460 + }, + { + "epoch": 18.389186773905273, + "grad_norm": 3.949712208850542e-06, + "learning_rate": 9.81685721677636e-07, + "loss": 0.0, + "num_input_tokens_seen": 71635240, + "step": 123465 + }, + { + "epoch": 18.389931486446233, + "grad_norm": 5.699153007299174e-06, + "learning_rate": 9.807842881173034e-07, + "loss": 0.0, + "num_input_tokens_seen": 71637992, + "step": 123470 + }, + { + "epoch": 18.39067619898719, + "grad_norm": 3.2357131658500293e-06, + "learning_rate": 9.79883260337211e-07, + "loss": 0.0, + "num_input_tokens_seen": 71640808, + "step": 123475 + }, + { + "epoch": 18.39142091152815, + "grad_norm": 3.6287453895056387e-06, + "learning_rate": 9.789826383525796e-07, + "loss": 0.0, + "num_input_tokens_seen": 71644104, + "step": 123480 + }, + { + "epoch": 18.392165624069108, + "grad_norm": 2.8269228096178267e-06, + "learning_rate": 9.780824221786195e-07, + "loss": 0.0, + "num_input_tokens_seen": 71647336, + "step": 123485 + }, + { + "epoch": 18.39291033661007, + "grad_norm": 4.2761316763062496e-06, + "learning_rate": 9.771826118305432e-07, + "loss": 0.0, + "num_input_tokens_seen": 71650120, + "step": 123490 + }, + { + "epoch": 18.39365504915103, + "grad_norm": 1.1961512427660637e-05, + "learning_rate": 9.762832073235501e-07, + "loss": 0.0, + "num_input_tokens_seen": 71653000, + "step": 123495 + }, + { + "epoch": 18.394399761691986, + "grad_norm": 3.728937736013904e-05, + "learning_rate": 9.75384208672836e-07, + "loss": 0.0, + "num_input_tokens_seen": 71656360, + "step": 123500 + }, + { + "epoch": 18.395144474232946, + "grad_norm": 3.617974243752542e-06, + "learning_rate": 9.744856158935888e-07, + "loss": 0.0, + "num_input_tokens_seen": 71659208, + "step": 123505 + }, + { + "epoch": 18.395889186773907, + "grad_norm": 7.0695568865630776e-06, + "learning_rate": 9.735874290009884e-07, + "loss": 0.0, + "num_input_tokens_seen": 71661960, + "step": 123510 + }, + { + "epoch": 18.396633899314864, + "grad_norm": 1.570678796269931e-05, + "learning_rate": 9.72689648010211e-07, + "loss": 0.0, + "num_input_tokens_seen": 71664840, + "step": 123515 + }, + { + "epoch": 18.397378611855824, + "grad_norm": 9.522031177766621e-06, + "learning_rate": 9.717922729364198e-07, + "loss": 0.0, + "num_input_tokens_seen": 71667848, + "step": 123520 + }, + { + "epoch": 18.39812332439678, + "grad_norm": 2.4146916075551417e-06, + "learning_rate": 9.708953037947804e-07, + "loss": 0.0, + "num_input_tokens_seen": 71670568, + "step": 123525 + }, + { + "epoch": 18.398868036937742, + "grad_norm": 11.166324615478516, + "learning_rate": 9.699987406004364e-07, + "loss": 0.0178, + "num_input_tokens_seen": 71673384, + "step": 123530 + }, + { + "epoch": 18.399612749478703, + "grad_norm": 1.8127039993487415e-06, + "learning_rate": 9.691025833685446e-07, + "loss": 0.0, + "num_input_tokens_seen": 71676328, + "step": 123535 + }, + { + "epoch": 18.40035746201966, + "grad_norm": 2.4170903998310678e-06, + "learning_rate": 9.68206832114238e-07, + "loss": 0.0, + "num_input_tokens_seen": 71679144, + "step": 123540 + }, + { + "epoch": 18.40110217456062, + "grad_norm": 4.832860668102512e-06, + "learning_rate": 9.673114868526568e-07, + "loss": 0.0, + "num_input_tokens_seen": 71681960, + "step": 123545 + }, + { + "epoch": 18.401846887101577, + "grad_norm": 4.814061685465276e-05, + "learning_rate": 9.664165475989168e-07, + "loss": 0.0, + "num_input_tokens_seen": 71684552, + "step": 123550 + }, + { + "epoch": 18.402591599642538, + "grad_norm": 5.144361239217687e-06, + "learning_rate": 9.655220143681476e-07, + "loss": 0.0, + "num_input_tokens_seen": 71687464, + "step": 123555 + }, + { + "epoch": 18.403336312183498, + "grad_norm": 7.527893467340618e-05, + "learning_rate": 9.646278871754539e-07, + "loss": 0.0, + "num_input_tokens_seen": 71690248, + "step": 123560 + }, + { + "epoch": 18.404081024724455, + "grad_norm": 1.2751574104186147e-05, + "learning_rate": 9.637341660359428e-07, + "loss": 0.0, + "num_input_tokens_seen": 71693288, + "step": 123565 + }, + { + "epoch": 18.404825737265416, + "grad_norm": 4.686226020567119e-06, + "learning_rate": 9.628408509647164e-07, + "loss": 0.0, + "num_input_tokens_seen": 71696040, + "step": 123570 + }, + { + "epoch": 18.405570449806376, + "grad_norm": 3.231347000109963e-05, + "learning_rate": 9.619479419768596e-07, + "loss": 0.0, + "num_input_tokens_seen": 71699112, + "step": 123575 + }, + { + "epoch": 18.406315162347333, + "grad_norm": 5.723541107727215e-06, + "learning_rate": 9.610554390874632e-07, + "loss": 0.0, + "num_input_tokens_seen": 71701992, + "step": 123580 + }, + { + "epoch": 18.407059874888294, + "grad_norm": 0.00017875450430437922, + "learning_rate": 9.60163342311604e-07, + "loss": 0.0, + "num_input_tokens_seen": 71705448, + "step": 123585 + }, + { + "epoch": 18.40780458742925, + "grad_norm": 0.0038844456430524588, + "learning_rate": 9.592716516643536e-07, + "loss": 0.0, + "num_input_tokens_seen": 71708968, + "step": 123590 + }, + { + "epoch": 18.40854929997021, + "grad_norm": 8.836620509100612e-06, + "learning_rate": 9.583803671607743e-07, + "loss": 0.0, + "num_input_tokens_seen": 71712168, + "step": 123595 + }, + { + "epoch": 18.40929401251117, + "grad_norm": 5.622073331323918e-06, + "learning_rate": 9.574894888159186e-07, + "loss": 0.0, + "num_input_tokens_seen": 71715048, + "step": 123600 + }, + { + "epoch": 18.41003872505213, + "grad_norm": 1.401303325110348e-05, + "learning_rate": 9.565990166448463e-07, + "loss": 0.0, + "num_input_tokens_seen": 71718088, + "step": 123605 + }, + { + "epoch": 18.41078343759309, + "grad_norm": 5.179874278837815e-06, + "learning_rate": 9.557089506625954e-07, + "loss": 0.0, + "num_input_tokens_seen": 71720904, + "step": 123610 + }, + { + "epoch": 18.41152815013405, + "grad_norm": 4.7807385271880776e-05, + "learning_rate": 9.54819290884207e-07, + "loss": 0.0, + "num_input_tokens_seen": 71723592, + "step": 123615 + }, + { + "epoch": 18.412272862675007, + "grad_norm": 3.7130832879483933e-06, + "learning_rate": 9.539300373247045e-07, + "loss": 0.0, + "num_input_tokens_seen": 71726856, + "step": 123620 + }, + { + "epoch": 18.413017575215967, + "grad_norm": 3.170222043991089e-06, + "learning_rate": 9.530411899991182e-07, + "loss": 0.0005, + "num_input_tokens_seen": 71729672, + "step": 123625 + }, + { + "epoch": 18.413762287756924, + "grad_norm": 6.253332230699016e-06, + "learning_rate": 9.521527489224552e-07, + "loss": 0.0, + "num_input_tokens_seen": 71732712, + "step": 123630 + }, + { + "epoch": 18.414507000297885, + "grad_norm": 3.6938765788363526e-06, + "learning_rate": 9.512647141097369e-07, + "loss": 0.0, + "num_input_tokens_seen": 71735688, + "step": 123635 + }, + { + "epoch": 18.415251712838845, + "grad_norm": 2.741602884270833e-06, + "learning_rate": 9.503770855759569e-07, + "loss": 0.0119, + "num_input_tokens_seen": 71738504, + "step": 123640 + }, + { + "epoch": 18.415996425379802, + "grad_norm": 2.197638877987629e-06, + "learning_rate": 9.494898633361144e-07, + "loss": 0.0, + "num_input_tokens_seen": 71741320, + "step": 123645 + }, + { + "epoch": 18.416741137920763, + "grad_norm": 2.610508818179369e-05, + "learning_rate": 9.486030474051944e-07, + "loss": 0.0, + "num_input_tokens_seen": 71744104, + "step": 123650 + }, + { + "epoch": 18.417485850461723, + "grad_norm": 6.307992589427158e-05, + "learning_rate": 9.477166377981822e-07, + "loss": 0.0, + "num_input_tokens_seen": 71746984, + "step": 123655 + }, + { + "epoch": 18.41823056300268, + "grad_norm": 7.205640940810554e-06, + "learning_rate": 9.468306345300548e-07, + "loss": 0.0, + "num_input_tokens_seen": 71749832, + "step": 123660 + }, + { + "epoch": 18.41897527554364, + "grad_norm": 1.1577769328141585e-05, + "learning_rate": 9.459450376157697e-07, + "loss": 0.0, + "num_input_tokens_seen": 71753032, + "step": 123665 + }, + { + "epoch": 18.419719988084598, + "grad_norm": 2.69636348093627e-06, + "learning_rate": 9.450598470703037e-07, + "loss": 0.0, + "num_input_tokens_seen": 71756456, + "step": 123670 + }, + { + "epoch": 18.42046470062556, + "grad_norm": 3.652040504675824e-06, + "learning_rate": 9.441750629086004e-07, + "loss": 0.0, + "num_input_tokens_seen": 71759400, + "step": 123675 + }, + { + "epoch": 18.42120941316652, + "grad_norm": 1.8759268641588278e-05, + "learning_rate": 9.432906851456064e-07, + "loss": 0.0, + "num_input_tokens_seen": 71762536, + "step": 123680 + }, + { + "epoch": 18.421954125707476, + "grad_norm": 7.698755325691309e-06, + "learning_rate": 9.424067137962705e-07, + "loss": 0.0, + "num_input_tokens_seen": 71765288, + "step": 123685 + }, + { + "epoch": 18.422698838248436, + "grad_norm": 0.00012194061855552718, + "learning_rate": 9.4152314887552e-07, + "loss": 0.0, + "num_input_tokens_seen": 71767848, + "step": 123690 + }, + { + "epoch": 18.423443550789397, + "grad_norm": 7.881653800723143e-06, + "learning_rate": 9.406399903982844e-07, + "loss": 0.0, + "num_input_tokens_seen": 71770952, + "step": 123695 + }, + { + "epoch": 18.424188263330354, + "grad_norm": 5.7768729675444774e-06, + "learning_rate": 9.397572383794823e-07, + "loss": 0.0, + "num_input_tokens_seen": 71773768, + "step": 123700 + }, + { + "epoch": 18.424932975871315, + "grad_norm": 2.4201790438382886e-06, + "learning_rate": 9.388748928340296e-07, + "loss": 0.0, + "num_input_tokens_seen": 71776616, + "step": 123705 + }, + { + "epoch": 18.42567768841227, + "grad_norm": 8.380448889511172e-06, + "learning_rate": 9.379929537768339e-07, + "loss": 0.0, + "num_input_tokens_seen": 71779560, + "step": 123710 + }, + { + "epoch": 18.426422400953232, + "grad_norm": 7.658766662643757e-06, + "learning_rate": 9.371114212227889e-07, + "loss": 0.0, + "num_input_tokens_seen": 71782344, + "step": 123715 + }, + { + "epoch": 18.427167113494193, + "grad_norm": 3.4794400107784895e-06, + "learning_rate": 9.362302951867907e-07, + "loss": 0.0, + "num_input_tokens_seen": 71784968, + "step": 123720 + }, + { + "epoch": 18.42791182603515, + "grad_norm": 1.5500805602641776e-05, + "learning_rate": 9.353495756837222e-07, + "loss": 0.0, + "num_input_tokens_seen": 71788008, + "step": 123725 + }, + { + "epoch": 18.42865653857611, + "grad_norm": 3.644419393822318e-06, + "learning_rate": 9.344692627284657e-07, + "loss": 0.0, + "num_input_tokens_seen": 71790728, + "step": 123730 + }, + { + "epoch": 18.42940125111707, + "grad_norm": 2.3415628675138578e-05, + "learning_rate": 9.335893563358899e-07, + "loss": 0.0, + "num_input_tokens_seen": 71793800, + "step": 123735 + }, + { + "epoch": 18.430145963658028, + "grad_norm": 0.0016035176813602448, + "learning_rate": 9.327098565208636e-07, + "loss": 0.0, + "num_input_tokens_seen": 71796808, + "step": 123740 + }, + { + "epoch": 18.430890676198988, + "grad_norm": 4.552914106170647e-06, + "learning_rate": 9.318307632982415e-07, + "loss": 0.0, + "num_input_tokens_seen": 71800008, + "step": 123745 + }, + { + "epoch": 18.431635388739945, + "grad_norm": 4.667390840040753e-06, + "learning_rate": 9.309520766828811e-07, + "loss": 0.0, + "num_input_tokens_seen": 71802792, + "step": 123750 + }, + { + "epoch": 18.432380101280906, + "grad_norm": 2.836279463735991e-06, + "learning_rate": 9.300737966896206e-07, + "loss": 0.0, + "num_input_tokens_seen": 71805576, + "step": 123755 + }, + { + "epoch": 18.433124813821866, + "grad_norm": 3.619939434429398e-06, + "learning_rate": 9.291959233332981e-07, + "loss": 0.0, + "num_input_tokens_seen": 71808488, + "step": 123760 + }, + { + "epoch": 18.433869526362823, + "grad_norm": 3.305430936961784e-06, + "learning_rate": 9.28318456628749e-07, + "loss": 0.0, + "num_input_tokens_seen": 71811208, + "step": 123765 + }, + { + "epoch": 18.434614238903784, + "grad_norm": 3.4993865938304225e-06, + "learning_rate": 9.274413965907919e-07, + "loss": 0.0, + "num_input_tokens_seen": 71814152, + "step": 123770 + }, + { + "epoch": 18.43535895144474, + "grad_norm": 3.4841862088796915e-06, + "learning_rate": 9.265647432342455e-07, + "loss": 0.0, + "num_input_tokens_seen": 71817032, + "step": 123775 + }, + { + "epoch": 18.4361036639857, + "grad_norm": 1.673793690315506e-06, + "learning_rate": 9.256884965739232e-07, + "loss": 0.0, + "num_input_tokens_seen": 71819816, + "step": 123780 + }, + { + "epoch": 18.43684837652666, + "grad_norm": 7.81885682954453e-06, + "learning_rate": 9.248126566246267e-07, + "loss": 0.0, + "num_input_tokens_seen": 71822952, + "step": 123785 + }, + { + "epoch": 18.43759308906762, + "grad_norm": 0.003916730638593435, + "learning_rate": 9.239372234011473e-07, + "loss": 0.0, + "num_input_tokens_seen": 71825704, + "step": 123790 + }, + { + "epoch": 18.43833780160858, + "grad_norm": 9.705803677206859e-06, + "learning_rate": 9.230621969182812e-07, + "loss": 0.0, + "num_input_tokens_seen": 71828520, + "step": 123795 + }, + { + "epoch": 18.43908251414954, + "grad_norm": 1.835457169363508e-06, + "learning_rate": 9.221875771908084e-07, + "loss": 0.0, + "num_input_tokens_seen": 71831208, + "step": 123800 + }, + { + "epoch": 18.439827226690497, + "grad_norm": 0.007792408112436533, + "learning_rate": 9.213133642335031e-07, + "loss": 0.0, + "num_input_tokens_seen": 71833960, + "step": 123805 + }, + { + "epoch": 18.440571939231457, + "grad_norm": 4.514092324825469e-06, + "learning_rate": 9.204395580611397e-07, + "loss": 0.0, + "num_input_tokens_seen": 71836936, + "step": 123810 + }, + { + "epoch": 18.441316651772414, + "grad_norm": 2.1814414594700793e-06, + "learning_rate": 9.195661586884729e-07, + "loss": 0.0, + "num_input_tokens_seen": 71840072, + "step": 123815 + }, + { + "epoch": 18.442061364313375, + "grad_norm": 2.814645995385945e-06, + "learning_rate": 9.186931661302634e-07, + "loss": 0.0, + "num_input_tokens_seen": 71843112, + "step": 123820 + }, + { + "epoch": 18.442806076854335, + "grad_norm": 4.010461907455465e-06, + "learning_rate": 9.178205804012546e-07, + "loss": 0.0, + "num_input_tokens_seen": 71845896, + "step": 123825 + }, + { + "epoch": 18.443550789395292, + "grad_norm": 7.030161214061081e-05, + "learning_rate": 9.16948401516196e-07, + "loss": 0.0, + "num_input_tokens_seen": 71848648, + "step": 123830 + }, + { + "epoch": 18.444295501936253, + "grad_norm": 6.8493845901684836e-06, + "learning_rate": 9.160766294898148e-07, + "loss": 0.0, + "num_input_tokens_seen": 71851560, + "step": 123835 + }, + { + "epoch": 18.445040214477213, + "grad_norm": 7.037887826299993e-06, + "learning_rate": 9.152052643368408e-07, + "loss": 0.0, + "num_input_tokens_seen": 71854280, + "step": 123840 + }, + { + "epoch": 18.44578492701817, + "grad_norm": 3.1053671136760386e-06, + "learning_rate": 9.143343060719956e-07, + "loss": 0.0, + "num_input_tokens_seen": 71857288, + "step": 123845 + }, + { + "epoch": 18.44652963955913, + "grad_norm": 4.078746769664576e-06, + "learning_rate": 9.13463754709995e-07, + "loss": 0.0, + "num_input_tokens_seen": 71859976, + "step": 123850 + }, + { + "epoch": 18.447274352100088, + "grad_norm": 7.823275154805742e-06, + "learning_rate": 9.125936102655414e-07, + "loss": 0.0, + "num_input_tokens_seen": 71862920, + "step": 123855 + }, + { + "epoch": 18.44801906464105, + "grad_norm": 3.957225999329239e-06, + "learning_rate": 9.117238727533367e-07, + "loss": 0.0, + "num_input_tokens_seen": 71865608, + "step": 123860 + }, + { + "epoch": 18.44876377718201, + "grad_norm": 1.8104972696164623e-05, + "learning_rate": 9.108545421880776e-07, + "loss": 0.0, + "num_input_tokens_seen": 71868616, + "step": 123865 + }, + { + "epoch": 18.449508489722966, + "grad_norm": 5.76490128878504e-06, + "learning_rate": 9.09985618584444e-07, + "loss": 0.0, + "num_input_tokens_seen": 71871720, + "step": 123870 + }, + { + "epoch": 18.450253202263927, + "grad_norm": 3.5414514059084468e-06, + "learning_rate": 9.091171019571215e-07, + "loss": 0.0, + "num_input_tokens_seen": 71874504, + "step": 123875 + }, + { + "epoch": 18.450997914804887, + "grad_norm": 2.2678430468658917e-05, + "learning_rate": 9.082489923207815e-07, + "loss": 0.0, + "num_input_tokens_seen": 71877288, + "step": 123880 + }, + { + "epoch": 18.451742627345844, + "grad_norm": 2.79691448668018e-06, + "learning_rate": 9.073812896900874e-07, + "loss": 0.0, + "num_input_tokens_seen": 71880040, + "step": 123885 + }, + { + "epoch": 18.452487339886805, + "grad_norm": 2.2375422759068897e-06, + "learning_rate": 9.065139940797024e-07, + "loss": 0.0, + "num_input_tokens_seen": 71883016, + "step": 123890 + }, + { + "epoch": 18.45323205242776, + "grad_norm": 1.7703447383610182e-06, + "learning_rate": 9.056471055042732e-07, + "loss": 0.0, + "num_input_tokens_seen": 71885928, + "step": 123895 + }, + { + "epoch": 18.453976764968722, + "grad_norm": 0.0002460526884533465, + "learning_rate": 9.04780623978449e-07, + "loss": 0.0, + "num_input_tokens_seen": 71888680, + "step": 123900 + }, + { + "epoch": 18.454721477509683, + "grad_norm": 2.111089770551189e-06, + "learning_rate": 9.039145495168655e-07, + "loss": 0.0, + "num_input_tokens_seen": 71891656, + "step": 123905 + }, + { + "epoch": 18.45546619005064, + "grad_norm": 3.5837015275319573e-06, + "learning_rate": 9.030488821341554e-07, + "loss": 0.0, + "num_input_tokens_seen": 71894632, + "step": 123910 + }, + { + "epoch": 18.4562109025916, + "grad_norm": 9.452213998883963e-05, + "learning_rate": 9.021836218449459e-07, + "loss": 0.0, + "num_input_tokens_seen": 71897800, + "step": 123915 + }, + { + "epoch": 18.456955615132557, + "grad_norm": 3.380524867679924e-05, + "learning_rate": 9.01318768663853e-07, + "loss": 0.0, + "num_input_tokens_seen": 71900808, + "step": 123920 + }, + { + "epoch": 18.457700327673518, + "grad_norm": 7.94001516624121e-06, + "learning_rate": 9.004543226054846e-07, + "loss": 0.0, + "num_input_tokens_seen": 71903720, + "step": 123925 + }, + { + "epoch": 18.458445040214478, + "grad_norm": 7.829121386748739e-06, + "learning_rate": 8.995902836844455e-07, + "loss": 0.0, + "num_input_tokens_seen": 71906760, + "step": 123930 + }, + { + "epoch": 18.459189752755435, + "grad_norm": 3.283407204435207e-05, + "learning_rate": 8.987266519153353e-07, + "loss": 0.0, + "num_input_tokens_seen": 71909704, + "step": 123935 + }, + { + "epoch": 18.459934465296396, + "grad_norm": 3.7916674045845866e-05, + "learning_rate": 8.978634273127424e-07, + "loss": 0.0, + "num_input_tokens_seen": 71912520, + "step": 123940 + }, + { + "epoch": 18.460679177837356, + "grad_norm": 2.5153369733743602e-06, + "learning_rate": 8.97000609891252e-07, + "loss": 0.0, + "num_input_tokens_seen": 71915368, + "step": 123945 + }, + { + "epoch": 18.461423890378313, + "grad_norm": 0.0001731492084218189, + "learning_rate": 8.961381996654361e-07, + "loss": 0.0, + "num_input_tokens_seen": 71918152, + "step": 123950 + }, + { + "epoch": 18.462168602919274, + "grad_norm": 1.7720511777952197e-06, + "learning_rate": 8.952761966498691e-07, + "loss": 0.0, + "num_input_tokens_seen": 71921192, + "step": 123955 + }, + { + "epoch": 18.46291331546023, + "grad_norm": 1.53927812789334e-05, + "learning_rate": 8.944146008591143e-07, + "loss": 0.0, + "num_input_tokens_seen": 71923816, + "step": 123960 + }, + { + "epoch": 18.46365802800119, + "grad_norm": 3.17602971335873e-05, + "learning_rate": 8.93553412307721e-07, + "loss": 0.0, + "num_input_tokens_seen": 71926984, + "step": 123965 + }, + { + "epoch": 18.464402740542152, + "grad_norm": 2.991253677464556e-05, + "learning_rate": 8.926926310102445e-07, + "loss": 0.0, + "num_input_tokens_seen": 71929800, + "step": 123970 + }, + { + "epoch": 18.46514745308311, + "grad_norm": 4.212070052744821e-06, + "learning_rate": 8.918322569812259e-07, + "loss": 0.0, + "num_input_tokens_seen": 71932616, + "step": 123975 + }, + { + "epoch": 18.46589216562407, + "grad_norm": 2.980628096338478e-06, + "learning_rate": 8.909722902351924e-07, + "loss": 0.0, + "num_input_tokens_seen": 71935624, + "step": 123980 + }, + { + "epoch": 18.46663687816503, + "grad_norm": 2.6303771392122144e-06, + "learning_rate": 8.901127307866852e-07, + "loss": 0.0, + "num_input_tokens_seen": 71938600, + "step": 123985 + }, + { + "epoch": 18.467381590705987, + "grad_norm": 4.9543082241143566e-06, + "learning_rate": 8.892535786502176e-07, + "loss": 0.0, + "num_input_tokens_seen": 71941640, + "step": 123990 + }, + { + "epoch": 18.468126303246947, + "grad_norm": 0.025628453120589256, + "learning_rate": 8.883948338403058e-07, + "loss": 0.0, + "num_input_tokens_seen": 71944488, + "step": 123995 + }, + { + "epoch": 18.468871015787904, + "grad_norm": 4.222022653266322e-06, + "learning_rate": 8.87536496371455e-07, + "loss": 0.0, + "num_input_tokens_seen": 71947592, + "step": 124000 + }, + { + "epoch": 18.469615728328865, + "grad_norm": 6.302339897956699e-05, + "learning_rate": 8.866785662581728e-07, + "loss": 0.0, + "num_input_tokens_seen": 71950472, + "step": 124005 + }, + { + "epoch": 18.470360440869825, + "grad_norm": 5.960539056104608e-06, + "learning_rate": 8.858210435149422e-07, + "loss": 0.0, + "num_input_tokens_seen": 71953704, + "step": 124010 + }, + { + "epoch": 18.471105153410782, + "grad_norm": 9.2044792836532e-05, + "learning_rate": 8.849639281562628e-07, + "loss": 0.0, + "num_input_tokens_seen": 71956712, + "step": 124015 + }, + { + "epoch": 18.471849865951743, + "grad_norm": 2.3793679702066584e-06, + "learning_rate": 8.841072201966033e-07, + "loss": 0.0, + "num_input_tokens_seen": 71959624, + "step": 124020 + }, + { + "epoch": 18.472594578492703, + "grad_norm": 2.6159423214267008e-05, + "learning_rate": 8.83250919650444e-07, + "loss": 0.0, + "num_input_tokens_seen": 71962664, + "step": 124025 + }, + { + "epoch": 18.47333929103366, + "grad_norm": 4.657956196751911e-06, + "learning_rate": 8.823950265322484e-07, + "loss": 0.0, + "num_input_tokens_seen": 71965640, + "step": 124030 + }, + { + "epoch": 18.47408400357462, + "grad_norm": 1.309516119363252e-05, + "learning_rate": 8.815395408564797e-07, + "loss": 0.0, + "num_input_tokens_seen": 71968392, + "step": 124035 + }, + { + "epoch": 18.474828716115578, + "grad_norm": 2.426800847388222e-06, + "learning_rate": 8.806844626375848e-07, + "loss": 0.0, + "num_input_tokens_seen": 71971304, + "step": 124040 + }, + { + "epoch": 18.47557342865654, + "grad_norm": 9.902273632178549e-06, + "learning_rate": 8.798297918900162e-07, + "loss": 0.0, + "num_input_tokens_seen": 71974312, + "step": 124045 + }, + { + "epoch": 18.4763181411975, + "grad_norm": 7.1925405791262165e-06, + "learning_rate": 8.789755286282065e-07, + "loss": 0.0, + "num_input_tokens_seen": 71977480, + "step": 124050 + }, + { + "epoch": 18.477062853738456, + "grad_norm": 6.550708803843008e-06, + "learning_rate": 8.781216728665859e-07, + "loss": 0.0, + "num_input_tokens_seen": 71980136, + "step": 124055 + }, + { + "epoch": 18.477807566279417, + "grad_norm": 5.155246071808506e-06, + "learning_rate": 8.772682246195873e-07, + "loss": 0.0, + "num_input_tokens_seen": 71983048, + "step": 124060 + }, + { + "epoch": 18.478552278820374, + "grad_norm": 2.2566916868527187e-06, + "learning_rate": 8.764151839016216e-07, + "loss": 0.0, + "num_input_tokens_seen": 71985800, + "step": 124065 + }, + { + "epoch": 18.479296991361334, + "grad_norm": 1.0391711839474738e-05, + "learning_rate": 8.755625507271076e-07, + "loss": 0.0, + "num_input_tokens_seen": 71988680, + "step": 124070 + }, + { + "epoch": 18.480041703902295, + "grad_norm": 1.1410531442379579e-05, + "learning_rate": 8.747103251104394e-07, + "loss": 0.0, + "num_input_tokens_seen": 71991464, + "step": 124075 + }, + { + "epoch": 18.48078641644325, + "grad_norm": 6.2914041336625814e-06, + "learning_rate": 8.738585070660249e-07, + "loss": 0.0, + "num_input_tokens_seen": 71994312, + "step": 124080 + }, + { + "epoch": 18.481531128984212, + "grad_norm": 3.428855779930018e-05, + "learning_rate": 8.730070966082499e-07, + "loss": 0.0, + "num_input_tokens_seen": 71997192, + "step": 124085 + }, + { + "epoch": 18.482275841525173, + "grad_norm": 0.019364146515727043, + "learning_rate": 8.721560937514972e-07, + "loss": 0.0, + "num_input_tokens_seen": 71999976, + "step": 124090 + }, + { + "epoch": 18.48302055406613, + "grad_norm": 2.780253680612077e-06, + "learning_rate": 8.71305498510147e-07, + "loss": 0.0, + "num_input_tokens_seen": 72002728, + "step": 124095 + }, + { + "epoch": 18.48376526660709, + "grad_norm": 1.6374877986891079e-06, + "learning_rate": 8.70455310898563e-07, + "loss": 0.0, + "num_input_tokens_seen": 72005672, + "step": 124100 + }, + { + "epoch": 18.484509979148047, + "grad_norm": 4.743767476611538e-06, + "learning_rate": 8.696055309311169e-07, + "loss": 0.0, + "num_input_tokens_seen": 72008744, + "step": 124105 + }, + { + "epoch": 18.485254691689008, + "grad_norm": 3.5147568269167095e-05, + "learning_rate": 8.687561586221582e-07, + "loss": 0.0, + "num_input_tokens_seen": 72011464, + "step": 124110 + }, + { + "epoch": 18.48599940422997, + "grad_norm": 1.0097681297338568e-05, + "learning_rate": 8.679071939860394e-07, + "loss": 0.0, + "num_input_tokens_seen": 72014280, + "step": 124115 + }, + { + "epoch": 18.486744116770925, + "grad_norm": 3.410795443414827e-06, + "learning_rate": 8.67058637037102e-07, + "loss": 0.0, + "num_input_tokens_seen": 72017192, + "step": 124120 + }, + { + "epoch": 18.487488829311886, + "grad_norm": 6.116529402788728e-05, + "learning_rate": 8.662104877896788e-07, + "loss": 0.0, + "num_input_tokens_seen": 72020040, + "step": 124125 + }, + { + "epoch": 18.488233541852846, + "grad_norm": 0.00018068771169055253, + "learning_rate": 8.653627462581027e-07, + "loss": 0.0, + "num_input_tokens_seen": 72022856, + "step": 124130 + }, + { + "epoch": 18.488978254393803, + "grad_norm": 9.800797670322936e-06, + "learning_rate": 8.645154124566929e-07, + "loss": 0.0, + "num_input_tokens_seen": 72025672, + "step": 124135 + }, + { + "epoch": 18.489722966934764, + "grad_norm": 2.240885123683256e-06, + "learning_rate": 8.636684863997657e-07, + "loss": 0.0, + "num_input_tokens_seen": 72028584, + "step": 124140 + }, + { + "epoch": 18.49046767947572, + "grad_norm": 3.4767499528243206e-06, + "learning_rate": 8.628219681016264e-07, + "loss": 0.0, + "num_input_tokens_seen": 72031432, + "step": 124145 + }, + { + "epoch": 18.49121239201668, + "grad_norm": 2.291005694132764e-05, + "learning_rate": 8.619758575765801e-07, + "loss": 0.0, + "num_input_tokens_seen": 72034088, + "step": 124150 + }, + { + "epoch": 18.491957104557642, + "grad_norm": 0.0002665497886482626, + "learning_rate": 8.611301548389155e-07, + "loss": 0.0, + "num_input_tokens_seen": 72037000, + "step": 124155 + }, + { + "epoch": 18.4927018170986, + "grad_norm": 5.482222604769049e-06, + "learning_rate": 8.602848599029267e-07, + "loss": 0.0, + "num_input_tokens_seen": 72040040, + "step": 124160 + }, + { + "epoch": 18.49344652963956, + "grad_norm": 5.112205144541804e-06, + "learning_rate": 8.59439972782894e-07, + "loss": 0.0, + "num_input_tokens_seen": 72042792, + "step": 124165 + }, + { + "epoch": 18.49419124218052, + "grad_norm": 3.2586644920229446e-06, + "learning_rate": 8.585954934930806e-07, + "loss": 0.0, + "num_input_tokens_seen": 72045672, + "step": 124170 + }, + { + "epoch": 18.494935954721477, + "grad_norm": 8.177400559361558e-06, + "learning_rate": 8.577514220477644e-07, + "loss": 0.0, + "num_input_tokens_seen": 72048584, + "step": 124175 + }, + { + "epoch": 18.495680667262437, + "grad_norm": 9.032601155922748e-06, + "learning_rate": 8.569077584612006e-07, + "loss": 0.0, + "num_input_tokens_seen": 72051656, + "step": 124180 + }, + { + "epoch": 18.496425379803394, + "grad_norm": 7.023718353593722e-06, + "learning_rate": 8.560645027476416e-07, + "loss": 0.0, + "num_input_tokens_seen": 72054824, + "step": 124185 + }, + { + "epoch": 18.497170092344355, + "grad_norm": 7.810097486071754e-06, + "learning_rate": 8.552216549213316e-07, + "loss": 0.0, + "num_input_tokens_seen": 72057576, + "step": 124190 + }, + { + "epoch": 18.497914804885315, + "grad_norm": 2.248359805889777e-06, + "learning_rate": 8.543792149965174e-07, + "loss": 0.0, + "num_input_tokens_seen": 72060328, + "step": 124195 + }, + { + "epoch": 18.498659517426272, + "grad_norm": 5.765646619693143e-06, + "learning_rate": 8.535371829874239e-07, + "loss": 0.0, + "num_input_tokens_seen": 72063432, + "step": 124200 + }, + { + "epoch": 18.499404229967233, + "grad_norm": 2.9841183277312666e-05, + "learning_rate": 8.52695558908273e-07, + "loss": 0.0, + "num_input_tokens_seen": 72066440, + "step": 124205 + }, + { + "epoch": 18.500148942508194, + "grad_norm": 9.261345439881552e-06, + "learning_rate": 8.51854342773295e-07, + "loss": 0.0, + "num_input_tokens_seen": 72069480, + "step": 124210 + }, + { + "epoch": 18.50089365504915, + "grad_norm": 2.7461107947601704e-06, + "learning_rate": 8.510135345966897e-07, + "loss": 0.0, + "num_input_tokens_seen": 72072264, + "step": 124215 + }, + { + "epoch": 18.50163836759011, + "grad_norm": 3.481615294731455e-06, + "learning_rate": 8.501731343926706e-07, + "loss": 0.0, + "num_input_tokens_seen": 72075688, + "step": 124220 + }, + { + "epoch": 18.502383080131068, + "grad_norm": 8.896402505342849e-06, + "learning_rate": 8.493331421754291e-07, + "loss": 0.0, + "num_input_tokens_seen": 72079080, + "step": 124225 + }, + { + "epoch": 18.50312779267203, + "grad_norm": 6.514514097943902e-06, + "learning_rate": 8.484935579591596e-07, + "loss": 0.0, + "num_input_tokens_seen": 72082344, + "step": 124230 + }, + { + "epoch": 18.50387250521299, + "grad_norm": 8.237702786573209e-06, + "learning_rate": 8.476543817580451e-07, + "loss": 0.0, + "num_input_tokens_seen": 72085480, + "step": 124235 + }, + { + "epoch": 18.504617217753946, + "grad_norm": 5.913225322728977e-06, + "learning_rate": 8.468156135862631e-07, + "loss": 0.0, + "num_input_tokens_seen": 72088488, + "step": 124240 + }, + { + "epoch": 18.505361930294907, + "grad_norm": 2.3186155885923654e-05, + "learning_rate": 8.45977253457983e-07, + "loss": 0.0, + "num_input_tokens_seen": 72091720, + "step": 124245 + }, + { + "epoch": 18.506106642835867, + "grad_norm": 5.246723594609648e-05, + "learning_rate": 8.451393013873682e-07, + "loss": 0.0, + "num_input_tokens_seen": 72094536, + "step": 124250 + }, + { + "epoch": 18.506851355376824, + "grad_norm": 2.3234401851368602e-06, + "learning_rate": 8.443017573885769e-07, + "loss": 0.0, + "num_input_tokens_seen": 72097384, + "step": 124255 + }, + { + "epoch": 18.507596067917785, + "grad_norm": 1.5701769370934926e-05, + "learning_rate": 8.434646214757536e-07, + "loss": 0.0, + "num_input_tokens_seen": 72100008, + "step": 124260 + }, + { + "epoch": 18.50834078045874, + "grad_norm": 1.3089771528029814e-05, + "learning_rate": 8.42627893663045e-07, + "loss": 0.0, + "num_input_tokens_seen": 72102664, + "step": 124265 + }, + { + "epoch": 18.509085492999702, + "grad_norm": 2.944437710539205e-06, + "learning_rate": 8.417915739645815e-07, + "loss": 0.0, + "num_input_tokens_seen": 72105320, + "step": 124270 + }, + { + "epoch": 18.509830205540663, + "grad_norm": 1.2605010851984844e-05, + "learning_rate": 8.40955662394502e-07, + "loss": 0.0, + "num_input_tokens_seen": 72108168, + "step": 124275 + }, + { + "epoch": 18.51057491808162, + "grad_norm": 0.00014986029418651015, + "learning_rate": 8.401201589669227e-07, + "loss": 0.0, + "num_input_tokens_seen": 72111432, + "step": 124280 + }, + { + "epoch": 18.51131963062258, + "grad_norm": 4.511554834607523e-06, + "learning_rate": 8.392850636959521e-07, + "loss": 0.0, + "num_input_tokens_seen": 72114248, + "step": 124285 + }, + { + "epoch": 18.512064343163537, + "grad_norm": 1.7672118701739237e-05, + "learning_rate": 8.384503765957091e-07, + "loss": 0.0, + "num_input_tokens_seen": 72117256, + "step": 124290 + }, + { + "epoch": 18.512809055704498, + "grad_norm": 3.095822467003018e-05, + "learning_rate": 8.376160976802882e-07, + "loss": 0.0, + "num_input_tokens_seen": 72119944, + "step": 124295 + }, + { + "epoch": 18.51355376824546, + "grad_norm": 2.220243368356023e-05, + "learning_rate": 8.367822269637892e-07, + "loss": 0.0, + "num_input_tokens_seen": 72122888, + "step": 124300 + }, + { + "epoch": 18.514298480786415, + "grad_norm": 7.160043878684519e-06, + "learning_rate": 8.359487644602954e-07, + "loss": 0.0, + "num_input_tokens_seen": 72125704, + "step": 124305 + }, + { + "epoch": 18.515043193327376, + "grad_norm": 3.1890153877611738e-06, + "learning_rate": 8.351157101838842e-07, + "loss": 0.0, + "num_input_tokens_seen": 72128552, + "step": 124310 + }, + { + "epoch": 18.515787905868336, + "grad_norm": 4.171479758952046e-06, + "learning_rate": 8.34283064148636e-07, + "loss": 0.0, + "num_input_tokens_seen": 72131496, + "step": 124315 + }, + { + "epoch": 18.516532618409293, + "grad_norm": 0.00014418848149944097, + "learning_rate": 8.334508263686147e-07, + "loss": 0.0, + "num_input_tokens_seen": 72134440, + "step": 124320 + }, + { + "epoch": 18.517277330950254, + "grad_norm": 8.865524250722956e-06, + "learning_rate": 8.326189968578785e-07, + "loss": 0.0, + "num_input_tokens_seen": 72137320, + "step": 124325 + }, + { + "epoch": 18.51802204349121, + "grad_norm": 4.546930540527683e-06, + "learning_rate": 8.317875756304827e-07, + "loss": 0.0, + "num_input_tokens_seen": 72140136, + "step": 124330 + }, + { + "epoch": 18.51876675603217, + "grad_norm": 4.211072791804327e-06, + "learning_rate": 8.309565627004717e-07, + "loss": 0.0, + "num_input_tokens_seen": 72143048, + "step": 124335 + }, + { + "epoch": 18.519511468573132, + "grad_norm": 1.5166318007686641e-05, + "learning_rate": 8.301259580818843e-07, + "loss": 0.0, + "num_input_tokens_seen": 72145704, + "step": 124340 + }, + { + "epoch": 18.52025618111409, + "grad_norm": 1.0161671525565907e-05, + "learning_rate": 8.292957617887537e-07, + "loss": 0.0, + "num_input_tokens_seen": 72148456, + "step": 124345 + }, + { + "epoch": 18.52100089365505, + "grad_norm": 1.6691756172804162e-05, + "learning_rate": 8.284659738351047e-07, + "loss": 0.0, + "num_input_tokens_seen": 72151272, + "step": 124350 + }, + { + "epoch": 18.52174560619601, + "grad_norm": 3.987043783126865e-06, + "learning_rate": 8.276365942349595e-07, + "loss": 0.0, + "num_input_tokens_seen": 72154056, + "step": 124355 + }, + { + "epoch": 18.522490318736967, + "grad_norm": 7.6172846092958935e-06, + "learning_rate": 8.268076230023264e-07, + "loss": 0.0, + "num_input_tokens_seen": 72156968, + "step": 124360 + }, + { + "epoch": 18.523235031277927, + "grad_norm": 4.367068413557718e-06, + "learning_rate": 8.259790601512052e-07, + "loss": 0.0, + "num_input_tokens_seen": 72159688, + "step": 124365 + }, + { + "epoch": 18.523979743818884, + "grad_norm": 8.547427569283172e-06, + "learning_rate": 8.251509056956042e-07, + "loss": 0.0, + "num_input_tokens_seen": 72162632, + "step": 124370 + }, + { + "epoch": 18.524724456359845, + "grad_norm": 3.809369809459895e-05, + "learning_rate": 8.243231596495066e-07, + "loss": 0.0, + "num_input_tokens_seen": 72165480, + "step": 124375 + }, + { + "epoch": 18.525469168900806, + "grad_norm": 1.154466826847056e-05, + "learning_rate": 8.234958220268985e-07, + "loss": 0.0, + "num_input_tokens_seen": 72168456, + "step": 124380 + }, + { + "epoch": 18.526213881441763, + "grad_norm": 2.2272665773925837e-06, + "learning_rate": 8.22668892841752e-07, + "loss": 0.0, + "num_input_tokens_seen": 72171496, + "step": 124385 + }, + { + "epoch": 18.526958593982723, + "grad_norm": 0.00017630051297601312, + "learning_rate": 8.218423721080476e-07, + "loss": 0.0, + "num_input_tokens_seen": 72174408, + "step": 124390 + }, + { + "epoch": 18.527703306523684, + "grad_norm": 2.579762804089114e-05, + "learning_rate": 8.21016259839738e-07, + "loss": 0.0, + "num_input_tokens_seen": 72177256, + "step": 124395 + }, + { + "epoch": 18.52844801906464, + "grad_norm": 1.6490264897583984e-05, + "learning_rate": 8.201905560507872e-07, + "loss": 0.0, + "num_input_tokens_seen": 72180296, + "step": 124400 + }, + { + "epoch": 18.5291927316056, + "grad_norm": 2.2206088488019304e-06, + "learning_rate": 8.193652607551422e-07, + "loss": 0.0, + "num_input_tokens_seen": 72183144, + "step": 124405 + }, + { + "epoch": 18.529937444146558, + "grad_norm": 1.592835360497702e-05, + "learning_rate": 8.185403739667419e-07, + "loss": 0.0, + "num_input_tokens_seen": 72185864, + "step": 124410 + }, + { + "epoch": 18.53068215668752, + "grad_norm": 6.887379640829749e-06, + "learning_rate": 8.177158956995279e-07, + "loss": 0.0, + "num_input_tokens_seen": 72188616, + "step": 124415 + }, + { + "epoch": 18.53142686922848, + "grad_norm": 2.4019736883929e-06, + "learning_rate": 8.168918259674224e-07, + "loss": 0.0, + "num_input_tokens_seen": 72191304, + "step": 124420 + }, + { + "epoch": 18.532171581769436, + "grad_norm": 6.096995830535889, + "learning_rate": 8.16068164784356e-07, + "loss": 0.0189, + "num_input_tokens_seen": 72194120, + "step": 124425 + }, + { + "epoch": 18.532916294310397, + "grad_norm": 1.6713533113943413e-05, + "learning_rate": 8.152449121642342e-07, + "loss": 0.0, + "num_input_tokens_seen": 72197128, + "step": 124430 + }, + { + "epoch": 18.533661006851354, + "grad_norm": 9.802752174437046e-06, + "learning_rate": 8.144220681209708e-07, + "loss": 0.0, + "num_input_tokens_seen": 72200008, + "step": 124435 + }, + { + "epoch": 18.534405719392314, + "grad_norm": 4.349938535597175e-06, + "learning_rate": 8.135996326684686e-07, + "loss": 0.0, + "num_input_tokens_seen": 72202728, + "step": 124440 + }, + { + "epoch": 18.535150431933275, + "grad_norm": 4.160830485488987e-06, + "learning_rate": 8.127776058206166e-07, + "loss": 0.0, + "num_input_tokens_seen": 72205704, + "step": 124445 + }, + { + "epoch": 18.53589514447423, + "grad_norm": 2.6213315322820563e-06, + "learning_rate": 8.119559875913036e-07, + "loss": 0.0, + "num_input_tokens_seen": 72208520, + "step": 124450 + }, + { + "epoch": 18.536639857015192, + "grad_norm": 4.788822479895316e-06, + "learning_rate": 8.111347779944101e-07, + "loss": 0.0, + "num_input_tokens_seen": 72211208, + "step": 124455 + }, + { + "epoch": 18.537384569556153, + "grad_norm": 7.424686918966472e-05, + "learning_rate": 8.103139770438112e-07, + "loss": 0.0, + "num_input_tokens_seen": 72213928, + "step": 124460 + }, + { + "epoch": 18.53812928209711, + "grad_norm": 2.289847998326877e-06, + "learning_rate": 8.09493584753368e-07, + "loss": 0.0, + "num_input_tokens_seen": 72216648, + "step": 124465 + }, + { + "epoch": 18.53887399463807, + "grad_norm": 4.316023023420712e-06, + "learning_rate": 8.0867360113695e-07, + "loss": 0.0, + "num_input_tokens_seen": 72219240, + "step": 124470 + }, + { + "epoch": 18.539618707179027, + "grad_norm": 0.004166779108345509, + "learning_rate": 8.078540262084017e-07, + "loss": 0.0, + "num_input_tokens_seen": 72222184, + "step": 124475 + }, + { + "epoch": 18.540363419719988, + "grad_norm": 4.061883828399004e-06, + "learning_rate": 8.07034859981573e-07, + "loss": 0.0, + "num_input_tokens_seen": 72225032, + "step": 124480 + }, + { + "epoch": 18.54110813226095, + "grad_norm": 0.34316620230674744, + "learning_rate": 8.062161024703029e-07, + "loss": 0.0, + "num_input_tokens_seen": 72227784, + "step": 124485 + }, + { + "epoch": 18.541852844801905, + "grad_norm": 3.3823251724243164, + "learning_rate": 8.053977536884194e-07, + "loss": 0.0064, + "num_input_tokens_seen": 72230664, + "step": 124490 + }, + { + "epoch": 18.542597557342866, + "grad_norm": 4.05934724767576e-06, + "learning_rate": 8.045798136497529e-07, + "loss": 0.0, + "num_input_tokens_seen": 72233512, + "step": 124495 + }, + { + "epoch": 18.543342269883826, + "grad_norm": 1.0257912435918115e-05, + "learning_rate": 8.037622823681174e-07, + "loss": 0.0, + "num_input_tokens_seen": 72236712, + "step": 124500 + }, + { + "epoch": 18.544086982424783, + "grad_norm": 0.00025217243819497526, + "learning_rate": 8.029451598573267e-07, + "loss": 0.0, + "num_input_tokens_seen": 72239528, + "step": 124505 + }, + { + "epoch": 18.544831694965744, + "grad_norm": 6.119472800492076e-06, + "learning_rate": 8.021284461311867e-07, + "loss": 0.0, + "num_input_tokens_seen": 72242344, + "step": 124510 + }, + { + "epoch": 18.5455764075067, + "grad_norm": 1.1780488421209157e-05, + "learning_rate": 8.013121412034919e-07, + "loss": 0.0, + "num_input_tokens_seen": 72245288, + "step": 124515 + }, + { + "epoch": 18.54632112004766, + "grad_norm": 0.00140807731077075, + "learning_rate": 8.004962450880338e-07, + "loss": 0.0, + "num_input_tokens_seen": 72248360, + "step": 124520 + }, + { + "epoch": 18.547065832588622, + "grad_norm": 4.8748952394817024e-06, + "learning_rate": 7.99680757798596e-07, + "loss": 0.0, + "num_input_tokens_seen": 72251048, + "step": 124525 + }, + { + "epoch": 18.54781054512958, + "grad_norm": 2.0168913579254877e-06, + "learning_rate": 7.988656793489563e-07, + "loss": 0.0, + "num_input_tokens_seen": 72254088, + "step": 124530 + }, + { + "epoch": 18.54855525767054, + "grad_norm": 2.328538585061324e-06, + "learning_rate": 7.980510097528815e-07, + "loss": 0.0, + "num_input_tokens_seen": 72257352, + "step": 124535 + }, + { + "epoch": 18.5492999702115, + "grad_norm": 7.100933726178482e-05, + "learning_rate": 7.972367490241412e-07, + "loss": 0.0, + "num_input_tokens_seen": 72260456, + "step": 124540 + }, + { + "epoch": 18.550044682752457, + "grad_norm": 3.629163529694779e-06, + "learning_rate": 7.964228971764826e-07, + "loss": 0.0, + "num_input_tokens_seen": 72263240, + "step": 124545 + }, + { + "epoch": 18.550789395293418, + "grad_norm": 7.683649164391682e-05, + "learning_rate": 7.956094542236642e-07, + "loss": 0.0, + "num_input_tokens_seen": 72266120, + "step": 124550 + }, + { + "epoch": 18.551534107834375, + "grad_norm": 1.0528805432841182e-05, + "learning_rate": 7.947964201794223e-07, + "loss": 0.0, + "num_input_tokens_seen": 72269160, + "step": 124555 + }, + { + "epoch": 18.552278820375335, + "grad_norm": 1.1473027370811906e-05, + "learning_rate": 7.93983795057493e-07, + "loss": 0.0, + "num_input_tokens_seen": 72271848, + "step": 124560 + }, + { + "epoch": 18.553023532916296, + "grad_norm": 2.932804136435152e-06, + "learning_rate": 7.931715788716071e-07, + "loss": 0.0, + "num_input_tokens_seen": 72274696, + "step": 124565 + }, + { + "epoch": 18.553768245457253, + "grad_norm": 0.0021123727783560753, + "learning_rate": 7.923597716354841e-07, + "loss": 0.0, + "num_input_tokens_seen": 72277512, + "step": 124570 + }, + { + "epoch": 18.554512957998213, + "grad_norm": 8.585208888689522e-06, + "learning_rate": 7.915483733628382e-07, + "loss": 0.0, + "num_input_tokens_seen": 72280552, + "step": 124575 + }, + { + "epoch": 18.55525767053917, + "grad_norm": 6.0478241721284576e-06, + "learning_rate": 7.907373840673804e-07, + "loss": 0.0, + "num_input_tokens_seen": 72283464, + "step": 124580 + }, + { + "epoch": 18.55600238308013, + "grad_norm": 0.0009389043552801013, + "learning_rate": 7.899268037628082e-07, + "loss": 0.0, + "num_input_tokens_seen": 72286600, + "step": 124585 + }, + { + "epoch": 18.55674709562109, + "grad_norm": 4.253956376487622e-06, + "learning_rate": 7.891166324628163e-07, + "loss": 0.0, + "num_input_tokens_seen": 72289448, + "step": 124590 + }, + { + "epoch": 18.557491808162048, + "grad_norm": 4.795228596776724e-05, + "learning_rate": 7.883068701810936e-07, + "loss": 0.0, + "num_input_tokens_seen": 72292328, + "step": 124595 + }, + { + "epoch": 18.55823652070301, + "grad_norm": 1.9385761333978735e-05, + "learning_rate": 7.874975169313181e-07, + "loss": 0.0, + "num_input_tokens_seen": 72295784, + "step": 124600 + }, + { + "epoch": 18.55898123324397, + "grad_norm": 1.8949152718050755e-06, + "learning_rate": 7.866885727271594e-07, + "loss": 0.0, + "num_input_tokens_seen": 72298984, + "step": 124605 + }, + { + "epoch": 18.559725945784926, + "grad_norm": 0.00042672440758906305, + "learning_rate": 7.858800375822928e-07, + "loss": 0.0, + "num_input_tokens_seen": 72301896, + "step": 124610 + }, + { + "epoch": 18.560470658325887, + "grad_norm": 6.546863460243912e-06, + "learning_rate": 7.850719115103683e-07, + "loss": 0.0, + "num_input_tokens_seen": 72304968, + "step": 124615 + }, + { + "epoch": 18.561215370866844, + "grad_norm": 3.5829043554258533e-06, + "learning_rate": 7.842641945250473e-07, + "loss": 0.0, + "num_input_tokens_seen": 72307720, + "step": 124620 + }, + { + "epoch": 18.561960083407804, + "grad_norm": 7.436154191964306e-06, + "learning_rate": 7.834568866399688e-07, + "loss": 0.0, + "num_input_tokens_seen": 72310376, + "step": 124625 + }, + { + "epoch": 18.562704795948765, + "grad_norm": 1.0127535460924264e-05, + "learning_rate": 7.826499878687749e-07, + "loss": 0.0, + "num_input_tokens_seen": 72313256, + "step": 124630 + }, + { + "epoch": 18.56344950848972, + "grad_norm": 4.822094979317626e-06, + "learning_rate": 7.81843498225096e-07, + "loss": 0.0646, + "num_input_tokens_seen": 72316232, + "step": 124635 + }, + { + "epoch": 18.564194221030682, + "grad_norm": 6.453085916291457e-06, + "learning_rate": 7.810374177225549e-07, + "loss": 0.0, + "num_input_tokens_seen": 72319336, + "step": 124640 + }, + { + "epoch": 18.564938933571643, + "grad_norm": 5.066194717073813e-06, + "learning_rate": 7.802317463747738e-07, + "loss": 0.0, + "num_input_tokens_seen": 72322312, + "step": 124645 + }, + { + "epoch": 18.5656836461126, + "grad_norm": 2.7734686227631755e-06, + "learning_rate": 7.794264841953613e-07, + "loss": 0.0, + "num_input_tokens_seen": 72325032, + "step": 124650 + }, + { + "epoch": 18.56642835865356, + "grad_norm": 2.668890192580875e-05, + "learning_rate": 7.786216311979233e-07, + "loss": 0.0, + "num_input_tokens_seen": 72327912, + "step": 124655 + }, + { + "epoch": 18.567173071194517, + "grad_norm": 3.2449997888761573e-06, + "learning_rate": 7.778171873960516e-07, + "loss": 0.0, + "num_input_tokens_seen": 72330792, + "step": 124660 + }, + { + "epoch": 18.567917783735478, + "grad_norm": 9.900402801577002e-06, + "learning_rate": 7.770131528033409e-07, + "loss": 0.0, + "num_input_tokens_seen": 72333736, + "step": 124665 + }, + { + "epoch": 18.56866249627644, + "grad_norm": 2.4531443614250747e-06, + "learning_rate": 7.762095274333747e-07, + "loss": 0.0, + "num_input_tokens_seen": 72336360, + "step": 124670 + }, + { + "epoch": 18.569407208817395, + "grad_norm": 1.1752620594052132e-05, + "learning_rate": 7.754063112997284e-07, + "loss": 0.0, + "num_input_tokens_seen": 72339112, + "step": 124675 + }, + { + "epoch": 18.570151921358356, + "grad_norm": 2.1921619008935522e-06, + "learning_rate": 7.746035044159688e-07, + "loss": 0.0, + "num_input_tokens_seen": 72342184, + "step": 124680 + }, + { + "epoch": 18.570896633899316, + "grad_norm": 2.4223212676588446e-05, + "learning_rate": 7.738011067956658e-07, + "loss": 0.0, + "num_input_tokens_seen": 72345032, + "step": 124685 + }, + { + "epoch": 18.571641346440273, + "grad_norm": 0.0002740657946560532, + "learning_rate": 7.729991184523722e-07, + "loss": 0.0, + "num_input_tokens_seen": 72347880, + "step": 124690 + }, + { + "epoch": 18.572386058981234, + "grad_norm": 3.887364982801955e-06, + "learning_rate": 7.7219753939963e-07, + "loss": 0.0, + "num_input_tokens_seen": 72350920, + "step": 124695 + }, + { + "epoch": 18.57313077152219, + "grad_norm": 4.309390533308033e-06, + "learning_rate": 7.713963696509896e-07, + "loss": 0.0328, + "num_input_tokens_seen": 72353960, + "step": 124700 + }, + { + "epoch": 18.57387548406315, + "grad_norm": 2.463058081048075e-06, + "learning_rate": 7.705956092199818e-07, + "loss": 0.0, + "num_input_tokens_seen": 72357288, + "step": 124705 + }, + { + "epoch": 18.574620196604112, + "grad_norm": 3.1805464004719397e-06, + "learning_rate": 7.697952581201373e-07, + "loss": 0.0, + "num_input_tokens_seen": 72360072, + "step": 124710 + }, + { + "epoch": 18.57536490914507, + "grad_norm": 1.777098646016384e-06, + "learning_rate": 7.689953163649704e-07, + "loss": 0.0, + "num_input_tokens_seen": 72362824, + "step": 124715 + }, + { + "epoch": 18.57610962168603, + "grad_norm": 1.882041260614642e-06, + "learning_rate": 7.681957839680065e-07, + "loss": 0.0, + "num_input_tokens_seen": 72365608, + "step": 124720 + }, + { + "epoch": 18.57685433422699, + "grad_norm": 3.399530760361813e-05, + "learning_rate": 7.67396660942743e-07, + "loss": 0.0, + "num_input_tokens_seen": 72368616, + "step": 124725 + }, + { + "epoch": 18.577599046767947, + "grad_norm": 8.86622910911683e-06, + "learning_rate": 7.66597947302683e-07, + "loss": 0.0, + "num_input_tokens_seen": 72371592, + "step": 124730 + }, + { + "epoch": 18.578343759308908, + "grad_norm": 9.668709026300348e-06, + "learning_rate": 7.65799643061324e-07, + "loss": 0.0, + "num_input_tokens_seen": 72374216, + "step": 124735 + }, + { + "epoch": 18.579088471849865, + "grad_norm": 1.9292798242531717e-05, + "learning_rate": 7.65001748232147e-07, + "loss": 0.0, + "num_input_tokens_seen": 72377064, + "step": 124740 + }, + { + "epoch": 18.579833184390825, + "grad_norm": 2.243408971480676e-06, + "learning_rate": 7.642042628286355e-07, + "loss": 0.0, + "num_input_tokens_seen": 72380008, + "step": 124745 + }, + { + "epoch": 18.580577896931786, + "grad_norm": 5.9204471654084045e-06, + "learning_rate": 7.634071868642595e-07, + "loss": 0.0, + "num_input_tokens_seen": 72382920, + "step": 124750 + }, + { + "epoch": 18.581322609472743, + "grad_norm": 4.91203945784946e-06, + "learning_rate": 7.626105203524886e-07, + "loss": 0.0, + "num_input_tokens_seen": 72385704, + "step": 124755 + }, + { + "epoch": 18.582067322013703, + "grad_norm": 2.8411634048097767e-05, + "learning_rate": 7.61814263306776e-07, + "loss": 0.0, + "num_input_tokens_seen": 72388200, + "step": 124760 + }, + { + "epoch": 18.582812034554664, + "grad_norm": 7.83260566095123e-06, + "learning_rate": 7.610184157405803e-07, + "loss": 0.0, + "num_input_tokens_seen": 72391240, + "step": 124765 + }, + { + "epoch": 18.58355674709562, + "grad_norm": 9.042846068041399e-05, + "learning_rate": 7.602229776673409e-07, + "loss": 0.0, + "num_input_tokens_seen": 72394248, + "step": 124770 + }, + { + "epoch": 18.58430145963658, + "grad_norm": 3.0283924843388377e-06, + "learning_rate": 7.594279491004997e-07, + "loss": 0.0, + "num_input_tokens_seen": 72397224, + "step": 124775 + }, + { + "epoch": 18.585046172177538, + "grad_norm": 2.685850631678477e-05, + "learning_rate": 7.586333300534876e-07, + "loss": 0.0, + "num_input_tokens_seen": 72400360, + "step": 124780 + }, + { + "epoch": 18.5857908847185, + "grad_norm": 5.297565621731337e-06, + "learning_rate": 7.578391205397218e-07, + "loss": 0.0, + "num_input_tokens_seen": 72403304, + "step": 124785 + }, + { + "epoch": 18.58653559725946, + "grad_norm": 2.781347393465694e-05, + "learning_rate": 7.570453205726303e-07, + "loss": 0.0, + "num_input_tokens_seen": 72406056, + "step": 124790 + }, + { + "epoch": 18.587280309800416, + "grad_norm": 2.3475940906791948e-06, + "learning_rate": 7.562519301656162e-07, + "loss": 0.0, + "num_input_tokens_seen": 72408840, + "step": 124795 + }, + { + "epoch": 18.588025022341377, + "grad_norm": 5.039400093664881e-06, + "learning_rate": 7.554589493320885e-07, + "loss": 0.0002, + "num_input_tokens_seen": 72411880, + "step": 124800 + }, + { + "epoch": 18.588769734882334, + "grad_norm": 5.078302365291165e-06, + "learning_rate": 7.54666378085439e-07, + "loss": 0.0, + "num_input_tokens_seen": 72414856, + "step": 124805 + }, + { + "epoch": 18.589514447423294, + "grad_norm": 5.914164830755908e-06, + "learning_rate": 7.538742164390572e-07, + "loss": 0.0, + "num_input_tokens_seen": 72417896, + "step": 124810 + }, + { + "epoch": 18.590259159964255, + "grad_norm": 4.6584477786382195e-06, + "learning_rate": 7.530824644063295e-07, + "loss": 0.0, + "num_input_tokens_seen": 72420840, + "step": 124815 + }, + { + "epoch": 18.591003872505212, + "grad_norm": 0.00015036796685308218, + "learning_rate": 7.522911220006285e-07, + "loss": 0.0, + "num_input_tokens_seen": 72423528, + "step": 124820 + }, + { + "epoch": 18.591748585046172, + "grad_norm": 1.2961826541868504e-05, + "learning_rate": 7.515001892353268e-07, + "loss": 0.0, + "num_input_tokens_seen": 72426568, + "step": 124825 + }, + { + "epoch": 18.592493297587133, + "grad_norm": 3.0564447115466464e-06, + "learning_rate": 7.507096661237834e-07, + "loss": 0.0, + "num_input_tokens_seen": 72429288, + "step": 124830 + }, + { + "epoch": 18.59323801012809, + "grad_norm": 3.235544136259705e-05, + "learning_rate": 7.499195526793567e-07, + "loss": 0.0, + "num_input_tokens_seen": 72432072, + "step": 124835 + }, + { + "epoch": 18.59398272266905, + "grad_norm": 1.6518972188350745e-05, + "learning_rate": 7.491298489153919e-07, + "loss": 0.0, + "num_input_tokens_seen": 72435176, + "step": 124840 + }, + { + "epoch": 18.594727435210007, + "grad_norm": 1.5188737052085344e-05, + "learning_rate": 7.483405548452283e-07, + "loss": 0.0, + "num_input_tokens_seen": 72438088, + "step": 124845 + }, + { + "epoch": 18.595472147750968, + "grad_norm": 2.692609314181027e-06, + "learning_rate": 7.47551670482205e-07, + "loss": 0.0, + "num_input_tokens_seen": 72440808, + "step": 124850 + }, + { + "epoch": 18.59621686029193, + "grad_norm": 7.977944733283948e-06, + "learning_rate": 7.467631958396448e-07, + "loss": 0.0, + "num_input_tokens_seen": 72443880, + "step": 124855 + }, + { + "epoch": 18.596961572832885, + "grad_norm": 6.5225667640333995e-06, + "learning_rate": 7.459751309308733e-07, + "loss": 0.0, + "num_input_tokens_seen": 72446792, + "step": 124860 + }, + { + "epoch": 18.597706285373846, + "grad_norm": 2.7709456844604574e-05, + "learning_rate": 7.451874757691991e-07, + "loss": 0.0125, + "num_input_tokens_seen": 72449736, + "step": 124865 + }, + { + "epoch": 18.598450997914806, + "grad_norm": 6.063454566174187e-05, + "learning_rate": 7.444002303679309e-07, + "loss": 0.0, + "num_input_tokens_seen": 72452584, + "step": 124870 + }, + { + "epoch": 18.599195710455763, + "grad_norm": 2.0393408703966998e-05, + "learning_rate": 7.436133947403695e-07, + "loss": 0.0, + "num_input_tokens_seen": 72455592, + "step": 124875 + }, + { + "epoch": 18.599940422996724, + "grad_norm": 1.5677861711083096e-06, + "learning_rate": 7.428269688998068e-07, + "loss": 0.0, + "num_input_tokens_seen": 72458440, + "step": 124880 + }, + { + "epoch": 18.60068513553768, + "grad_norm": 2.1637650206685066e-05, + "learning_rate": 7.420409528595296e-07, + "loss": 0.0, + "num_input_tokens_seen": 72461224, + "step": 124885 + }, + { + "epoch": 18.60142984807864, + "grad_norm": 4.656783858081326e-06, + "learning_rate": 7.412553466328131e-07, + "loss": 0.0, + "num_input_tokens_seen": 72464232, + "step": 124890 + }, + { + "epoch": 18.602174560619602, + "grad_norm": 2.25258850150567e-06, + "learning_rate": 7.404701502329331e-07, + "loss": 0.0, + "num_input_tokens_seen": 72466792, + "step": 124895 + }, + { + "epoch": 18.60291927316056, + "grad_norm": 8.491359039908275e-05, + "learning_rate": 7.396853636731537e-07, + "loss": 0.0, + "num_input_tokens_seen": 72469512, + "step": 124900 + }, + { + "epoch": 18.60366398570152, + "grad_norm": 1.8904493117588572e-05, + "learning_rate": 7.389009869667341e-07, + "loss": 0.0, + "num_input_tokens_seen": 72472744, + "step": 124905 + }, + { + "epoch": 18.60440869824248, + "grad_norm": 4.972084752807859e-06, + "learning_rate": 7.381170201269244e-07, + "loss": 0.0, + "num_input_tokens_seen": 72475592, + "step": 124910 + }, + { + "epoch": 18.605153410783437, + "grad_norm": 2.248842065455392e-06, + "learning_rate": 7.373334631669698e-07, + "loss": 0.0, + "num_input_tokens_seen": 72478568, + "step": 124915 + }, + { + "epoch": 18.605898123324398, + "grad_norm": 3.406666291994043e-05, + "learning_rate": 7.365503161001013e-07, + "loss": 0.0, + "num_input_tokens_seen": 72481384, + "step": 124920 + }, + { + "epoch": 18.606642835865355, + "grad_norm": 2.0355896594992373e-06, + "learning_rate": 7.357675789395613e-07, + "loss": 0.0, + "num_input_tokens_seen": 72484104, + "step": 124925 + }, + { + "epoch": 18.607387548406315, + "grad_norm": 2.831761548804934e-06, + "learning_rate": 7.349852516985639e-07, + "loss": 0.0, + "num_input_tokens_seen": 72486920, + "step": 124930 + }, + { + "epoch": 18.608132260947276, + "grad_norm": 3.349893404447357e-06, + "learning_rate": 7.342033343903293e-07, + "loss": 0.0, + "num_input_tokens_seen": 72489896, + "step": 124935 + }, + { + "epoch": 18.608876973488233, + "grad_norm": 3.8456050788227e-06, + "learning_rate": 7.33421827028069e-07, + "loss": 0.0, + "num_input_tokens_seen": 72492936, + "step": 124940 + }, + { + "epoch": 18.609621686029193, + "grad_norm": 0.0001588146114954725, + "learning_rate": 7.326407296249782e-07, + "loss": 0.0, + "num_input_tokens_seen": 72496104, + "step": 124945 + }, + { + "epoch": 18.61036639857015, + "grad_norm": 9.87688599707326e-06, + "learning_rate": 7.318600421942628e-07, + "loss": 0.0, + "num_input_tokens_seen": 72498952, + "step": 124950 + }, + { + "epoch": 18.61111111111111, + "grad_norm": 3.6255237318982836e-06, + "learning_rate": 7.310797647491041e-07, + "loss": 0.0, + "num_input_tokens_seen": 72501864, + "step": 124955 + }, + { + "epoch": 18.61185582365207, + "grad_norm": 2.9097695914970245e-06, + "learning_rate": 7.302998973026887e-07, + "loss": 0.0, + "num_input_tokens_seen": 72504840, + "step": 124960 + }, + { + "epoch": 18.61260053619303, + "grad_norm": 2.2156753402668983e-05, + "learning_rate": 7.295204398681893e-07, + "loss": 0.0, + "num_input_tokens_seen": 72507816, + "step": 124965 + }, + { + "epoch": 18.61334524873399, + "grad_norm": 7.295599061762914e-06, + "learning_rate": 7.287413924587733e-07, + "loss": 0.0, + "num_input_tokens_seen": 72510600, + "step": 124970 + }, + { + "epoch": 18.61408996127495, + "grad_norm": 3.183994704158977e-05, + "learning_rate": 7.279627550876051e-07, + "loss": 0.0, + "num_input_tokens_seen": 72513512, + "step": 124975 + }, + { + "epoch": 18.614834673815906, + "grad_norm": 4.435949449543841e-06, + "learning_rate": 7.27184527767838e-07, + "loss": 0.0, + "num_input_tokens_seen": 72516296, + "step": 124980 + }, + { + "epoch": 18.615579386356867, + "grad_norm": 8.62818160385359e-06, + "learning_rate": 7.264067105126199e-07, + "loss": 0.0, + "num_input_tokens_seen": 72519048, + "step": 124985 + }, + { + "epoch": 18.616324098897824, + "grad_norm": 1.3124761608196422e-05, + "learning_rate": 7.256293033350847e-07, + "loss": 0.0, + "num_input_tokens_seen": 72522088, + "step": 124990 + }, + { + "epoch": 18.617068811438784, + "grad_norm": 3.7294576031854376e-05, + "learning_rate": 7.248523062483748e-07, + "loss": 0.0, + "num_input_tokens_seen": 72525096, + "step": 124995 + }, + { + "epoch": 18.617813523979745, + "grad_norm": 5.435761977423681e-06, + "learning_rate": 7.2407571926561e-07, + "loss": 0.0, + "num_input_tokens_seen": 72528008, + "step": 125000 + }, + { + "epoch": 18.618558236520702, + "grad_norm": 3.2390846627095016e-06, + "learning_rate": 7.232995423999162e-07, + "loss": 0.0, + "num_input_tokens_seen": 72531048, + "step": 125005 + }, + { + "epoch": 18.619302949061662, + "grad_norm": 5.5513851293653715e-06, + "learning_rate": 7.225237756644021e-07, + "loss": 0.0, + "num_input_tokens_seen": 72533800, + "step": 125010 + }, + { + "epoch": 18.620047661602623, + "grad_norm": 4.543529030343052e-06, + "learning_rate": 7.217484190721712e-07, + "loss": 0.0, + "num_input_tokens_seen": 72536552, + "step": 125015 + }, + { + "epoch": 18.62079237414358, + "grad_norm": 4.147825529798865e-05, + "learning_rate": 7.209734726363299e-07, + "loss": 0.0, + "num_input_tokens_seen": 72539112, + "step": 125020 + }, + { + "epoch": 18.62153708668454, + "grad_norm": 3.3663086469459813e-06, + "learning_rate": 7.201989363699618e-07, + "loss": 0.0, + "num_input_tokens_seen": 72542312, + "step": 125025 + }, + { + "epoch": 18.622281799225497, + "grad_norm": 2.072925326501718e-06, + "learning_rate": 7.194248102861594e-07, + "loss": 0.0, + "num_input_tokens_seen": 72545256, + "step": 125030 + }, + { + "epoch": 18.623026511766458, + "grad_norm": 0.00907036941498518, + "learning_rate": 7.186510943979957e-07, + "loss": 0.0, + "num_input_tokens_seen": 72548296, + "step": 125035 + }, + { + "epoch": 18.62377122430742, + "grad_norm": 2.8190702323627193e-06, + "learning_rate": 7.178777887185434e-07, + "loss": 0.0, + "num_input_tokens_seen": 72551080, + "step": 125040 + }, + { + "epoch": 18.624515936848375, + "grad_norm": 5.200014129513875e-06, + "learning_rate": 7.17104893260867e-07, + "loss": 0.0, + "num_input_tokens_seen": 72554120, + "step": 125045 + }, + { + "epoch": 18.625260649389336, + "grad_norm": 1.8252915197081165e-06, + "learning_rate": 7.16332408038023e-07, + "loss": 0.0, + "num_input_tokens_seen": 72556744, + "step": 125050 + }, + { + "epoch": 18.626005361930297, + "grad_norm": 4.4246294237382244e-06, + "learning_rate": 7.155603330630617e-07, + "loss": 0.0, + "num_input_tokens_seen": 72559624, + "step": 125055 + }, + { + "epoch": 18.626750074471254, + "grad_norm": 3.7528950542764505e-06, + "learning_rate": 7.147886683490256e-07, + "loss": 0.0, + "num_input_tokens_seen": 72562440, + "step": 125060 + }, + { + "epoch": 18.627494787012214, + "grad_norm": 3.632163134170696e-05, + "learning_rate": 7.140174139089545e-07, + "loss": 0.0, + "num_input_tokens_seen": 72564968, + "step": 125065 + }, + { + "epoch": 18.62823949955317, + "grad_norm": 3.2986801670631394e-05, + "learning_rate": 7.132465697558737e-07, + "loss": 0.0, + "num_input_tokens_seen": 72567592, + "step": 125070 + }, + { + "epoch": 18.62898421209413, + "grad_norm": 2.4670143830007873e-06, + "learning_rate": 7.124761359028121e-07, + "loss": 0.0, + "num_input_tokens_seen": 72570728, + "step": 125075 + }, + { + "epoch": 18.629728924635092, + "grad_norm": 1.244727991434047e-05, + "learning_rate": 7.117061123627783e-07, + "loss": 0.0, + "num_input_tokens_seen": 72573672, + "step": 125080 + }, + { + "epoch": 18.63047363717605, + "grad_norm": 3.073887228310923e-06, + "learning_rate": 7.109364991487872e-07, + "loss": 0.0, + "num_input_tokens_seen": 72576392, + "step": 125085 + }, + { + "epoch": 18.63121834971701, + "grad_norm": 6.162228237371892e-05, + "learning_rate": 7.101672962738365e-07, + "loss": 0.0, + "num_input_tokens_seen": 72579464, + "step": 125090 + }, + { + "epoch": 18.631963062257967, + "grad_norm": 0.06849255412817001, + "learning_rate": 7.093985037509188e-07, + "loss": 0.0001, + "num_input_tokens_seen": 72582312, + "step": 125095 + }, + { + "epoch": 18.632707774798927, + "grad_norm": 4.194789653411135e-06, + "learning_rate": 7.086301215930291e-07, + "loss": 0.0, + "num_input_tokens_seen": 72585000, + "step": 125100 + }, + { + "epoch": 18.633452487339888, + "grad_norm": 2.2000203898642212e-06, + "learning_rate": 7.078621498131461e-07, + "loss": 0.0, + "num_input_tokens_seen": 72587848, + "step": 125105 + }, + { + "epoch": 18.634197199880845, + "grad_norm": 3.7359427551564295e-06, + "learning_rate": 7.070945884242397e-07, + "loss": 0.0, + "num_input_tokens_seen": 72590760, + "step": 125110 + }, + { + "epoch": 18.634941912421805, + "grad_norm": 0.029173705726861954, + "learning_rate": 7.063274374392803e-07, + "loss": 0.0001, + "num_input_tokens_seen": 72593512, + "step": 125115 + }, + { + "epoch": 18.635686624962766, + "grad_norm": 4.633601747627836e-06, + "learning_rate": 7.055606968712297e-07, + "loss": 0.0, + "num_input_tokens_seen": 72596712, + "step": 125120 + }, + { + "epoch": 18.636431337503723, + "grad_norm": 9.803587090573274e-06, + "learning_rate": 7.047943667330386e-07, + "loss": 0.0, + "num_input_tokens_seen": 72599528, + "step": 125125 + }, + { + "epoch": 18.637176050044683, + "grad_norm": 1.0319400644220877e-05, + "learning_rate": 7.040284470376523e-07, + "loss": 0.0, + "num_input_tokens_seen": 72602376, + "step": 125130 + }, + { + "epoch": 18.63792076258564, + "grad_norm": 6.485275662271306e-06, + "learning_rate": 7.032629377980133e-07, + "loss": 0.0, + "num_input_tokens_seen": 72605224, + "step": 125135 + }, + { + "epoch": 18.6386654751266, + "grad_norm": 7.251903753058286e-06, + "learning_rate": 7.024978390270526e-07, + "loss": 0.0, + "num_input_tokens_seen": 72607976, + "step": 125140 + }, + { + "epoch": 18.63941018766756, + "grad_norm": 2.159640280297026e-05, + "learning_rate": 7.017331507376962e-07, + "loss": 0.0, + "num_input_tokens_seen": 72610824, + "step": 125145 + }, + { + "epoch": 18.64015490020852, + "grad_norm": 2.38384473050246e-05, + "learning_rate": 7.009688729428615e-07, + "loss": 0.0, + "num_input_tokens_seen": 72613640, + "step": 125150 + }, + { + "epoch": 18.64089961274948, + "grad_norm": 3.74616183762555e-06, + "learning_rate": 7.002050056554632e-07, + "loss": 0.0, + "num_input_tokens_seen": 72616808, + "step": 125155 + }, + { + "epoch": 18.64164432529044, + "grad_norm": 1.6261963537544943e-05, + "learning_rate": 6.994415488884021e-07, + "loss": 0.0, + "num_input_tokens_seen": 72619592, + "step": 125160 + }, + { + "epoch": 18.642389037831396, + "grad_norm": 4.295183316571638e-06, + "learning_rate": 6.986785026545789e-07, + "loss": 0.0011, + "num_input_tokens_seen": 72622472, + "step": 125165 + }, + { + "epoch": 18.643133750372357, + "grad_norm": 5.385119948186912e-06, + "learning_rate": 6.979158669668862e-07, + "loss": 0.0, + "num_input_tokens_seen": 72625352, + "step": 125170 + }, + { + "epoch": 18.643878462913314, + "grad_norm": 9.953334483725484e-06, + "learning_rate": 6.971536418382052e-07, + "loss": 0.0, + "num_input_tokens_seen": 72628232, + "step": 125175 + }, + { + "epoch": 18.644623175454274, + "grad_norm": 1.4138030564936344e-05, + "learning_rate": 6.963918272814119e-07, + "loss": 0.0, + "num_input_tokens_seen": 72631432, + "step": 125180 + }, + { + "epoch": 18.645367887995235, + "grad_norm": 3.9878224924905226e-05, + "learning_rate": 6.956304233093736e-07, + "loss": 0.0, + "num_input_tokens_seen": 72634280, + "step": 125185 + }, + { + "epoch": 18.646112600536192, + "grad_norm": 4.544851890386781e-06, + "learning_rate": 6.948694299349634e-07, + "loss": 0.0, + "num_input_tokens_seen": 72637160, + "step": 125190 + }, + { + "epoch": 18.646857313077152, + "grad_norm": 4.781690677191364e-06, + "learning_rate": 6.941088471710266e-07, + "loss": 0.0, + "num_input_tokens_seen": 72640136, + "step": 125195 + }, + { + "epoch": 18.647602025618113, + "grad_norm": 3.474545337667223e-06, + "learning_rate": 6.933486750304197e-07, + "loss": 0.0, + "num_input_tokens_seen": 72642984, + "step": 125200 + }, + { + "epoch": 18.64834673815907, + "grad_norm": 0.00012705130211543292, + "learning_rate": 6.92588913525985e-07, + "loss": 0.0, + "num_input_tokens_seen": 72645928, + "step": 125205 + }, + { + "epoch": 18.64909145070003, + "grad_norm": 4.573550540953875e-06, + "learning_rate": 6.918295626705512e-07, + "loss": 0.0, + "num_input_tokens_seen": 72648872, + "step": 125210 + }, + { + "epoch": 18.649836163240987, + "grad_norm": 2.1631872186844703e-06, + "learning_rate": 6.910706224769553e-07, + "loss": 0.0, + "num_input_tokens_seen": 72651560, + "step": 125215 + }, + { + "epoch": 18.650580875781948, + "grad_norm": 4.936559889756609e-06, + "learning_rate": 6.903120929580092e-07, + "loss": 0.0, + "num_input_tokens_seen": 72654440, + "step": 125220 + }, + { + "epoch": 18.65132558832291, + "grad_norm": 0.00027825060533359647, + "learning_rate": 6.895539741265389e-07, + "loss": 0.0, + "num_input_tokens_seen": 72657064, + "step": 125225 + }, + { + "epoch": 18.652070300863866, + "grad_norm": 2.1385892978287302e-05, + "learning_rate": 6.887962659953423e-07, + "loss": 0.0, + "num_input_tokens_seen": 72660136, + "step": 125230 + }, + { + "epoch": 18.652815013404826, + "grad_norm": 2.328347363800276e-06, + "learning_rate": 6.88038968577226e-07, + "loss": 0.0, + "num_input_tokens_seen": 72662824, + "step": 125235 + }, + { + "epoch": 18.653559725945787, + "grad_norm": 5.8868149608315434e-06, + "learning_rate": 6.872820818849823e-07, + "loss": 0.0, + "num_input_tokens_seen": 72665896, + "step": 125240 + }, + { + "epoch": 18.654304438486744, + "grad_norm": 2.571166078269016e-06, + "learning_rate": 6.865256059313985e-07, + "loss": 0.0, + "num_input_tokens_seen": 72668680, + "step": 125245 + }, + { + "epoch": 18.655049151027704, + "grad_norm": 2.2051242467568954e-06, + "learning_rate": 6.857695407292503e-07, + "loss": 0.0, + "num_input_tokens_seen": 72671336, + "step": 125250 + }, + { + "epoch": 18.65579386356866, + "grad_norm": 5.1937581702077296e-06, + "learning_rate": 6.850138862913136e-07, + "loss": 0.0, + "num_input_tokens_seen": 72674088, + "step": 125255 + }, + { + "epoch": 18.65653857610962, + "grad_norm": 2.96401844934735e-06, + "learning_rate": 6.842586426303588e-07, + "loss": 0.0, + "num_input_tokens_seen": 72677128, + "step": 125260 + }, + { + "epoch": 18.657283288650582, + "grad_norm": 4.225020529702306e-05, + "learning_rate": 6.835038097591367e-07, + "loss": 0.0, + "num_input_tokens_seen": 72680008, + "step": 125265 + }, + { + "epoch": 18.65802800119154, + "grad_norm": 2.471027300998685e-06, + "learning_rate": 6.82749387690404e-07, + "loss": 0.0, + "num_input_tokens_seen": 72682696, + "step": 125270 + }, + { + "epoch": 18.6587727137325, + "grad_norm": 1.9054825315834023e-05, + "learning_rate": 6.819953764369058e-07, + "loss": 0.0, + "num_input_tokens_seen": 72685672, + "step": 125275 + }, + { + "epoch": 18.65951742627346, + "grad_norm": 4.3658623326336965e-05, + "learning_rate": 6.812417760113821e-07, + "loss": 0.0, + "num_input_tokens_seen": 72688776, + "step": 125280 + }, + { + "epoch": 18.660262138814417, + "grad_norm": 3.3172877920151222e-06, + "learning_rate": 6.804885864265587e-07, + "loss": 0.0, + "num_input_tokens_seen": 72691848, + "step": 125285 + }, + { + "epoch": 18.661006851355378, + "grad_norm": 5.423313632491045e-05, + "learning_rate": 6.79735807695167e-07, + "loss": 0.0, + "num_input_tokens_seen": 72694760, + "step": 125290 + }, + { + "epoch": 18.661751563896335, + "grad_norm": 1.095725383493118e-05, + "learning_rate": 6.789834398299194e-07, + "loss": 0.0, + "num_input_tokens_seen": 72697384, + "step": 125295 + }, + { + "epoch": 18.662496276437295, + "grad_norm": 3.2363524951506406e-05, + "learning_rate": 6.782314828435249e-07, + "loss": 0.0, + "num_input_tokens_seen": 72700328, + "step": 125300 + }, + { + "epoch": 18.663240988978256, + "grad_norm": 0.000554886064492166, + "learning_rate": 6.774799367486956e-07, + "loss": 0.0, + "num_input_tokens_seen": 72703240, + "step": 125305 + }, + { + "epoch": 18.663985701519213, + "grad_norm": 2.4262542865471914e-05, + "learning_rate": 6.767288015581186e-07, + "loss": 0.0, + "num_input_tokens_seen": 72706440, + "step": 125310 + }, + { + "epoch": 18.664730414060173, + "grad_norm": 1.2665945178014226e-05, + "learning_rate": 6.759780772844892e-07, + "loss": 0.0229, + "num_input_tokens_seen": 72709256, + "step": 125315 + }, + { + "epoch": 18.66547512660113, + "grad_norm": 8.350382995558903e-05, + "learning_rate": 6.752277639404863e-07, + "loss": 0.0, + "num_input_tokens_seen": 72711976, + "step": 125320 + }, + { + "epoch": 18.66621983914209, + "grad_norm": 2.1419095901364926e-06, + "learning_rate": 6.744778615387914e-07, + "loss": 0.0, + "num_input_tokens_seen": 72714696, + "step": 125325 + }, + { + "epoch": 18.66696455168305, + "grad_norm": 2.354936668780283e-06, + "learning_rate": 6.737283700920666e-07, + "loss": 0.0, + "num_input_tokens_seen": 72717608, + "step": 125330 + }, + { + "epoch": 18.66770926422401, + "grad_norm": 3.681202770167147e-06, + "learning_rate": 6.729792896129767e-07, + "loss": 0.0, + "num_input_tokens_seen": 72720360, + "step": 125335 + }, + { + "epoch": 18.66845397676497, + "grad_norm": 2.9777095278404886e-06, + "learning_rate": 6.722306201141781e-07, + "loss": 0.0, + "num_input_tokens_seen": 72723080, + "step": 125340 + }, + { + "epoch": 18.66919868930593, + "grad_norm": 1.8576354705146514e-05, + "learning_rate": 6.714823616083165e-07, + "loss": 0.2281, + "num_input_tokens_seen": 72725928, + "step": 125345 + }, + { + "epoch": 18.669943401846886, + "grad_norm": 2.1384873889473965e-06, + "learning_rate": 6.707345141080345e-07, + "loss": 0.0, + "num_input_tokens_seen": 72728648, + "step": 125350 + }, + { + "epoch": 18.670688114387847, + "grad_norm": 6.819737791374791e-06, + "learning_rate": 6.699870776259637e-07, + "loss": 0.0, + "num_input_tokens_seen": 72731656, + "step": 125355 + }, + { + "epoch": 18.671432826928804, + "grad_norm": 1.4458612895396072e-05, + "learning_rate": 6.692400521747355e-07, + "loss": 0.0, + "num_input_tokens_seen": 72734632, + "step": 125360 + }, + { + "epoch": 18.672177539469764, + "grad_norm": 4.042900400236249e-06, + "learning_rate": 6.684934377669705e-07, + "loss": 0.0, + "num_input_tokens_seen": 72737480, + "step": 125365 + }, + { + "epoch": 18.672922252010725, + "grad_norm": 2.429984988339129e-06, + "learning_rate": 6.67747234415278e-07, + "loss": 0.0, + "num_input_tokens_seen": 72740584, + "step": 125370 + }, + { + "epoch": 18.673666964551682, + "grad_norm": 3.422415829845704e-06, + "learning_rate": 6.670014421322618e-07, + "loss": 0.0, + "num_input_tokens_seen": 72743592, + "step": 125375 + }, + { + "epoch": 18.674411677092642, + "grad_norm": 0.0001599590468686074, + "learning_rate": 6.662560609305285e-07, + "loss": 0.0, + "num_input_tokens_seen": 72746408, + "step": 125380 + }, + { + "epoch": 18.675156389633603, + "grad_norm": 3.4383645015623188e-06, + "learning_rate": 6.655110908226681e-07, + "loss": 0.0, + "num_input_tokens_seen": 72749192, + "step": 125385 + }, + { + "epoch": 18.67590110217456, + "grad_norm": 2.4992677936097607e-06, + "learning_rate": 6.647665318212621e-07, + "loss": 0.0, + "num_input_tokens_seen": 72751944, + "step": 125390 + }, + { + "epoch": 18.67664581471552, + "grad_norm": 4.6302051487145945e-05, + "learning_rate": 6.640223839388948e-07, + "loss": 0.0, + "num_input_tokens_seen": 72754824, + "step": 125395 + }, + { + "epoch": 18.677390527256478, + "grad_norm": 4.857481599174207e-06, + "learning_rate": 6.632786471881342e-07, + "loss": 0.0, + "num_input_tokens_seen": 72757736, + "step": 125400 + }, + { + "epoch": 18.678135239797438, + "grad_norm": 4.482042186282342e-06, + "learning_rate": 6.625353215815478e-07, + "loss": 0.0, + "num_input_tokens_seen": 72760520, + "step": 125405 + }, + { + "epoch": 18.6788799523384, + "grad_norm": 6.357412075885804e-06, + "learning_rate": 6.617924071316894e-07, + "loss": 0.0, + "num_input_tokens_seen": 72763432, + "step": 125410 + }, + { + "epoch": 18.679624664879356, + "grad_norm": 0.0016424772329628468, + "learning_rate": 6.610499038511131e-07, + "loss": 0.0, + "num_input_tokens_seen": 72766344, + "step": 125415 + }, + { + "epoch": 18.680369377420316, + "grad_norm": 5.5042064559529535e-06, + "learning_rate": 6.603078117523615e-07, + "loss": 0.0, + "num_input_tokens_seen": 72769096, + "step": 125420 + }, + { + "epoch": 18.681114089961277, + "grad_norm": 5.232679541222751e-06, + "learning_rate": 6.595661308479717e-07, + "loss": 0.0, + "num_input_tokens_seen": 72771784, + "step": 125425 + }, + { + "epoch": 18.681858802502234, + "grad_norm": 2.197345565946307e-05, + "learning_rate": 6.588248611504755e-07, + "loss": 0.0, + "num_input_tokens_seen": 72774632, + "step": 125430 + }, + { + "epoch": 18.682603515043194, + "grad_norm": 1.9538385913620004e-06, + "learning_rate": 6.580840026723934e-07, + "loss": 0.0, + "num_input_tokens_seen": 72777480, + "step": 125435 + }, + { + "epoch": 18.68334822758415, + "grad_norm": 2.300521146025858e-06, + "learning_rate": 6.573435554262403e-07, + "loss": 0.0, + "num_input_tokens_seen": 72780456, + "step": 125440 + }, + { + "epoch": 18.68409294012511, + "grad_norm": 1.421935849066358e-05, + "learning_rate": 6.566035194245257e-07, + "loss": 0.0, + "num_input_tokens_seen": 72783592, + "step": 125445 + }, + { + "epoch": 18.684837652666072, + "grad_norm": 1.799979327188339e-05, + "learning_rate": 6.558638946797563e-07, + "loss": 0.0, + "num_input_tokens_seen": 72786376, + "step": 125450 + }, + { + "epoch": 18.68558236520703, + "grad_norm": 2.3659260932618054e-06, + "learning_rate": 6.551246812044248e-07, + "loss": 0.0, + "num_input_tokens_seen": 72789512, + "step": 125455 + }, + { + "epoch": 18.68632707774799, + "grad_norm": 4.792802883457625e-06, + "learning_rate": 6.543858790110158e-07, + "loss": 0.0, + "num_input_tokens_seen": 72792456, + "step": 125460 + }, + { + "epoch": 18.687071790288947, + "grad_norm": 2.921845180026139e-06, + "learning_rate": 6.536474881120164e-07, + "loss": 0.0, + "num_input_tokens_seen": 72795368, + "step": 125465 + }, + { + "epoch": 18.687816502829907, + "grad_norm": 3.658379000626155e-06, + "learning_rate": 6.529095085198944e-07, + "loss": 0.0, + "num_input_tokens_seen": 72798408, + "step": 125470 + }, + { + "epoch": 18.688561215370868, + "grad_norm": 6.221549938345561e-06, + "learning_rate": 6.521719402471233e-07, + "loss": 0.0, + "num_input_tokens_seen": 72801128, + "step": 125475 + }, + { + "epoch": 18.689305927911825, + "grad_norm": 2.23660367737466e-06, + "learning_rate": 6.514347833061596e-07, + "loss": 0.0, + "num_input_tokens_seen": 72803944, + "step": 125480 + }, + { + "epoch": 18.690050640452785, + "grad_norm": 2.8715871849271934e-06, + "learning_rate": 6.506980377094601e-07, + "loss": 0.0, + "num_input_tokens_seen": 72806760, + "step": 125485 + }, + { + "epoch": 18.690795352993746, + "grad_norm": 0.0001861867494881153, + "learning_rate": 6.499617034694705e-07, + "loss": 0.0, + "num_input_tokens_seen": 72809544, + "step": 125490 + }, + { + "epoch": 18.691540065534703, + "grad_norm": 2.7883165785169695e-06, + "learning_rate": 6.492257805986279e-07, + "loss": 0.0, + "num_input_tokens_seen": 72812392, + "step": 125495 + }, + { + "epoch": 18.692284778075663, + "grad_norm": 1.288445037062047e-05, + "learning_rate": 6.48490269109367e-07, + "loss": 0.0, + "num_input_tokens_seen": 72815112, + "step": 125500 + }, + { + "epoch": 18.69302949061662, + "grad_norm": 4.969982455804711e-06, + "learning_rate": 6.477551690141165e-07, + "loss": 0.0, + "num_input_tokens_seen": 72818056, + "step": 125505 + }, + { + "epoch": 18.69377420315758, + "grad_norm": 4.327353053668048e-06, + "learning_rate": 6.470204803252888e-07, + "loss": 0.0, + "num_input_tokens_seen": 72820968, + "step": 125510 + }, + { + "epoch": 18.69451891569854, + "grad_norm": 0.00013789789227303118, + "learning_rate": 6.462862030552991e-07, + "loss": 0.0, + "num_input_tokens_seen": 72823816, + "step": 125515 + }, + { + "epoch": 18.6952636282395, + "grad_norm": 5.983312439639121e-05, + "learning_rate": 6.455523372165512e-07, + "loss": 0.0, + "num_input_tokens_seen": 72826696, + "step": 125520 + }, + { + "epoch": 18.69600834078046, + "grad_norm": 2.440945991111221e-06, + "learning_rate": 6.448188828214435e-07, + "loss": 0.0, + "num_input_tokens_seen": 72829544, + "step": 125525 + }, + { + "epoch": 18.69675305332142, + "grad_norm": 4.789931153936777e-06, + "learning_rate": 6.44085839882369e-07, + "loss": 0.0, + "num_input_tokens_seen": 72832360, + "step": 125530 + }, + { + "epoch": 18.697497765862376, + "grad_norm": 3.007671921295696e-06, + "learning_rate": 6.433532084117122e-07, + "loss": 0.0, + "num_input_tokens_seen": 72835176, + "step": 125535 + }, + { + "epoch": 18.698242478403337, + "grad_norm": 1.6862464690348133e-05, + "learning_rate": 6.426209884218437e-07, + "loss": 0.0, + "num_input_tokens_seen": 72838088, + "step": 125540 + }, + { + "epoch": 18.698987190944294, + "grad_norm": 4.232289211358875e-05, + "learning_rate": 6.418891799251397e-07, + "loss": 0.0, + "num_input_tokens_seen": 72840872, + "step": 125545 + }, + { + "epoch": 18.699731903485254, + "grad_norm": 2.6211655494989827e-06, + "learning_rate": 6.411577829339599e-07, + "loss": 0.0, + "num_input_tokens_seen": 72843656, + "step": 125550 + }, + { + "epoch": 18.700476616026215, + "grad_norm": 3.1107897484616842e-06, + "learning_rate": 6.404267974606637e-07, + "loss": 0.0, + "num_input_tokens_seen": 72846664, + "step": 125555 + }, + { + "epoch": 18.701221328567172, + "grad_norm": 0.00017764313088264316, + "learning_rate": 6.396962235175968e-07, + "loss": 0.0, + "num_input_tokens_seen": 72849512, + "step": 125560 + }, + { + "epoch": 18.701966041108133, + "grad_norm": 2.353669515287038e-06, + "learning_rate": 6.38966061117105e-07, + "loss": 0.0, + "num_input_tokens_seen": 72852488, + "step": 125565 + }, + { + "epoch": 18.702710753649093, + "grad_norm": 2.810764271998778e-05, + "learning_rate": 6.382363102715255e-07, + "loss": 0.0, + "num_input_tokens_seen": 72855080, + "step": 125570 + }, + { + "epoch": 18.70345546619005, + "grad_norm": 3.2109378480527084e-06, + "learning_rate": 6.375069709931792e-07, + "loss": 0.0, + "num_input_tokens_seen": 72857960, + "step": 125575 + }, + { + "epoch": 18.70420017873101, + "grad_norm": 2.422849320282694e-06, + "learning_rate": 6.367780432943948e-07, + "loss": 0.0, + "num_input_tokens_seen": 72861000, + "step": 125580 + }, + { + "epoch": 18.704944891271968, + "grad_norm": 2.5124352305283537e-06, + "learning_rate": 6.360495271874794e-07, + "loss": 0.0, + "num_input_tokens_seen": 72864104, + "step": 125585 + }, + { + "epoch": 18.705689603812928, + "grad_norm": 1.1030545465473551e-05, + "learning_rate": 6.353214226847482e-07, + "loss": 0.0001, + "num_input_tokens_seen": 72866888, + "step": 125590 + }, + { + "epoch": 18.70643431635389, + "grad_norm": 7.705553798587061e-06, + "learning_rate": 6.345937297984966e-07, + "loss": 0.0, + "num_input_tokens_seen": 72869704, + "step": 125595 + }, + { + "epoch": 18.707179028894846, + "grad_norm": 2.5190793166984804e-06, + "learning_rate": 6.338664485410206e-07, + "loss": 0.0, + "num_input_tokens_seen": 72872520, + "step": 125600 + }, + { + "epoch": 18.707923741435806, + "grad_norm": 2.014998608501628e-05, + "learning_rate": 6.331395789246048e-07, + "loss": 0.0, + "num_input_tokens_seen": 72875464, + "step": 125605 + }, + { + "epoch": 18.708668453976763, + "grad_norm": 3.8901957850612234e-06, + "learning_rate": 6.324131209615336e-07, + "loss": 0.0, + "num_input_tokens_seen": 72878248, + "step": 125610 + }, + { + "epoch": 18.709413166517724, + "grad_norm": 3.953114173782524e-06, + "learning_rate": 6.316870746640751e-07, + "loss": 0.0, + "num_input_tokens_seen": 72880968, + "step": 125615 + }, + { + "epoch": 18.710157879058684, + "grad_norm": 2.6958127818943467e-06, + "learning_rate": 6.309614400444946e-07, + "loss": 0.0, + "num_input_tokens_seen": 72883624, + "step": 125620 + }, + { + "epoch": 18.71090259159964, + "grad_norm": 6.120036414358765e-05, + "learning_rate": 6.302362171150572e-07, + "loss": 0.0, + "num_input_tokens_seen": 72886472, + "step": 125625 + }, + { + "epoch": 18.7116473041406, + "grad_norm": 4.162988261668943e-05, + "learning_rate": 6.295114058880059e-07, + "loss": 0.0, + "num_input_tokens_seen": 72889192, + "step": 125630 + }, + { + "epoch": 18.712392016681562, + "grad_norm": 1.255077586392872e-05, + "learning_rate": 6.287870063755946e-07, + "loss": 0.0, + "num_input_tokens_seen": 72892072, + "step": 125635 + }, + { + "epoch": 18.71313672922252, + "grad_norm": 8.30523367767455e-06, + "learning_rate": 6.280630185900555e-07, + "loss": 0.0, + "num_input_tokens_seen": 72895080, + "step": 125640 + }, + { + "epoch": 18.71388144176348, + "grad_norm": 7.134355837479234e-05, + "learning_rate": 6.273394425436202e-07, + "loss": 0.0, + "num_input_tokens_seen": 72897928, + "step": 125645 + }, + { + "epoch": 18.714626154304437, + "grad_norm": 3.222387022105977e-06, + "learning_rate": 6.26616278248518e-07, + "loss": 0.0, + "num_input_tokens_seen": 72900904, + "step": 125650 + }, + { + "epoch": 18.715370866845397, + "grad_norm": 3.1970855616236804e-06, + "learning_rate": 6.258935257169557e-07, + "loss": 0.0, + "num_input_tokens_seen": 72903528, + "step": 125655 + }, + { + "epoch": 18.716115579386358, + "grad_norm": 0.00010563458636170253, + "learning_rate": 6.251711849611513e-07, + "loss": 0.0, + "num_input_tokens_seen": 72906536, + "step": 125660 + }, + { + "epoch": 18.716860291927315, + "grad_norm": 2.0584277081070468e-05, + "learning_rate": 6.244492559933063e-07, + "loss": 0.0, + "num_input_tokens_seen": 72909576, + "step": 125665 + }, + { + "epoch": 18.717605004468275, + "grad_norm": 2.2727112991560716e-06, + "learning_rate": 6.237277388256191e-07, + "loss": 0.0, + "num_input_tokens_seen": 72912456, + "step": 125670 + }, + { + "epoch": 18.718349717009236, + "grad_norm": 6.678042609564727e-06, + "learning_rate": 6.230066334702744e-07, + "loss": 0.0, + "num_input_tokens_seen": 72915144, + "step": 125675 + }, + { + "epoch": 18.719094429550193, + "grad_norm": 4.8910980694927275e-05, + "learning_rate": 6.22285939939457e-07, + "loss": 0.0, + "num_input_tokens_seen": 72917832, + "step": 125680 + }, + { + "epoch": 18.719839142091153, + "grad_norm": 3.880227723129792e-06, + "learning_rate": 6.215656582453433e-07, + "loss": 0.0, + "num_input_tokens_seen": 72920808, + "step": 125685 + }, + { + "epoch": 18.72058385463211, + "grad_norm": 4.19912703364389e-06, + "learning_rate": 6.208457884001012e-07, + "loss": 0.0, + "num_input_tokens_seen": 72923720, + "step": 125690 + }, + { + "epoch": 18.72132856717307, + "grad_norm": 5.939130915066926e-06, + "learning_rate": 6.201263304158905e-07, + "loss": 0.0, + "num_input_tokens_seen": 72926600, + "step": 125695 + }, + { + "epoch": 18.72207327971403, + "grad_norm": 6.953649517527083e-06, + "learning_rate": 6.194072843048681e-07, + "loss": 0.0, + "num_input_tokens_seen": 72929480, + "step": 125700 + }, + { + "epoch": 18.72281799225499, + "grad_norm": 3.6640010421251645e-06, + "learning_rate": 6.1868865007918e-07, + "loss": 0.0, + "num_input_tokens_seen": 72932584, + "step": 125705 + }, + { + "epoch": 18.72356270479595, + "grad_norm": 2.0549539840430953e-05, + "learning_rate": 6.179704277509662e-07, + "loss": 0.0, + "num_input_tokens_seen": 72935368, + "step": 125710 + }, + { + "epoch": 18.72430741733691, + "grad_norm": 6.566583124367753e-06, + "learning_rate": 6.172526173323617e-07, + "loss": 0.0, + "num_input_tokens_seen": 72938312, + "step": 125715 + }, + { + "epoch": 18.725052129877866, + "grad_norm": 0.0001028282567858696, + "learning_rate": 6.165352188354928e-07, + "loss": 0.0, + "num_input_tokens_seen": 72941512, + "step": 125720 + }, + { + "epoch": 18.725796842418827, + "grad_norm": 2.5800488856475567e-06, + "learning_rate": 6.158182322724804e-07, + "loss": 0.0, + "num_input_tokens_seen": 72944264, + "step": 125725 + }, + { + "epoch": 18.726541554959784, + "grad_norm": 8.1030280853156e-05, + "learning_rate": 6.151016576554341e-07, + "loss": 0.0, + "num_input_tokens_seen": 72947464, + "step": 125730 + }, + { + "epoch": 18.727286267500745, + "grad_norm": 0.0005752811557613313, + "learning_rate": 6.143854949964611e-07, + "loss": 0.0, + "num_input_tokens_seen": 72950248, + "step": 125735 + }, + { + "epoch": 18.728030980041705, + "grad_norm": 2.564696842455305e-06, + "learning_rate": 6.136697443076628e-07, + "loss": 0.0, + "num_input_tokens_seen": 72953096, + "step": 125740 + }, + { + "epoch": 18.728775692582662, + "grad_norm": 5.792568572360324e-06, + "learning_rate": 6.129544056011266e-07, + "loss": 0.0, + "num_input_tokens_seen": 72955784, + "step": 125745 + }, + { + "epoch": 18.729520405123623, + "grad_norm": 5.852624781255145e-06, + "learning_rate": 6.122394788889402e-07, + "loss": 0.0, + "num_input_tokens_seen": 72958600, + "step": 125750 + }, + { + "epoch": 18.730265117664583, + "grad_norm": 2.004327325266786e-05, + "learning_rate": 6.115249641831828e-07, + "loss": 0.0, + "num_input_tokens_seen": 72961544, + "step": 125755 + }, + { + "epoch": 18.73100983020554, + "grad_norm": 5.823747778777033e-05, + "learning_rate": 6.108108614959224e-07, + "loss": 0.0, + "num_input_tokens_seen": 72964552, + "step": 125760 + }, + { + "epoch": 18.7317545427465, + "grad_norm": 9.624189260648564e-06, + "learning_rate": 6.100971708392272e-07, + "loss": 0.0, + "num_input_tokens_seen": 72967688, + "step": 125765 + }, + { + "epoch": 18.732499255287458, + "grad_norm": 0.0002583228633739054, + "learning_rate": 6.093838922251488e-07, + "loss": 0.0, + "num_input_tokens_seen": 72970568, + "step": 125770 + }, + { + "epoch": 18.733243967828418, + "grad_norm": 7.961630217323545e-06, + "learning_rate": 6.086710256657413e-07, + "loss": 0.0, + "num_input_tokens_seen": 72973448, + "step": 125775 + }, + { + "epoch": 18.73398868036938, + "grad_norm": 7.398770776489982e-06, + "learning_rate": 6.079585711730451e-07, + "loss": 0.0, + "num_input_tokens_seen": 72976680, + "step": 125780 + }, + { + "epoch": 18.734733392910336, + "grad_norm": 3.041797072000918e-06, + "learning_rate": 6.072465287591005e-07, + "loss": 0.0, + "num_input_tokens_seen": 72979592, + "step": 125785 + }, + { + "epoch": 18.735478105451296, + "grad_norm": 2.0184897948638536e-06, + "learning_rate": 6.065348984359314e-07, + "loss": 0.0, + "num_input_tokens_seen": 72982632, + "step": 125790 + }, + { + "epoch": 18.736222817992257, + "grad_norm": 3.716042101586936e-06, + "learning_rate": 6.058236802155643e-07, + "loss": 0.0, + "num_input_tokens_seen": 72985512, + "step": 125795 + }, + { + "epoch": 18.736967530533214, + "grad_norm": 2.434833731967956e-06, + "learning_rate": 6.051128741100115e-07, + "loss": 0.0, + "num_input_tokens_seen": 72988136, + "step": 125800 + }, + { + "epoch": 18.737712243074174, + "grad_norm": 0.00021116113930474967, + "learning_rate": 6.044024801312831e-07, + "loss": 0.0, + "num_input_tokens_seen": 72991048, + "step": 125805 + }, + { + "epoch": 18.73845695561513, + "grad_norm": 8.276236258097924e-06, + "learning_rate": 6.036924982913805e-07, + "loss": 0.0, + "num_input_tokens_seen": 72993960, + "step": 125810 + }, + { + "epoch": 18.739201668156092, + "grad_norm": 2.3978818717296235e-06, + "learning_rate": 6.029829286022998e-07, + "loss": 0.0, + "num_input_tokens_seen": 72996840, + "step": 125815 + }, + { + "epoch": 18.739946380697052, + "grad_norm": 6.196438334882259e-06, + "learning_rate": 6.022737710760256e-07, + "loss": 0.0, + "num_input_tokens_seen": 73000040, + "step": 125820 + }, + { + "epoch": 18.74069109323801, + "grad_norm": 3.4036688703054097e-06, + "learning_rate": 6.015650257245348e-07, + "loss": 0.0, + "num_input_tokens_seen": 73002856, + "step": 125825 + }, + { + "epoch": 18.74143580577897, + "grad_norm": 0.00043856038246303797, + "learning_rate": 6.008566925598119e-07, + "loss": 0.0, + "num_input_tokens_seen": 73005896, + "step": 125830 + }, + { + "epoch": 18.742180518319927, + "grad_norm": 1.919543365147547e-06, + "learning_rate": 6.001487715938142e-07, + "loss": 0.0, + "num_input_tokens_seen": 73008744, + "step": 125835 + }, + { + "epoch": 18.742925230860887, + "grad_norm": 8.417892786383163e-06, + "learning_rate": 5.994412628385043e-07, + "loss": 0.0, + "num_input_tokens_seen": 73011624, + "step": 125840 + }, + { + "epoch": 18.743669943401848, + "grad_norm": 3.119726898148656e-06, + "learning_rate": 5.987341663058338e-07, + "loss": 0.0, + "num_input_tokens_seen": 73014440, + "step": 125845 + }, + { + "epoch": 18.744414655942805, + "grad_norm": 4.8877441258809995e-06, + "learning_rate": 5.980274820077514e-07, + "loss": 0.0, + "num_input_tokens_seen": 73017064, + "step": 125850 + }, + { + "epoch": 18.745159368483765, + "grad_norm": 2.666476120793959e-06, + "learning_rate": 5.97321209956192e-07, + "loss": 0.0, + "num_input_tokens_seen": 73019784, + "step": 125855 + }, + { + "epoch": 18.745904081024726, + "grad_norm": 5.624256573355524e-06, + "learning_rate": 5.966153501630877e-07, + "loss": 0.0, + "num_input_tokens_seen": 73022504, + "step": 125860 + }, + { + "epoch": 18.746648793565683, + "grad_norm": 0.004734338726848364, + "learning_rate": 5.95909902640368e-07, + "loss": 0.0, + "num_input_tokens_seen": 73025064, + "step": 125865 + }, + { + "epoch": 18.747393506106643, + "grad_norm": 2.706570376176387e-05, + "learning_rate": 5.952048673999427e-07, + "loss": 0.0, + "num_input_tokens_seen": 73027944, + "step": 125870 + }, + { + "epoch": 18.7481382186476, + "grad_norm": 8.552885446988512e-06, + "learning_rate": 5.945002444537329e-07, + "loss": 0.0, + "num_input_tokens_seen": 73030920, + "step": 125875 + }, + { + "epoch": 18.74888293118856, + "grad_norm": 0.0001031081992550753, + "learning_rate": 5.937960338136317e-07, + "loss": 0.0, + "num_input_tokens_seen": 73034088, + "step": 125880 + }, + { + "epoch": 18.74962764372952, + "grad_norm": 1.1178366548847407e-05, + "learning_rate": 5.930922354915436e-07, + "loss": 0.0, + "num_input_tokens_seen": 73037000, + "step": 125885 + }, + { + "epoch": 18.75037235627048, + "grad_norm": 0.05572271719574928, + "learning_rate": 5.923888494993562e-07, + "loss": 0.0, + "num_input_tokens_seen": 73039944, + "step": 125890 + }, + { + "epoch": 18.75111706881144, + "grad_norm": 0.0002714312286116183, + "learning_rate": 5.916858758489519e-07, + "loss": 0.0, + "num_input_tokens_seen": 73042664, + "step": 125895 + }, + { + "epoch": 18.7518617813524, + "grad_norm": 2.7172884529136354e-06, + "learning_rate": 5.90983314552207e-07, + "loss": 0.0, + "num_input_tokens_seen": 73045736, + "step": 125900 + }, + { + "epoch": 18.752606493893357, + "grad_norm": 2.7628916541289072e-06, + "learning_rate": 5.902811656209927e-07, + "loss": 0.0, + "num_input_tokens_seen": 73049224, + "step": 125905 + }, + { + "epoch": 18.753351206434317, + "grad_norm": 3.6309418192104204e-06, + "learning_rate": 5.895794290671691e-07, + "loss": 0.0, + "num_input_tokens_seen": 73052008, + "step": 125910 + }, + { + "epoch": 18.754095918975274, + "grad_norm": 2.097289325320162e-06, + "learning_rate": 5.888781049025877e-07, + "loss": 0.0, + "num_input_tokens_seen": 73054728, + "step": 125915 + }, + { + "epoch": 18.754840631516235, + "grad_norm": 2.8410190680006053e-06, + "learning_rate": 5.881771931391028e-07, + "loss": 0.0, + "num_input_tokens_seen": 73057224, + "step": 125920 + }, + { + "epoch": 18.755585344057195, + "grad_norm": 0.01203170232474804, + "learning_rate": 5.874766937885523e-07, + "loss": 0.0, + "num_input_tokens_seen": 73060232, + "step": 125925 + }, + { + "epoch": 18.756330056598152, + "grad_norm": 0.027306972071528435, + "learning_rate": 5.867766068627739e-07, + "loss": 0.0, + "num_input_tokens_seen": 73062952, + "step": 125930 + }, + { + "epoch": 18.757074769139113, + "grad_norm": 6.336966180242598e-06, + "learning_rate": 5.860769323735887e-07, + "loss": 0.0, + "num_input_tokens_seen": 73065640, + "step": 125935 + }, + { + "epoch": 18.757819481680073, + "grad_norm": 1.0067532457469497e-05, + "learning_rate": 5.853776703328207e-07, + "loss": 0.0, + "num_input_tokens_seen": 73068616, + "step": 125940 + }, + { + "epoch": 18.75856419422103, + "grad_norm": 6.448532803915441e-05, + "learning_rate": 5.846788207522852e-07, + "loss": 0.0, + "num_input_tokens_seen": 73071624, + "step": 125945 + }, + { + "epoch": 18.75930890676199, + "grad_norm": 2.5109720809268765e-06, + "learning_rate": 5.83980383643784e-07, + "loss": 0.0, + "num_input_tokens_seen": 73074632, + "step": 125950 + }, + { + "epoch": 18.760053619302948, + "grad_norm": 5.152393350726925e-05, + "learning_rate": 5.832823590191216e-07, + "loss": 0.0, + "num_input_tokens_seen": 73077672, + "step": 125955 + }, + { + "epoch": 18.760798331843908, + "grad_norm": 3.5287295759189874e-05, + "learning_rate": 5.825847468900858e-07, + "loss": 0.0, + "num_input_tokens_seen": 73080520, + "step": 125960 + }, + { + "epoch": 18.76154304438487, + "grad_norm": 2.2655092834611423e-05, + "learning_rate": 5.81887547268467e-07, + "loss": 0.0, + "num_input_tokens_seen": 73083240, + "step": 125965 + }, + { + "epoch": 18.762287756925826, + "grad_norm": 2.08757478503685e-06, + "learning_rate": 5.811907601660393e-07, + "loss": 0.0, + "num_input_tokens_seen": 73085992, + "step": 125970 + }, + { + "epoch": 18.763032469466786, + "grad_norm": 3.5399564239924075e-06, + "learning_rate": 5.804943855945738e-07, + "loss": 0.0, + "num_input_tokens_seen": 73088744, + "step": 125975 + }, + { + "epoch": 18.763777182007743, + "grad_norm": 2.8864758405688917e-06, + "learning_rate": 5.797984235658388e-07, + "loss": 0.0, + "num_input_tokens_seen": 73091304, + "step": 125980 + }, + { + "epoch": 18.764521894548704, + "grad_norm": 7.029077096376568e-06, + "learning_rate": 5.791028740915888e-07, + "loss": 0.0, + "num_input_tokens_seen": 73094216, + "step": 125985 + }, + { + "epoch": 18.765266607089664, + "grad_norm": 0.0006225245306268334, + "learning_rate": 5.784077371835756e-07, + "loss": 0.0, + "num_input_tokens_seen": 73097288, + "step": 125990 + }, + { + "epoch": 18.76601131963062, + "grad_norm": 3.218306119379122e-06, + "learning_rate": 5.777130128535396e-07, + "loss": 0.0, + "num_input_tokens_seen": 73100264, + "step": 125995 + }, + { + "epoch": 18.766756032171582, + "grad_norm": 3.883820681949146e-05, + "learning_rate": 5.770187011132244e-07, + "loss": 0.0, + "num_input_tokens_seen": 73103144, + "step": 126000 + }, + { + "epoch": 18.767500744712542, + "grad_norm": 2.7870521535078296e-06, + "learning_rate": 5.763248019743539e-07, + "loss": 0.0, + "num_input_tokens_seen": 73105896, + "step": 126005 + }, + { + "epoch": 18.7682454572535, + "grad_norm": 0.00014445620763581246, + "learning_rate": 5.756313154486547e-07, + "loss": 0.0, + "num_input_tokens_seen": 73108968, + "step": 126010 + }, + { + "epoch": 18.76899016979446, + "grad_norm": 5.327905000740429e-06, + "learning_rate": 5.7493824154784e-07, + "loss": 0.0, + "num_input_tokens_seen": 73111784, + "step": 126015 + }, + { + "epoch": 18.769734882335417, + "grad_norm": 1.3097181863486185e-06, + "learning_rate": 5.742455802836166e-07, + "loss": 0.0, + "num_input_tokens_seen": 73114536, + "step": 126020 + }, + { + "epoch": 18.770479594876377, + "grad_norm": 3.57520275429124e-06, + "learning_rate": 5.735533316676922e-07, + "loss": 0.0, + "num_input_tokens_seen": 73117480, + "step": 126025 + }, + { + "epoch": 18.771224307417338, + "grad_norm": 0.0003307458246126771, + "learning_rate": 5.728614957117573e-07, + "loss": 0.0, + "num_input_tokens_seen": 73120712, + "step": 126030 + }, + { + "epoch": 18.771969019958295, + "grad_norm": 1.9043590100409347e-06, + "learning_rate": 5.721700724274997e-07, + "loss": 0.0, + "num_input_tokens_seen": 73123976, + "step": 126035 + }, + { + "epoch": 18.772713732499255, + "grad_norm": 7.021808414719999e-06, + "learning_rate": 5.714790618266019e-07, + "loss": 0.0002, + "num_input_tokens_seen": 73127048, + "step": 126040 + }, + { + "epoch": 18.773458445040216, + "grad_norm": 7.837738849048037e-06, + "learning_rate": 5.707884639207406e-07, + "loss": 0.0, + "num_input_tokens_seen": 73129800, + "step": 126045 + }, + { + "epoch": 18.774203157581173, + "grad_norm": 2.4550316084059887e-05, + "learning_rate": 5.700982787215759e-07, + "loss": 0.0, + "num_input_tokens_seen": 73132744, + "step": 126050 + }, + { + "epoch": 18.774947870122134, + "grad_norm": 0.00030140826129354537, + "learning_rate": 5.694085062407705e-07, + "loss": 0.0, + "num_input_tokens_seen": 73136072, + "step": 126055 + }, + { + "epoch": 18.77569258266309, + "grad_norm": 0.014004010707139969, + "learning_rate": 5.687191464899821e-07, + "loss": 0.0, + "num_input_tokens_seen": 73139080, + "step": 126060 + }, + { + "epoch": 18.77643729520405, + "grad_norm": 8.515386070939712e-06, + "learning_rate": 5.680301994808485e-07, + "loss": 0.0, + "num_input_tokens_seen": 73141704, + "step": 126065 + }, + { + "epoch": 18.77718200774501, + "grad_norm": 1.0624135938996915e-05, + "learning_rate": 5.673416652250158e-07, + "loss": 0.0, + "num_input_tokens_seen": 73144904, + "step": 126070 + }, + { + "epoch": 18.77792672028597, + "grad_norm": 9.116622095461935e-06, + "learning_rate": 5.666535437341108e-07, + "loss": 0.0, + "num_input_tokens_seen": 73147656, + "step": 126075 + }, + { + "epoch": 18.77867143282693, + "grad_norm": 2.9410157367237844e-06, + "learning_rate": 5.659658350197661e-07, + "loss": 0.0, + "num_input_tokens_seen": 73150472, + "step": 126080 + }, + { + "epoch": 18.77941614536789, + "grad_norm": 6.209093044162728e-06, + "learning_rate": 5.652785390935889e-07, + "loss": 0.0, + "num_input_tokens_seen": 73153192, + "step": 126085 + }, + { + "epoch": 18.780160857908847, + "grad_norm": 2.3652567051613005e-06, + "learning_rate": 5.645916559672004e-07, + "loss": 0.0, + "num_input_tokens_seen": 73156168, + "step": 126090 + }, + { + "epoch": 18.780905570449807, + "grad_norm": 3.7331490148062585e-06, + "learning_rate": 5.639051856522026e-07, + "loss": 0.0, + "num_input_tokens_seen": 73158792, + "step": 126095 + }, + { + "epoch": 18.781650282990764, + "grad_norm": 6.537909939652309e-05, + "learning_rate": 5.63219128160189e-07, + "loss": 0.0, + "num_input_tokens_seen": 73161448, + "step": 126100 + }, + { + "epoch": 18.782394995531725, + "grad_norm": 3.734847268788144e-06, + "learning_rate": 5.625334835027502e-07, + "loss": 0.0, + "num_input_tokens_seen": 73164168, + "step": 126105 + }, + { + "epoch": 18.783139708072685, + "grad_norm": 0.0005585174076259136, + "learning_rate": 5.618482516914714e-07, + "loss": 0.0, + "num_input_tokens_seen": 73167176, + "step": 126110 + }, + { + "epoch": 18.783884420613642, + "grad_norm": 8.642605280329008e-06, + "learning_rate": 5.611634327379295e-07, + "loss": 0.0, + "num_input_tokens_seen": 73170024, + "step": 126115 + }, + { + "epoch": 18.784629133154603, + "grad_norm": 0.0009211668511852622, + "learning_rate": 5.60479026653693e-07, + "loss": 0.0, + "num_input_tokens_seen": 73172648, + "step": 126120 + }, + { + "epoch": 18.785373845695563, + "grad_norm": 9.819723345572129e-06, + "learning_rate": 5.59795033450325e-07, + "loss": 0.0, + "num_input_tokens_seen": 73175560, + "step": 126125 + }, + { + "epoch": 18.78611855823652, + "grad_norm": 2.641766286615166e-06, + "learning_rate": 5.591114531393771e-07, + "loss": 0.0, + "num_input_tokens_seen": 73178536, + "step": 126130 + }, + { + "epoch": 18.78686327077748, + "grad_norm": 3.832984930340899e-06, + "learning_rate": 5.584282857324014e-07, + "loss": 0.0, + "num_input_tokens_seen": 73181544, + "step": 126135 + }, + { + "epoch": 18.787607983318438, + "grad_norm": 7.639881914656144e-06, + "learning_rate": 5.577455312409413e-07, + "loss": 0.0, + "num_input_tokens_seen": 73184360, + "step": 126140 + }, + { + "epoch": 18.7883526958594, + "grad_norm": 5.6004464568104595e-06, + "learning_rate": 5.570631896765239e-07, + "loss": 0.0, + "num_input_tokens_seen": 73187048, + "step": 126145 + }, + { + "epoch": 18.78909740840036, + "grad_norm": 7.703333540121093e-05, + "learning_rate": 5.563812610506841e-07, + "loss": 0.0, + "num_input_tokens_seen": 73189928, + "step": 126150 + }, + { + "epoch": 18.789842120941316, + "grad_norm": 7.402818482660223e-06, + "learning_rate": 5.556997453749379e-07, + "loss": 0.0, + "num_input_tokens_seen": 73192744, + "step": 126155 + }, + { + "epoch": 18.790586833482276, + "grad_norm": 0.00033838627859950066, + "learning_rate": 5.550186426608039e-07, + "loss": 0.0, + "num_input_tokens_seen": 73195944, + "step": 126160 + }, + { + "epoch": 18.791331546023237, + "grad_norm": 4.5379249058896676e-05, + "learning_rate": 5.543379529197839e-07, + "loss": 0.0, + "num_input_tokens_seen": 73198888, + "step": 126165 + }, + { + "epoch": 18.792076258564194, + "grad_norm": 4.807884579349775e-06, + "learning_rate": 5.536576761633772e-07, + "loss": 0.0, + "num_input_tokens_seen": 73201928, + "step": 126170 + }, + { + "epoch": 18.792820971105154, + "grad_norm": 5.116631291457452e-05, + "learning_rate": 5.529778124030799e-07, + "loss": 0.0, + "num_input_tokens_seen": 73204904, + "step": 126175 + }, + { + "epoch": 18.79356568364611, + "grad_norm": 2.8338968149910215e-06, + "learning_rate": 5.522983616503746e-07, + "loss": 0.0, + "num_input_tokens_seen": 73207784, + "step": 126180 + }, + { + "epoch": 18.794310396187072, + "grad_norm": 0.0008329308475367725, + "learning_rate": 5.51619323916741e-07, + "loss": 0.0, + "num_input_tokens_seen": 73210568, + "step": 126185 + }, + { + "epoch": 18.795055108728032, + "grad_norm": 2.9580028240161482e-06, + "learning_rate": 5.509406992136479e-07, + "loss": 0.0, + "num_input_tokens_seen": 73213320, + "step": 126190 + }, + { + "epoch": 18.79579982126899, + "grad_norm": 6.811614730395377e-05, + "learning_rate": 5.502624875525664e-07, + "loss": 0.0, + "num_input_tokens_seen": 73215944, + "step": 126195 + }, + { + "epoch": 18.79654453380995, + "grad_norm": 3.867782197630731e-06, + "learning_rate": 5.495846889449485e-07, + "loss": 0.0, + "num_input_tokens_seen": 73219016, + "step": 126200 + }, + { + "epoch": 18.797289246350907, + "grad_norm": 1.6995205442071892e-05, + "learning_rate": 5.48907303402249e-07, + "loss": 0.0, + "num_input_tokens_seen": 73222216, + "step": 126205 + }, + { + "epoch": 18.798033958891867, + "grad_norm": 3.5470422972139204e-06, + "learning_rate": 5.48230330935906e-07, + "loss": 0.0, + "num_input_tokens_seen": 73225032, + "step": 126210 + }, + { + "epoch": 18.798778671432828, + "grad_norm": 8.917297964217141e-05, + "learning_rate": 5.475537715573631e-07, + "loss": 0.0, + "num_input_tokens_seen": 73227848, + "step": 126215 + }, + { + "epoch": 18.799523383973785, + "grad_norm": 0.001000408548861742, + "learning_rate": 5.468776252780472e-07, + "loss": 0.0, + "num_input_tokens_seen": 73230760, + "step": 126220 + }, + { + "epoch": 18.800268096514746, + "grad_norm": 2.6272582545061596e-06, + "learning_rate": 5.46201892109377e-07, + "loss": 0.0, + "num_input_tokens_seen": 73233768, + "step": 126225 + }, + { + "epoch": 18.801012809055706, + "grad_norm": 3.10255418298766e-05, + "learning_rate": 5.455265720627767e-07, + "loss": 0.0, + "num_input_tokens_seen": 73236552, + "step": 126230 + }, + { + "epoch": 18.801757521596663, + "grad_norm": 2.262692760268692e-06, + "learning_rate": 5.448516651496482e-07, + "loss": 0.0, + "num_input_tokens_seen": 73239528, + "step": 126235 + }, + { + "epoch": 18.802502234137624, + "grad_norm": 4.193991571810329e-06, + "learning_rate": 5.441771713813992e-07, + "loss": 0.0, + "num_input_tokens_seen": 73242216, + "step": 126240 + }, + { + "epoch": 18.80324694667858, + "grad_norm": 0.0045118811540305614, + "learning_rate": 5.435030907694149e-07, + "loss": 0.0, + "num_input_tokens_seen": 73245192, + "step": 126245 + }, + { + "epoch": 18.80399165921954, + "grad_norm": 1.1944960533583071e-05, + "learning_rate": 5.428294233250947e-07, + "loss": 0.0, + "num_input_tokens_seen": 73248104, + "step": 126250 + }, + { + "epoch": 18.8047363717605, + "grad_norm": 3.807506072917022e-05, + "learning_rate": 5.421561690598126e-07, + "loss": 0.0111, + "num_input_tokens_seen": 73250952, + "step": 126255 + }, + { + "epoch": 18.80548108430146, + "grad_norm": 7.264191026479239e-06, + "learning_rate": 5.414833279849429e-07, + "loss": 0.0, + "num_input_tokens_seen": 73253992, + "step": 126260 + }, + { + "epoch": 18.80622579684242, + "grad_norm": 6.326310995063977e-06, + "learning_rate": 5.408109001118544e-07, + "loss": 0.0, + "num_input_tokens_seen": 73256488, + "step": 126265 + }, + { + "epoch": 18.80697050938338, + "grad_norm": 1.5930609151837416e-05, + "learning_rate": 5.401388854519046e-07, + "loss": 0.0, + "num_input_tokens_seen": 73259208, + "step": 126270 + }, + { + "epoch": 18.807715221924337, + "grad_norm": 1.1841486411867663e-05, + "learning_rate": 5.394672840164511e-07, + "loss": 0.0, + "num_input_tokens_seen": 73261992, + "step": 126275 + }, + { + "epoch": 18.808459934465297, + "grad_norm": 1.277621413464658e-05, + "learning_rate": 5.387960958168375e-07, + "loss": 0.0, + "num_input_tokens_seen": 73265096, + "step": 126280 + }, + { + "epoch": 18.809204647006254, + "grad_norm": 4.632790023606503e-06, + "learning_rate": 5.381253208644021e-07, + "loss": 0.0, + "num_input_tokens_seen": 73267880, + "step": 126285 + }, + { + "epoch": 18.809949359547215, + "grad_norm": 0.0007533056195825338, + "learning_rate": 5.374549591704747e-07, + "loss": 0.0, + "num_input_tokens_seen": 73270760, + "step": 126290 + }, + { + "epoch": 18.810694072088175, + "grad_norm": 1.2927032003062777e-05, + "learning_rate": 5.367850107463879e-07, + "loss": 0.0, + "num_input_tokens_seen": 73273896, + "step": 126295 + }, + { + "epoch": 18.811438784629132, + "grad_norm": 0.00010792647663038224, + "learning_rate": 5.36115475603452e-07, + "loss": 0.0, + "num_input_tokens_seen": 73277128, + "step": 126300 + }, + { + "epoch": 18.812183497170093, + "grad_norm": 1.040216284309281e-05, + "learning_rate": 5.354463537529831e-07, + "loss": 0.0, + "num_input_tokens_seen": 73279976, + "step": 126305 + }, + { + "epoch": 18.812928209711053, + "grad_norm": 4.0256636566482484e-06, + "learning_rate": 5.347776452062831e-07, + "loss": 0.0, + "num_input_tokens_seen": 73282856, + "step": 126310 + }, + { + "epoch": 18.81367292225201, + "grad_norm": 3.4696429338509915e-06, + "learning_rate": 5.341093499746485e-07, + "loss": 0.0, + "num_input_tokens_seen": 73286024, + "step": 126315 + }, + { + "epoch": 18.81441763479297, + "grad_norm": 3.7506501939787995e-06, + "learning_rate": 5.334414680693705e-07, + "loss": 0.0, + "num_input_tokens_seen": 73288936, + "step": 126320 + }, + { + "epoch": 18.815162347333928, + "grad_norm": 0.0003908902290277183, + "learning_rate": 5.327739995017316e-07, + "loss": 0.0, + "num_input_tokens_seen": 73291912, + "step": 126325 + }, + { + "epoch": 18.81590705987489, + "grad_norm": 1.88786395938223e-06, + "learning_rate": 5.32106944283009e-07, + "loss": 0.0, + "num_input_tokens_seen": 73295048, + "step": 126330 + }, + { + "epoch": 18.81665177241585, + "grad_norm": 2.544924655012437e-06, + "learning_rate": 5.31440302424474e-07, + "loss": 0.0, + "num_input_tokens_seen": 73297736, + "step": 126335 + }, + { + "epoch": 18.817396484956806, + "grad_norm": 1.977353895199485e-05, + "learning_rate": 5.307740739373818e-07, + "loss": 0.0, + "num_input_tokens_seen": 73300648, + "step": 126340 + }, + { + "epoch": 18.818141197497766, + "grad_norm": 4.611163149093045e-06, + "learning_rate": 5.301082588329953e-07, + "loss": 0.0, + "num_input_tokens_seen": 73304040, + "step": 126345 + }, + { + "epoch": 18.818885910038723, + "grad_norm": 1.566001446917653e-05, + "learning_rate": 5.294428571225585e-07, + "loss": 0.0, + "num_input_tokens_seen": 73306792, + "step": 126350 + }, + { + "epoch": 18.819630622579684, + "grad_norm": 4.13708085034159e-06, + "learning_rate": 5.287778688173151e-07, + "loss": 0.0, + "num_input_tokens_seen": 73309672, + "step": 126355 + }, + { + "epoch": 18.820375335120644, + "grad_norm": 7.811810064595193e-06, + "learning_rate": 5.281132939284977e-07, + "loss": 0.0, + "num_input_tokens_seen": 73312808, + "step": 126360 + }, + { + "epoch": 18.8211200476616, + "grad_norm": 5.121689355291892e-06, + "learning_rate": 5.274491324673309e-07, + "loss": 0.0, + "num_input_tokens_seen": 73315624, + "step": 126365 + }, + { + "epoch": 18.821864760202562, + "grad_norm": 1.37667575472733e-05, + "learning_rate": 5.267853844450416e-07, + "loss": 0.0, + "num_input_tokens_seen": 73318632, + "step": 126370 + }, + { + "epoch": 18.822609472743522, + "grad_norm": 3.5697910334420158e-06, + "learning_rate": 5.261220498728403e-07, + "loss": 0.0, + "num_input_tokens_seen": 73321480, + "step": 126375 + }, + { + "epoch": 18.82335418528448, + "grad_norm": 0.0001406494266120717, + "learning_rate": 5.254591287619348e-07, + "loss": 0.0, + "num_input_tokens_seen": 73324232, + "step": 126380 + }, + { + "epoch": 18.82409889782544, + "grad_norm": 1.1946495760639664e-05, + "learning_rate": 5.247966211235161e-07, + "loss": 0.0, + "num_input_tokens_seen": 73327432, + "step": 126385 + }, + { + "epoch": 18.824843610366397, + "grad_norm": 1.6419606254203245e-05, + "learning_rate": 5.241345269687864e-07, + "loss": 0.0, + "num_input_tokens_seen": 73330408, + "step": 126390 + }, + { + "epoch": 18.825588322907358, + "grad_norm": 3.622281838033814e-06, + "learning_rate": 5.234728463089284e-07, + "loss": 0.0, + "num_input_tokens_seen": 73333064, + "step": 126395 + }, + { + "epoch": 18.826333035448318, + "grad_norm": 3.090838390562567e-06, + "learning_rate": 5.228115791551191e-07, + "loss": 0.0, + "num_input_tokens_seen": 73336104, + "step": 126400 + }, + { + "epoch": 18.827077747989275, + "grad_norm": 4.295867711334722e-06, + "learning_rate": 5.221507255185304e-07, + "loss": 0.0, + "num_input_tokens_seen": 73338824, + "step": 126405 + }, + { + "epoch": 18.827822460530236, + "grad_norm": 0.0008838183712214231, + "learning_rate": 5.214902854103282e-07, + "loss": 0.0, + "num_input_tokens_seen": 73341864, + "step": 126410 + }, + { + "epoch": 18.828567173071196, + "grad_norm": 0.0002647622022777796, + "learning_rate": 5.208302588416647e-07, + "loss": 0.0, + "num_input_tokens_seen": 73344648, + "step": 126415 + }, + { + "epoch": 18.829311885612153, + "grad_norm": 5.355482699087588e-06, + "learning_rate": 5.201706458236977e-07, + "loss": 0.0, + "num_input_tokens_seen": 73347848, + "step": 126420 + }, + { + "epoch": 18.830056598153114, + "grad_norm": 5.8463888308324385e-06, + "learning_rate": 5.195114463675682e-07, + "loss": 0.0, + "num_input_tokens_seen": 73350632, + "step": 126425 + }, + { + "epoch": 18.83080131069407, + "grad_norm": 4.148986135987798e-06, + "learning_rate": 5.188526604844118e-07, + "loss": 0.0, + "num_input_tokens_seen": 73353384, + "step": 126430 + }, + { + "epoch": 18.83154602323503, + "grad_norm": 4.7621178964618593e-05, + "learning_rate": 5.181942881853585e-07, + "loss": 0.0, + "num_input_tokens_seen": 73356456, + "step": 126435 + }, + { + "epoch": 18.83229073577599, + "grad_norm": 1.1519799954839982e-05, + "learning_rate": 5.1753632948153e-07, + "loss": 0.0, + "num_input_tokens_seen": 73359208, + "step": 126440 + }, + { + "epoch": 18.83303544831695, + "grad_norm": 2.97764358947461e-06, + "learning_rate": 5.168787843840423e-07, + "loss": 0.0, + "num_input_tokens_seen": 73362024, + "step": 126445 + }, + { + "epoch": 18.83378016085791, + "grad_norm": 7.39392708055675e-05, + "learning_rate": 5.162216529040004e-07, + "loss": 0.0, + "num_input_tokens_seen": 73364936, + "step": 126450 + }, + { + "epoch": 18.83452487339887, + "grad_norm": 2.338234708076925e-06, + "learning_rate": 5.155649350525149e-07, + "loss": 0.0, + "num_input_tokens_seen": 73367688, + "step": 126455 + }, + { + "epoch": 18.835269585939827, + "grad_norm": 4.967577297065873e-06, + "learning_rate": 5.149086308406742e-07, + "loss": 0.0, + "num_input_tokens_seen": 73370536, + "step": 126460 + }, + { + "epoch": 18.836014298480787, + "grad_norm": 2.7604057777352864e-06, + "learning_rate": 5.142527402795638e-07, + "loss": 0.0, + "num_input_tokens_seen": 73373160, + "step": 126465 + }, + { + "epoch": 18.836759011021744, + "grad_norm": 1.2782931662513874e-05, + "learning_rate": 5.135972633802694e-07, + "loss": 0.0, + "num_input_tokens_seen": 73376104, + "step": 126470 + }, + { + "epoch": 18.837503723562705, + "grad_norm": 4.597981387632899e-06, + "learning_rate": 5.129422001538597e-07, + "loss": 0.0, + "num_input_tokens_seen": 73379048, + "step": 126475 + }, + { + "epoch": 18.838248436103665, + "grad_norm": 1.3233781828603242e-05, + "learning_rate": 5.122875506114067e-07, + "loss": 0.0, + "num_input_tokens_seen": 73381736, + "step": 126480 + }, + { + "epoch": 18.838993148644622, + "grad_norm": 9.63579168455908e-06, + "learning_rate": 5.116333147639651e-07, + "loss": 0.0, + "num_input_tokens_seen": 73384616, + "step": 126485 + }, + { + "epoch": 18.839737861185583, + "grad_norm": 4.515178716246737e-06, + "learning_rate": 5.109794926225903e-07, + "loss": 0.0, + "num_input_tokens_seen": 73387496, + "step": 126490 + }, + { + "epoch": 18.84048257372654, + "grad_norm": 4.666598397307098e-05, + "learning_rate": 5.103260841983287e-07, + "loss": 0.0, + "num_input_tokens_seen": 73390344, + "step": 126495 + }, + { + "epoch": 18.8412272862675, + "grad_norm": 6.897483399370685e-05, + "learning_rate": 5.096730895022189e-07, + "loss": 0.0, + "num_input_tokens_seen": 73393160, + "step": 126500 + }, + { + "epoch": 18.84197199880846, + "grad_norm": 2.2507956600748003e-05, + "learning_rate": 5.090205085452909e-07, + "loss": 0.0, + "num_input_tokens_seen": 73395816, + "step": 126505 + }, + { + "epoch": 18.842716711349418, + "grad_norm": 2.96430835078354e-06, + "learning_rate": 5.083683413385665e-07, + "loss": 0.0, + "num_input_tokens_seen": 73398952, + "step": 126510 + }, + { + "epoch": 18.84346142389038, + "grad_norm": 0.0004792090621776879, + "learning_rate": 5.077165878930701e-07, + "loss": 0.0, + "num_input_tokens_seen": 73401960, + "step": 126515 + }, + { + "epoch": 18.84420613643134, + "grad_norm": 3.736974758794531e-05, + "learning_rate": 5.070652482198069e-07, + "loss": 0.0, + "num_input_tokens_seen": 73404648, + "step": 126520 + }, + { + "epoch": 18.844950848972296, + "grad_norm": 4.517398701864295e-05, + "learning_rate": 5.064143223297845e-07, + "loss": 0.0, + "num_input_tokens_seen": 73407272, + "step": 126525 + }, + { + "epoch": 18.845695561513256, + "grad_norm": 2.5703702704049647e-05, + "learning_rate": 5.057638102339945e-07, + "loss": 0.0, + "num_input_tokens_seen": 73410088, + "step": 126530 + }, + { + "epoch": 18.846440274054213, + "grad_norm": 8.231344509113114e-06, + "learning_rate": 5.051137119434362e-07, + "loss": 0.0, + "num_input_tokens_seen": 73412872, + "step": 126535 + }, + { + "epoch": 18.847184986595174, + "grad_norm": 2.923063402704429e-05, + "learning_rate": 5.044640274690815e-07, + "loss": 0.0, + "num_input_tokens_seen": 73415816, + "step": 126540 + }, + { + "epoch": 18.847929699136134, + "grad_norm": 7.604686834383756e-05, + "learning_rate": 5.038147568219131e-07, + "loss": 0.0, + "num_input_tokens_seen": 73418600, + "step": 126545 + }, + { + "epoch": 18.84867441167709, + "grad_norm": 6.6658772084338125e-06, + "learning_rate": 5.031659000128974e-07, + "loss": 0.0, + "num_input_tokens_seen": 73421448, + "step": 126550 + }, + { + "epoch": 18.849419124218052, + "grad_norm": 1.2748832887155004e-05, + "learning_rate": 5.02517457052995e-07, + "loss": 0.0, + "num_input_tokens_seen": 73424296, + "step": 126555 + }, + { + "epoch": 18.850163836759013, + "grad_norm": 1.911450453917496e-05, + "learning_rate": 5.018694279531638e-07, + "loss": 0.0, + "num_input_tokens_seen": 73427528, + "step": 126560 + }, + { + "epoch": 18.85090854929997, + "grad_norm": 3.3480262118246173e-06, + "learning_rate": 5.012218127243478e-07, + "loss": 0.0, + "num_input_tokens_seen": 73430504, + "step": 126565 + }, + { + "epoch": 18.85165326184093, + "grad_norm": 5.776605121354805e-06, + "learning_rate": 5.005746113774912e-07, + "loss": 0.0, + "num_input_tokens_seen": 73433384, + "step": 126570 + }, + { + "epoch": 18.852397974381887, + "grad_norm": 4.442357294465182e-06, + "learning_rate": 4.999278239235267e-07, + "loss": 0.0, + "num_input_tokens_seen": 73436264, + "step": 126575 + }, + { + "epoch": 18.853142686922848, + "grad_norm": 1.2239856005180627e-05, + "learning_rate": 4.992814503733817e-07, + "loss": 0.0, + "num_input_tokens_seen": 73439048, + "step": 126580 + }, + { + "epoch": 18.853887399463808, + "grad_norm": 1.9943647657783004e-06, + "learning_rate": 4.986354907379726e-07, + "loss": 0.0, + "num_input_tokens_seen": 73441672, + "step": 126585 + }, + { + "epoch": 18.854632112004765, + "grad_norm": 2.840181150531862e-05, + "learning_rate": 4.979899450282155e-07, + "loss": 0.0045, + "num_input_tokens_seen": 73444648, + "step": 126590 + }, + { + "epoch": 18.855376824545726, + "grad_norm": 2.2950960101297824e-06, + "learning_rate": 4.973448132550157e-07, + "loss": 0.0, + "num_input_tokens_seen": 73447720, + "step": 126595 + }, + { + "epoch": 18.856121537086686, + "grad_norm": 9.570661495672539e-05, + "learning_rate": 4.967000954292728e-07, + "loss": 0.0, + "num_input_tokens_seen": 73450920, + "step": 126600 + }, + { + "epoch": 18.856866249627643, + "grad_norm": 0.0013778251595795155, + "learning_rate": 4.96055791561878e-07, + "loss": 0.0, + "num_input_tokens_seen": 73453608, + "step": 126605 + }, + { + "epoch": 18.857610962168604, + "grad_norm": 2.4329683583346196e-06, + "learning_rate": 4.954119016637115e-07, + "loss": 0.0, + "num_input_tokens_seen": 73456968, + "step": 126610 + }, + { + "epoch": 18.85835567470956, + "grad_norm": 1.558240364829544e-05, + "learning_rate": 4.94768425745662e-07, + "loss": 0.0, + "num_input_tokens_seen": 73459848, + "step": 126615 + }, + { + "epoch": 18.85910038725052, + "grad_norm": 0.00011344814265612513, + "learning_rate": 4.9412536381859e-07, + "loss": 0.0, + "num_input_tokens_seen": 73462600, + "step": 126620 + }, + { + "epoch": 18.85984509979148, + "grad_norm": 2.0908008082187735e-05, + "learning_rate": 4.934827158933647e-07, + "loss": 0.0, + "num_input_tokens_seen": 73465576, + "step": 126625 + }, + { + "epoch": 18.86058981233244, + "grad_norm": 2.668384922799305e-06, + "learning_rate": 4.928404819808413e-07, + "loss": 0.0, + "num_input_tokens_seen": 73468232, + "step": 126630 + }, + { + "epoch": 18.8613345248734, + "grad_norm": 4.946629360347288e-06, + "learning_rate": 4.921986620918723e-07, + "loss": 0.0, + "num_input_tokens_seen": 73471208, + "step": 126635 + }, + { + "epoch": 18.86207923741436, + "grad_norm": 3.8454811146948487e-05, + "learning_rate": 4.915572562372961e-07, + "loss": 0.0, + "num_input_tokens_seen": 73474248, + "step": 126640 + }, + { + "epoch": 18.862823949955317, + "grad_norm": 6.194902653078316e-06, + "learning_rate": 4.909162644279486e-07, + "loss": 0.0, + "num_input_tokens_seen": 73476840, + "step": 126645 + }, + { + "epoch": 18.863568662496277, + "grad_norm": 8.962965694081504e-06, + "learning_rate": 4.902756866746627e-07, + "loss": 0.0, + "num_input_tokens_seen": 73479528, + "step": 126650 + }, + { + "epoch": 18.864313375037234, + "grad_norm": 2.0645728000090457e-05, + "learning_rate": 4.896355229882576e-07, + "loss": 0.0, + "num_input_tokens_seen": 73482472, + "step": 126655 + }, + { + "epoch": 18.865058087578195, + "grad_norm": 2.5514577828289475e-06, + "learning_rate": 4.889957733795525e-07, + "loss": 0.0, + "num_input_tokens_seen": 73485672, + "step": 126660 + }, + { + "epoch": 18.865802800119155, + "grad_norm": 6.937907983228797e-06, + "learning_rate": 4.883564378593497e-07, + "loss": 0.0, + "num_input_tokens_seen": 73488584, + "step": 126665 + }, + { + "epoch": 18.866547512660112, + "grad_norm": 0.0006776974769309163, + "learning_rate": 4.877175164384518e-07, + "loss": 0.0, + "num_input_tokens_seen": 73491720, + "step": 126670 + }, + { + "epoch": 18.867292225201073, + "grad_norm": 8.514286491845269e-06, + "learning_rate": 4.870790091276555e-07, + "loss": 0.0, + "num_input_tokens_seen": 73494664, + "step": 126675 + }, + { + "epoch": 18.868036937742033, + "grad_norm": 1.9502549548633397e-06, + "learning_rate": 4.864409159377415e-07, + "loss": 0.0, + "num_input_tokens_seen": 73497640, + "step": 126680 + }, + { + "epoch": 18.86878165028299, + "grad_norm": 1.0918918633251451e-05, + "learning_rate": 4.858032368794979e-07, + "loss": 0.0, + "num_input_tokens_seen": 73500264, + "step": 126685 + }, + { + "epoch": 18.86952636282395, + "grad_norm": 4.423707196110627e-06, + "learning_rate": 4.851659719636915e-07, + "loss": 0.0, + "num_input_tokens_seen": 73503144, + "step": 126690 + }, + { + "epoch": 18.870271075364908, + "grad_norm": 5.850741672475124e-06, + "learning_rate": 4.845291212010883e-07, + "loss": 0.0001, + "num_input_tokens_seen": 73506088, + "step": 126695 + }, + { + "epoch": 18.87101578790587, + "grad_norm": 3.279626616858877e-05, + "learning_rate": 4.838926846024522e-07, + "loss": 0.0, + "num_input_tokens_seen": 73508936, + "step": 126700 + }, + { + "epoch": 18.87176050044683, + "grad_norm": 1.5520072338404134e-05, + "learning_rate": 4.832566621785329e-07, + "loss": 0.0, + "num_input_tokens_seen": 73511688, + "step": 126705 + }, + { + "epoch": 18.872505212987786, + "grad_norm": 6.754578407708323e-06, + "learning_rate": 4.826210539400744e-07, + "loss": 0.0, + "num_input_tokens_seen": 73514632, + "step": 126710 + }, + { + "epoch": 18.873249925528746, + "grad_norm": 2.7709240839612903e-06, + "learning_rate": 4.819858598978127e-07, + "loss": 0.0, + "num_input_tokens_seen": 73517608, + "step": 126715 + }, + { + "epoch": 18.873994638069703, + "grad_norm": 1.961390125870821e-06, + "learning_rate": 4.81351080062481e-07, + "loss": 0.0, + "num_input_tokens_seen": 73520456, + "step": 126720 + }, + { + "epoch": 18.874739350610664, + "grad_norm": 2.7669464088830864e-06, + "learning_rate": 4.807167144448039e-07, + "loss": 0.0, + "num_input_tokens_seen": 73523400, + "step": 126725 + }, + { + "epoch": 18.875484063151625, + "grad_norm": 9.78275784291327e-06, + "learning_rate": 4.800827630554977e-07, + "loss": 0.0, + "num_input_tokens_seen": 73526088, + "step": 126730 + }, + { + "epoch": 18.87622877569258, + "grad_norm": 4.404255378176458e-06, + "learning_rate": 4.794492259052708e-07, + "loss": 0.0, + "num_input_tokens_seen": 73529096, + "step": 126735 + }, + { + "epoch": 18.876973488233542, + "grad_norm": 1.970332505152328e-06, + "learning_rate": 4.788161030048282e-07, + "loss": 0.0, + "num_input_tokens_seen": 73531912, + "step": 126740 + }, + { + "epoch": 18.877718200774503, + "grad_norm": 3.3589858503546566e-06, + "learning_rate": 4.781833943648672e-07, + "loss": 0.0, + "num_input_tokens_seen": 73534536, + "step": 126745 + }, + { + "epoch": 18.87846291331546, + "grad_norm": 2.9890783480368555e-05, + "learning_rate": 4.775510999960736e-07, + "loss": 0.0, + "num_input_tokens_seen": 73537480, + "step": 126750 + }, + { + "epoch": 18.87920762585642, + "grad_norm": 3.703682523337193e-05, + "learning_rate": 4.769192199091305e-07, + "loss": 0.0, + "num_input_tokens_seen": 73540520, + "step": 126755 + }, + { + "epoch": 18.879952338397377, + "grad_norm": 1.880886316030228e-06, + "learning_rate": 4.7628775411471536e-07, + "loss": 0.0, + "num_input_tokens_seen": 73543304, + "step": 126760 + }, + { + "epoch": 18.880697050938338, + "grad_norm": 2.9130189886927838e-06, + "learning_rate": 4.7565670262349207e-07, + "loss": 0.0, + "num_input_tokens_seen": 73546120, + "step": 126765 + }, + { + "epoch": 18.881441763479298, + "grad_norm": 2.3364369553746656e-05, + "learning_rate": 4.750260654461214e-07, + "loss": 0.0, + "num_input_tokens_seen": 73548968, + "step": 126770 + }, + { + "epoch": 18.882186476020255, + "grad_norm": 3.716963874467183e-06, + "learning_rate": 4.743958425932615e-07, + "loss": 0.0, + "num_input_tokens_seen": 73551848, + "step": 126775 + }, + { + "epoch": 18.882931188561216, + "grad_norm": 2.3671357212151634e-06, + "learning_rate": 4.737660340755595e-07, + "loss": 0.0, + "num_input_tokens_seen": 73555080, + "step": 126780 + }, + { + "epoch": 18.883675901102176, + "grad_norm": 7.155616003728937e-06, + "learning_rate": 4.731366399036485e-07, + "loss": 0.0107, + "num_input_tokens_seen": 73557896, + "step": 126785 + }, + { + "epoch": 18.884420613643133, + "grad_norm": 2.987721245517605e-06, + "learning_rate": 4.7250766008816726e-07, + "loss": 0.0, + "num_input_tokens_seen": 73561064, + "step": 126790 + }, + { + "epoch": 18.885165326184094, + "grad_norm": 3.093313716817647e-05, + "learning_rate": 4.7187909463974054e-07, + "loss": 0.0, + "num_input_tokens_seen": 73563976, + "step": 126795 + }, + { + "epoch": 18.88591003872505, + "grad_norm": 5.030558895668946e-06, + "learning_rate": 4.712509435689877e-07, + "loss": 0.0, + "num_input_tokens_seen": 73566984, + "step": 126800 + }, + { + "epoch": 18.88665475126601, + "grad_norm": 1.6388605217798613e-05, + "learning_rate": 4.706232068865196e-07, + "loss": 0.0882, + "num_input_tokens_seen": 73570280, + "step": 126805 + }, + { + "epoch": 18.88739946380697, + "grad_norm": 1.8137326378564467e-06, + "learning_rate": 4.6999588460294177e-07, + "loss": 0.0, + "num_input_tokens_seen": 73573192, + "step": 126810 + }, + { + "epoch": 18.88814417634793, + "grad_norm": 6.948308509890921e-06, + "learning_rate": 4.6936897672885117e-07, + "loss": 0.0, + "num_input_tokens_seen": 73576008, + "step": 126815 + }, + { + "epoch": 18.88888888888889, + "grad_norm": 5.388211775425589e-06, + "learning_rate": 4.6874248327484494e-07, + "loss": 0.0, + "num_input_tokens_seen": 73579240, + "step": 126820 + }, + { + "epoch": 18.88963360142985, + "grad_norm": 3.847542029689066e-05, + "learning_rate": 4.681164042514979e-07, + "loss": 0.0, + "num_input_tokens_seen": 73582184, + "step": 126825 + }, + { + "epoch": 18.890378313970807, + "grad_norm": 4.95827043778263e-06, + "learning_rate": 4.674907396693934e-07, + "loss": 0.0, + "num_input_tokens_seen": 73585224, + "step": 126830 + }, + { + "epoch": 18.891123026511767, + "grad_norm": 0.009544840082526207, + "learning_rate": 4.668654895390978e-07, + "loss": 0.0, + "num_input_tokens_seen": 73588168, + "step": 126835 + }, + { + "epoch": 18.891867739052724, + "grad_norm": 1.2161332051618956e-05, + "learning_rate": 4.66240653871175e-07, + "loss": 0.0, + "num_input_tokens_seen": 73590920, + "step": 126840 + }, + { + "epoch": 18.892612451593685, + "grad_norm": 4.5725213567493483e-05, + "learning_rate": 4.6561623267618037e-07, + "loss": 0.0, + "num_input_tokens_seen": 73593576, + "step": 126845 + }, + { + "epoch": 18.893357164134645, + "grad_norm": 0.00012426153989508748, + "learning_rate": 4.6499222596466386e-07, + "loss": 0.0, + "num_input_tokens_seen": 73596552, + "step": 126850 + }, + { + "epoch": 18.894101876675602, + "grad_norm": 0.00019091439025942236, + "learning_rate": 4.6436863374716976e-07, + "loss": 0.0, + "num_input_tokens_seen": 73599464, + "step": 126855 + }, + { + "epoch": 18.894846589216563, + "grad_norm": 3.856315288430778e-06, + "learning_rate": 4.6374545603423134e-07, + "loss": 0.0, + "num_input_tokens_seen": 73602312, + "step": 126860 + }, + { + "epoch": 18.89559130175752, + "grad_norm": 2.2801111754233716e-06, + "learning_rate": 4.6312269283637357e-07, + "loss": 0.0, + "num_input_tokens_seen": 73605128, + "step": 126865 + }, + { + "epoch": 18.89633601429848, + "grad_norm": 2.4924422632466303e-06, + "learning_rate": 4.6250034416411845e-07, + "loss": 0.0, + "num_input_tokens_seen": 73607976, + "step": 126870 + }, + { + "epoch": 18.89708072683944, + "grad_norm": 3.756530759346788e-06, + "learning_rate": 4.618784100279827e-07, + "loss": 0.0, + "num_input_tokens_seen": 73611016, + "step": 126875 + }, + { + "epoch": 18.897825439380398, + "grad_norm": 2.271025778100011e-06, + "learning_rate": 4.6125689043847453e-07, + "loss": 0.0, + "num_input_tokens_seen": 73613704, + "step": 126880 + }, + { + "epoch": 18.89857015192136, + "grad_norm": 3.4161232633778127e-06, + "learning_rate": 4.606357854060855e-07, + "loss": 0.0, + "num_input_tokens_seen": 73616552, + "step": 126885 + }, + { + "epoch": 18.89931486446232, + "grad_norm": 2.8617582756851334e-06, + "learning_rate": 4.6001509494131846e-07, + "loss": 0.0, + "num_input_tokens_seen": 73619176, + "step": 126890 + }, + { + "epoch": 18.900059577003276, + "grad_norm": 5.900838004890829e-06, + "learning_rate": 4.5939481905465655e-07, + "loss": 0.0, + "num_input_tokens_seen": 73622120, + "step": 126895 + }, + { + "epoch": 18.900804289544237, + "grad_norm": 2.0352147203084314e-06, + "learning_rate": 4.5877495775657476e-07, + "loss": 0.0, + "num_input_tokens_seen": 73625064, + "step": 126900 + }, + { + "epoch": 18.901549002085194, + "grad_norm": 1.7440390365663916e-05, + "learning_rate": 4.5815551105754804e-07, + "loss": 0.0, + "num_input_tokens_seen": 73627944, + "step": 126905 + }, + { + "epoch": 18.902293714626154, + "grad_norm": 3.5090128221781924e-05, + "learning_rate": 4.575364789680375e-07, + "loss": 0.0, + "num_input_tokens_seen": 73630760, + "step": 126910 + }, + { + "epoch": 18.903038427167115, + "grad_norm": 2.8780048069165787e-06, + "learning_rate": 4.5691786149850977e-07, + "loss": 0.0, + "num_input_tokens_seen": 73633480, + "step": 126915 + }, + { + "epoch": 18.90378313970807, + "grad_norm": 0.0001718544663162902, + "learning_rate": 4.562996586594037e-07, + "loss": 0.0, + "num_input_tokens_seen": 73636584, + "step": 126920 + }, + { + "epoch": 18.904527852249032, + "grad_norm": 2.3824341042200103e-05, + "learning_rate": 4.5568187046117484e-07, + "loss": 0.0, + "num_input_tokens_seen": 73639272, + "step": 126925 + }, + { + "epoch": 18.905272564789993, + "grad_norm": 4.180702944722725e-06, + "learning_rate": 4.550644969142537e-07, + "loss": 0.0, + "num_input_tokens_seen": 73642408, + "step": 126930 + }, + { + "epoch": 18.90601727733095, + "grad_norm": 7.644461584277451e-05, + "learning_rate": 4.544475380290708e-07, + "loss": 0.0, + "num_input_tokens_seen": 73645640, + "step": 126935 + }, + { + "epoch": 18.90676198987191, + "grad_norm": 5.667862296832027e-06, + "learning_rate": 4.538309938160512e-07, + "loss": 0.0, + "num_input_tokens_seen": 73648424, + "step": 126940 + }, + { + "epoch": 18.907506702412867, + "grad_norm": 2.442024424453848e-06, + "learning_rate": 4.532148642856088e-07, + "loss": 0.0, + "num_input_tokens_seen": 73651432, + "step": 126945 + }, + { + "epoch": 18.908251414953828, + "grad_norm": 3.973731509177014e-06, + "learning_rate": 4.5259914944815184e-07, + "loss": 0.0, + "num_input_tokens_seen": 73654216, + "step": 126950 + }, + { + "epoch": 18.908996127494788, + "grad_norm": 1.8435324591337121e-06, + "learning_rate": 4.519838493140832e-07, + "loss": 0.0, + "num_input_tokens_seen": 73657064, + "step": 126955 + }, + { + "epoch": 18.909740840035745, + "grad_norm": 3.2988434668368427e-06, + "learning_rate": 4.513689638938001e-07, + "loss": 0.0, + "num_input_tokens_seen": 73660008, + "step": 126960 + }, + { + "epoch": 18.910485552576706, + "grad_norm": 1.2060293556714896e-05, + "learning_rate": 4.507544931976887e-07, + "loss": 0.0, + "num_input_tokens_seen": 73662856, + "step": 126965 + }, + { + "epoch": 18.911230265117666, + "grad_norm": 2.192435749748256e-05, + "learning_rate": 4.501404372361295e-07, + "loss": 0.0, + "num_input_tokens_seen": 73665672, + "step": 126970 + }, + { + "epoch": 18.911974977658623, + "grad_norm": 8.403232641285285e-06, + "learning_rate": 4.495267960194921e-07, + "loss": 0.0, + "num_input_tokens_seen": 73668584, + "step": 126975 + }, + { + "epoch": 18.912719690199584, + "grad_norm": 1.882599917735206e-06, + "learning_rate": 4.4891356955815145e-07, + "loss": 0.0, + "num_input_tokens_seen": 73671464, + "step": 126980 + }, + { + "epoch": 18.91346440274054, + "grad_norm": 6.467809726018459e-06, + "learning_rate": 4.483007578624632e-07, + "loss": 0.0, + "num_input_tokens_seen": 73674088, + "step": 126985 + }, + { + "epoch": 18.9142091152815, + "grad_norm": 4.085125965502812e-06, + "learning_rate": 4.476883609427773e-07, + "loss": 0.0, + "num_input_tokens_seen": 73677000, + "step": 126990 + }, + { + "epoch": 18.914953827822462, + "grad_norm": 3.6700639611808583e-06, + "learning_rate": 4.4707637880944675e-07, + "loss": 0.0063, + "num_input_tokens_seen": 73679784, + "step": 126995 + }, + { + "epoch": 18.91569854036342, + "grad_norm": 0.0003512442053761333, + "learning_rate": 4.4646481147280206e-07, + "loss": 0.0, + "num_input_tokens_seen": 73682600, + "step": 127000 + }, + { + "epoch": 18.91644325290438, + "grad_norm": 5.026738654123619e-05, + "learning_rate": 4.458536589431822e-07, + "loss": 0.0, + "num_input_tokens_seen": 73685288, + "step": 127005 + }, + { + "epoch": 18.917187965445336, + "grad_norm": 1.8192935385741293e-05, + "learning_rate": 4.4524292123090673e-07, + "loss": 0.0, + "num_input_tokens_seen": 73688008, + "step": 127010 + }, + { + "epoch": 18.917932677986297, + "grad_norm": 2.5976139568228973e-06, + "learning_rate": 4.4463259834630066e-07, + "loss": 0.0, + "num_input_tokens_seen": 73691048, + "step": 127015 + }, + { + "epoch": 18.918677390527257, + "grad_norm": 3.6879864637739956e-06, + "learning_rate": 4.440226902996669e-07, + "loss": 0.0, + "num_input_tokens_seen": 73693864, + "step": 127020 + }, + { + "epoch": 18.919422103068214, + "grad_norm": 0.00010111535084433854, + "learning_rate": 4.4341319710131115e-07, + "loss": 0.0, + "num_input_tokens_seen": 73696488, + "step": 127025 + }, + { + "epoch": 18.920166815609175, + "grad_norm": 4.88266050524544e-06, + "learning_rate": 4.428041187615306e-07, + "loss": 0.0, + "num_input_tokens_seen": 73699112, + "step": 127030 + }, + { + "epoch": 18.920911528150135, + "grad_norm": 1.8296017287866562e-06, + "learning_rate": 4.421954552906199e-07, + "loss": 0.0, + "num_input_tokens_seen": 73701960, + "step": 127035 + }, + { + "epoch": 18.921656240691092, + "grad_norm": 3.745450840142439e-06, + "learning_rate": 4.415872066988541e-07, + "loss": 0.0, + "num_input_tokens_seen": 73704872, + "step": 127040 + }, + { + "epoch": 18.922400953232053, + "grad_norm": 1.3048073924437631e-05, + "learning_rate": 4.4097937299651115e-07, + "loss": 0.0, + "num_input_tokens_seen": 73707944, + "step": 127045 + }, + { + "epoch": 18.92314566577301, + "grad_norm": 367.91107177734375, + "learning_rate": 4.4037195419386336e-07, + "loss": 0.0173, + "num_input_tokens_seen": 73711016, + "step": 127050 + }, + { + "epoch": 18.92389037831397, + "grad_norm": 8.777617949817795e-06, + "learning_rate": 4.3976495030116915e-07, + "loss": 0.0, + "num_input_tokens_seen": 73713672, + "step": 127055 + }, + { + "epoch": 18.92463509085493, + "grad_norm": 3.336507916174014e-06, + "learning_rate": 4.3915836132868426e-07, + "loss": 0.0, + "num_input_tokens_seen": 73716488, + "step": 127060 + }, + { + "epoch": 18.925379803395888, + "grad_norm": 7.734011887805536e-06, + "learning_rate": 4.3855218728665883e-07, + "loss": 0.0001, + "num_input_tokens_seen": 73719112, + "step": 127065 + }, + { + "epoch": 18.92612451593685, + "grad_norm": 3.1873892112344038e-06, + "learning_rate": 4.3794642818532905e-07, + "loss": 0.0, + "num_input_tokens_seen": 73721992, + "step": 127070 + }, + { + "epoch": 18.92686922847781, + "grad_norm": 3.5452849260764197e-05, + "learning_rate": 4.3734108403493125e-07, + "loss": 0.0, + "num_input_tokens_seen": 73724840, + "step": 127075 + }, + { + "epoch": 18.927613941018766, + "grad_norm": 4.0925278881331906e-06, + "learning_rate": 4.3673615484568776e-07, + "loss": 0.0, + "num_input_tokens_seen": 73727688, + "step": 127080 + }, + { + "epoch": 18.928358653559727, + "grad_norm": 1.6730966763134347e-06, + "learning_rate": 4.3613164062782653e-07, + "loss": 0.0, + "num_input_tokens_seen": 73730280, + "step": 127085 + }, + { + "epoch": 18.929103366100684, + "grad_norm": 1.8046297327600769e-06, + "learning_rate": 4.3552754139155327e-07, + "loss": 0.0, + "num_input_tokens_seen": 73733160, + "step": 127090 + }, + { + "epoch": 18.929848078641644, + "grad_norm": 2.923183956227149e-06, + "learning_rate": 4.3492385714707927e-07, + "loss": 0.0, + "num_input_tokens_seen": 73735880, + "step": 127095 + }, + { + "epoch": 18.930592791182605, + "grad_norm": 2.946303538919892e-05, + "learning_rate": 4.343205879045964e-07, + "loss": 0.0, + "num_input_tokens_seen": 73738888, + "step": 127100 + }, + { + "epoch": 18.93133750372356, + "grad_norm": 1.3459228284773417e-05, + "learning_rate": 4.3371773367429924e-07, + "loss": 0.0, + "num_input_tokens_seen": 73741544, + "step": 127105 + }, + { + "epoch": 18.932082216264522, + "grad_norm": 7.135722989914939e-05, + "learning_rate": 4.331152944663769e-07, + "loss": 0.0, + "num_input_tokens_seen": 73744296, + "step": 127110 + }, + { + "epoch": 18.932826928805483, + "grad_norm": 5.796730692964047e-05, + "learning_rate": 4.3251327029099897e-07, + "loss": 0.0, + "num_input_tokens_seen": 73747112, + "step": 127115 + }, + { + "epoch": 18.93357164134644, + "grad_norm": 0.0008267146185971797, + "learning_rate": 4.319116611583407e-07, + "loss": 0.0, + "num_input_tokens_seen": 73750120, + "step": 127120 + }, + { + "epoch": 18.9343163538874, + "grad_norm": 1.5666219042032026e-05, + "learning_rate": 4.3131046707856613e-07, + "loss": 0.0, + "num_input_tokens_seen": 73753096, + "step": 127125 + }, + { + "epoch": 18.935061066428357, + "grad_norm": 4.897422968497267e-06, + "learning_rate": 4.307096880618311e-07, + "loss": 0.0, + "num_input_tokens_seen": 73756008, + "step": 127130 + }, + { + "epoch": 18.935805778969318, + "grad_norm": 2.413348738627974e-06, + "learning_rate": 4.30109324118283e-07, + "loss": 0.0, + "num_input_tokens_seen": 73758856, + "step": 127135 + }, + { + "epoch": 18.93655049151028, + "grad_norm": 2.0176480575173628e-06, + "learning_rate": 4.295093752580664e-07, + "loss": 0.0, + "num_input_tokens_seen": 73762056, + "step": 127140 + }, + { + "epoch": 18.937295204051235, + "grad_norm": 2.480338707755436e-06, + "learning_rate": 4.289098414913206e-07, + "loss": 0.0, + "num_input_tokens_seen": 73765128, + "step": 127145 + }, + { + "epoch": 18.938039916592196, + "grad_norm": 2.3885151676950045e-05, + "learning_rate": 4.283107228281652e-07, + "loss": 0.0, + "num_input_tokens_seen": 73768008, + "step": 127150 + }, + { + "epoch": 18.938784629133156, + "grad_norm": 0.0005291304551064968, + "learning_rate": 4.277120192787282e-07, + "loss": 0.0, + "num_input_tokens_seen": 73770760, + "step": 127155 + }, + { + "epoch": 18.939529341674113, + "grad_norm": 4.9368250074621756e-06, + "learning_rate": 4.271137308531237e-07, + "loss": 0.0, + "num_input_tokens_seen": 73773992, + "step": 127160 + }, + { + "epoch": 18.940274054215074, + "grad_norm": 1.082285052689258e-05, + "learning_rate": 4.265158575614575e-07, + "loss": 0.0, + "num_input_tokens_seen": 73777128, + "step": 127165 + }, + { + "epoch": 18.94101876675603, + "grad_norm": 2.3592833713337313e-06, + "learning_rate": 4.259183994138299e-07, + "loss": 0.0, + "num_input_tokens_seen": 73780456, + "step": 127170 + }, + { + "epoch": 18.94176347929699, + "grad_norm": 4.057179921801435e-06, + "learning_rate": 4.2532135642033565e-07, + "loss": 0.0, + "num_input_tokens_seen": 73783592, + "step": 127175 + }, + { + "epoch": 18.942508191837952, + "grad_norm": 1.633024658076465e-05, + "learning_rate": 4.2472472859105827e-07, + "loss": 0.0, + "num_input_tokens_seen": 73786696, + "step": 127180 + }, + { + "epoch": 18.94325290437891, + "grad_norm": 2.2363105927070137e-06, + "learning_rate": 4.241285159360814e-07, + "loss": 0.0, + "num_input_tokens_seen": 73789640, + "step": 127185 + }, + { + "epoch": 18.94399761691987, + "grad_norm": 3.4080921977874823e-06, + "learning_rate": 4.235327184654747e-07, + "loss": 0.0, + "num_input_tokens_seen": 73792456, + "step": 127190 + }, + { + "epoch": 18.94474232946083, + "grad_norm": 3.1861159186519217e-06, + "learning_rate": 4.229373361893024e-07, + "loss": 0.0, + "num_input_tokens_seen": 73795304, + "step": 127195 + }, + { + "epoch": 18.945487042001787, + "grad_norm": 9.514911653241143e-06, + "learning_rate": 4.223423691176287e-07, + "loss": 0.0, + "num_input_tokens_seen": 73798376, + "step": 127200 + }, + { + "epoch": 18.946231754542747, + "grad_norm": 1.5268986317096278e-05, + "learning_rate": 4.2174781726049826e-07, + "loss": 0.0, + "num_input_tokens_seen": 73801032, + "step": 127205 + }, + { + "epoch": 18.946976467083704, + "grad_norm": 3.7194326978351455e-06, + "learning_rate": 4.2115368062796147e-07, + "loss": 0.0, + "num_input_tokens_seen": 73803912, + "step": 127210 + }, + { + "epoch": 18.947721179624665, + "grad_norm": 8.661736501380801e-06, + "learning_rate": 4.205599592300491e-07, + "loss": 0.0, + "num_input_tokens_seen": 73806696, + "step": 127215 + }, + { + "epoch": 18.948465892165625, + "grad_norm": 2.4414491690549767e-06, + "learning_rate": 4.199666530767948e-07, + "loss": 0.0, + "num_input_tokens_seen": 73809736, + "step": 127220 + }, + { + "epoch": 18.949210604706582, + "grad_norm": 6.475695954577532e-06, + "learning_rate": 4.19373762178224e-07, + "loss": 0.0, + "num_input_tokens_seen": 73812904, + "step": 127225 + }, + { + "epoch": 18.949955317247543, + "grad_norm": 2.5665572138677817e-06, + "learning_rate": 4.187812865443508e-07, + "loss": 0.0, + "num_input_tokens_seen": 73815624, + "step": 127230 + }, + { + "epoch": 18.9507000297885, + "grad_norm": 4.004669790447224e-06, + "learning_rate": 4.1818922618518386e-07, + "loss": 0.0, + "num_input_tokens_seen": 73818568, + "step": 127235 + }, + { + "epoch": 18.95144474232946, + "grad_norm": 6.217543614184251e-06, + "learning_rate": 4.175975811107263e-07, + "loss": 0.0, + "num_input_tokens_seen": 73821352, + "step": 127240 + }, + { + "epoch": 18.95218945487042, + "grad_norm": 2.8818690225307364e-06, + "learning_rate": 4.17006351330973e-07, + "loss": 0.0, + "num_input_tokens_seen": 73824232, + "step": 127245 + }, + { + "epoch": 18.952934167411378, + "grad_norm": 5.175052592676366e-06, + "learning_rate": 4.164155368559103e-07, + "loss": 0.0, + "num_input_tokens_seen": 73827080, + "step": 127250 + }, + { + "epoch": 18.95367887995234, + "grad_norm": 5.048825642006705e-06, + "learning_rate": 4.1582513769552467e-07, + "loss": 0.0, + "num_input_tokens_seen": 73829960, + "step": 127255 + }, + { + "epoch": 18.9544235924933, + "grad_norm": 0.00017803952505346388, + "learning_rate": 4.1523515385978317e-07, + "loss": 0.0, + "num_input_tokens_seen": 73832776, + "step": 127260 + }, + { + "epoch": 18.955168305034256, + "grad_norm": 2.4443936581519665e-06, + "learning_rate": 4.1464558535866117e-07, + "loss": 0.0, + "num_input_tokens_seen": 73835528, + "step": 127265 + }, + { + "epoch": 18.955913017575217, + "grad_norm": 6.409962225006893e-06, + "learning_rate": 4.140564322021145e-07, + "loss": 0.0, + "num_input_tokens_seen": 73838472, + "step": 127270 + }, + { + "epoch": 18.956657730116174, + "grad_norm": 3.7479509046534076e-05, + "learning_rate": 4.1346769440009094e-07, + "loss": 0.0, + "num_input_tokens_seen": 73841352, + "step": 127275 + }, + { + "epoch": 18.957402442657134, + "grad_norm": 3.0530702588293934e-06, + "learning_rate": 4.12879371962549e-07, + "loss": 0.0, + "num_input_tokens_seen": 73844392, + "step": 127280 + }, + { + "epoch": 18.958147155198095, + "grad_norm": 2.7814585337182507e-06, + "learning_rate": 4.1229146489941416e-07, + "loss": 0.0, + "num_input_tokens_seen": 73847432, + "step": 127285 + }, + { + "epoch": 18.95889186773905, + "grad_norm": 0.00014691680553369224, + "learning_rate": 4.1170397322063125e-07, + "loss": 0.0, + "num_input_tokens_seen": 73850408, + "step": 127290 + }, + { + "epoch": 18.959636580280012, + "grad_norm": 6.2240665101853665e-06, + "learning_rate": 4.111168969361173e-07, + "loss": 0.0, + "num_input_tokens_seen": 73853480, + "step": 127295 + }, + { + "epoch": 18.960381292820973, + "grad_norm": 2.9211867513367906e-05, + "learning_rate": 4.1053023605579223e-07, + "loss": 0.0, + "num_input_tokens_seen": 73856456, + "step": 127300 + }, + { + "epoch": 18.96112600536193, + "grad_norm": 3.4879474242188735e-06, + "learning_rate": 4.0994399058956743e-07, + "loss": 0.0, + "num_input_tokens_seen": 73859112, + "step": 127305 + }, + { + "epoch": 18.96187071790289, + "grad_norm": 4.717552201327635e-06, + "learning_rate": 4.0935816054734343e-07, + "loss": 0.0, + "num_input_tokens_seen": 73862088, + "step": 127310 + }, + { + "epoch": 18.962615430443847, + "grad_norm": 8.594514838478062e-06, + "learning_rate": 4.0877274593902335e-07, + "loss": 0.0, + "num_input_tokens_seen": 73864936, + "step": 127315 + }, + { + "epoch": 18.963360142984808, + "grad_norm": 2.9135471777408384e-06, + "learning_rate": 4.0818774677449377e-07, + "loss": 0.0, + "num_input_tokens_seen": 73868104, + "step": 127320 + }, + { + "epoch": 18.96410485552577, + "grad_norm": 2.5187430310325e-06, + "learning_rate": 4.0760316306363844e-07, + "loss": 0.0, + "num_input_tokens_seen": 73871080, + "step": 127325 + }, + { + "epoch": 18.964849568066725, + "grad_norm": 2.6034897473437013e-06, + "learning_rate": 4.0701899481633277e-07, + "loss": 0.0, + "num_input_tokens_seen": 73873832, + "step": 127330 + }, + { + "epoch": 18.965594280607686, + "grad_norm": 1.0691213901736774e-05, + "learning_rate": 4.0643524204244665e-07, + "loss": 0.0, + "num_input_tokens_seen": 73876520, + "step": 127335 + }, + { + "epoch": 18.966338993148646, + "grad_norm": 1.9194851574866334e-06, + "learning_rate": 4.0585190475184166e-07, + "loss": 0.0, + "num_input_tokens_seen": 73879496, + "step": 127340 + }, + { + "epoch": 18.967083705689603, + "grad_norm": 4.2693386603787076e-06, + "learning_rate": 4.05268982954371e-07, + "loss": 0.0, + "num_input_tokens_seen": 73882248, + "step": 127345 + }, + { + "epoch": 18.967828418230564, + "grad_norm": 1.3984973520564381e-05, + "learning_rate": 4.0468647665988513e-07, + "loss": 0.0, + "num_input_tokens_seen": 73885000, + "step": 127350 + }, + { + "epoch": 18.96857313077152, + "grad_norm": 1.9427277948125266e-05, + "learning_rate": 4.041043858782234e-07, + "loss": 0.0002, + "num_input_tokens_seen": 73888040, + "step": 127355 + }, + { + "epoch": 18.96931784331248, + "grad_norm": 1.3822075743519235e-05, + "learning_rate": 4.0352271061921966e-07, + "loss": 0.0, + "num_input_tokens_seen": 73890856, + "step": 127360 + }, + { + "epoch": 18.970062555853442, + "grad_norm": 3.4168090223829495e-06, + "learning_rate": 4.0294145089270205e-07, + "loss": 0.0, + "num_input_tokens_seen": 73893896, + "step": 127365 + }, + { + "epoch": 18.9708072683944, + "grad_norm": 1.680727109487634e-05, + "learning_rate": 4.0236060670848783e-07, + "loss": 0.0, + "num_input_tokens_seen": 73896616, + "step": 127370 + }, + { + "epoch": 18.97155198093536, + "grad_norm": 6.936462341400329e-06, + "learning_rate": 4.0178017807639136e-07, + "loss": 0.0, + "num_input_tokens_seen": 73899656, + "step": 127375 + }, + { + "epoch": 18.972296693476316, + "grad_norm": 3.865328380925348e-06, + "learning_rate": 4.012001650062186e-07, + "loss": 0.0, + "num_input_tokens_seen": 73902696, + "step": 127380 + }, + { + "epoch": 18.973041406017277, + "grad_norm": 0.00022331014042720199, + "learning_rate": 4.0062056750776734e-07, + "loss": 0.0, + "num_input_tokens_seen": 73905480, + "step": 127385 + }, + { + "epoch": 18.973786118558237, + "grad_norm": 2.2988531782175414e-05, + "learning_rate": 4.000413855908297e-07, + "loss": 0.0, + "num_input_tokens_seen": 73908264, + "step": 127390 + }, + { + "epoch": 18.974530831099194, + "grad_norm": 5.270244400890078e-06, + "learning_rate": 3.9946261926519233e-07, + "loss": 0.0, + "num_input_tokens_seen": 73911272, + "step": 127395 + }, + { + "epoch": 18.975275543640155, + "grad_norm": 4.7469688979617786e-06, + "learning_rate": 3.9888426854063075e-07, + "loss": 0.0, + "num_input_tokens_seen": 73914376, + "step": 127400 + }, + { + "epoch": 18.976020256181116, + "grad_norm": 3.2320976970368065e-06, + "learning_rate": 3.9830633342691494e-07, + "loss": 0.0, + "num_input_tokens_seen": 73917064, + "step": 127405 + }, + { + "epoch": 18.976764968722073, + "grad_norm": 4.475357854971662e-06, + "learning_rate": 3.9772881393380923e-07, + "loss": 0.0, + "num_input_tokens_seen": 73920232, + "step": 127410 + }, + { + "epoch": 18.977509681263033, + "grad_norm": 3.072836079809349e-06, + "learning_rate": 3.9715171007107256e-07, + "loss": 0.0, + "num_input_tokens_seen": 73922952, + "step": 127415 + }, + { + "epoch": 18.97825439380399, + "grad_norm": 1.9292785509605892e-05, + "learning_rate": 3.9657502184844983e-07, + "loss": 0.0, + "num_input_tokens_seen": 73925768, + "step": 127420 + }, + { + "epoch": 18.97899910634495, + "grad_norm": 0.00023669937218073756, + "learning_rate": 3.959987492756889e-07, + "loss": 0.0, + "num_input_tokens_seen": 73928616, + "step": 127425 + }, + { + "epoch": 18.97974381888591, + "grad_norm": 2.6534673907008255e-06, + "learning_rate": 3.9542289236252363e-07, + "loss": 0.0, + "num_input_tokens_seen": 73931496, + "step": 127430 + }, + { + "epoch": 18.980488531426868, + "grad_norm": 4.59880766356946e-06, + "learning_rate": 3.948474511186767e-07, + "loss": 0.0, + "num_input_tokens_seen": 73934504, + "step": 127435 + }, + { + "epoch": 18.98123324396783, + "grad_norm": 4.054400051245466e-06, + "learning_rate": 3.9427242555387935e-07, + "loss": 0.0, + "num_input_tokens_seen": 73937704, + "step": 127440 + }, + { + "epoch": 18.98197795650879, + "grad_norm": 3.374151128809899e-05, + "learning_rate": 3.936978156778376e-07, + "loss": 0.0, + "num_input_tokens_seen": 73940680, + "step": 127445 + }, + { + "epoch": 18.982722669049746, + "grad_norm": 5.537337528949138e-06, + "learning_rate": 3.9312362150026594e-07, + "loss": 0.0, + "num_input_tokens_seen": 73943592, + "step": 127450 + }, + { + "epoch": 18.983467381590707, + "grad_norm": 4.1124026211036835e-06, + "learning_rate": 3.925498430308594e-07, + "loss": 0.0, + "num_input_tokens_seen": 73946184, + "step": 127455 + }, + { + "epoch": 18.984212094131664, + "grad_norm": 8.114977390505373e-05, + "learning_rate": 3.91976480279313e-07, + "loss": 0.0, + "num_input_tokens_seen": 73948840, + "step": 127460 + }, + { + "epoch": 18.984956806672624, + "grad_norm": 5.79707602810231e-06, + "learning_rate": 3.914035332553162e-07, + "loss": 0.0, + "num_input_tokens_seen": 73951528, + "step": 127465 + }, + { + "epoch": 18.985701519213585, + "grad_norm": 7.737398846074939e-05, + "learning_rate": 3.9083100196854183e-07, + "loss": 0.0, + "num_input_tokens_seen": 73954312, + "step": 127470 + }, + { + "epoch": 18.98644623175454, + "grad_norm": 0.0006781897973269224, + "learning_rate": 3.9025888642866827e-07, + "loss": 0.0, + "num_input_tokens_seen": 73957000, + "step": 127475 + }, + { + "epoch": 18.987190944295502, + "grad_norm": 1.9469543985906057e-06, + "learning_rate": 3.896871866453572e-07, + "loss": 0.0, + "num_input_tokens_seen": 73960104, + "step": 127480 + }, + { + "epoch": 18.987935656836463, + "grad_norm": 2.6196199542027898e-05, + "learning_rate": 3.891159026282704e-07, + "loss": 0.0, + "num_input_tokens_seen": 73962632, + "step": 127485 + }, + { + "epoch": 18.98868036937742, + "grad_norm": 3.940684109693393e-06, + "learning_rate": 3.885450343870556e-07, + "loss": 0.0, + "num_input_tokens_seen": 73965704, + "step": 127490 + }, + { + "epoch": 18.98942508191838, + "grad_norm": 0.0005628751823678613, + "learning_rate": 3.8797458193135793e-07, + "loss": 0.0, + "num_input_tokens_seen": 73968776, + "step": 127495 + }, + { + "epoch": 18.990169794459337, + "grad_norm": 0.00015467722550965846, + "learning_rate": 3.8740454527081693e-07, + "loss": 0.0, + "num_input_tokens_seen": 73971624, + "step": 127500 + }, + { + "epoch": 18.990914507000298, + "grad_norm": 0.0007934033637866378, + "learning_rate": 3.8683492441506097e-07, + "loss": 0.0, + "num_input_tokens_seen": 73974504, + "step": 127505 + }, + { + "epoch": 18.99165921954126, + "grad_norm": 4.034522589790868e-06, + "learning_rate": 3.862657193737129e-07, + "loss": 0.0, + "num_input_tokens_seen": 73977736, + "step": 127510 + }, + { + "epoch": 18.992403932082215, + "grad_norm": 2.7344412956153974e-06, + "learning_rate": 3.856969301563873e-07, + "loss": 0.0, + "num_input_tokens_seen": 73980648, + "step": 127515 + }, + { + "epoch": 18.993148644623176, + "grad_norm": 5.0300808652536944e-05, + "learning_rate": 3.8512855677269586e-07, + "loss": 0.0, + "num_input_tokens_seen": 73983560, + "step": 127520 + }, + { + "epoch": 18.993893357164133, + "grad_norm": 1.121880723076174e-05, + "learning_rate": 3.845605992322393e-07, + "loss": 0.0, + "num_input_tokens_seen": 73986376, + "step": 127525 + }, + { + "epoch": 18.994638069705093, + "grad_norm": 2.1904854747845093e-06, + "learning_rate": 3.8399305754461546e-07, + "loss": 0.0, + "num_input_tokens_seen": 73989288, + "step": 127530 + }, + { + "epoch": 18.995382782246054, + "grad_norm": 8.016856554604601e-06, + "learning_rate": 3.834259317194083e-07, + "loss": 0.0, + "num_input_tokens_seen": 73992104, + "step": 127535 + }, + { + "epoch": 18.99612749478701, + "grad_norm": 0.000964353617746383, + "learning_rate": 3.828592217662047e-07, + "loss": 0.0, + "num_input_tokens_seen": 73995080, + "step": 127540 + }, + { + "epoch": 18.99687220732797, + "grad_norm": 8.021333087526727e-06, + "learning_rate": 3.8229292769457193e-07, + "loss": 0.0, + "num_input_tokens_seen": 73997800, + "step": 127545 + }, + { + "epoch": 18.997616919868932, + "grad_norm": 4.925188022752991e-06, + "learning_rate": 3.8172704951408013e-07, + "loss": 0.0, + "num_input_tokens_seen": 74000520, + "step": 127550 + }, + { + "epoch": 18.99836163240989, + "grad_norm": 1.891732949843572e-06, + "learning_rate": 3.811615872342883e-07, + "loss": 0.0, + "num_input_tokens_seen": 74003368, + "step": 127555 + }, + { + "epoch": 18.99910634495085, + "grad_norm": 9.915389455272816e-06, + "learning_rate": 3.805965408647527e-07, + "loss": 0.0, + "num_input_tokens_seen": 74006088, + "step": 127560 + }, + { + "epoch": 18.999851057491806, + "grad_norm": 9.838044206844643e-06, + "learning_rate": 3.8003191041501575e-07, + "loss": 0.0, + "num_input_tokens_seen": 74009000, + "step": 127565 + }, + { + "epoch": 19.0, + "eval_loss": 3.815547227859497, + "eval_runtime": 51.5336, + "eval_samples_per_second": 57.904, + "eval_steps_per_second": 14.476, + "num_input_tokens_seen": 74009160, + "step": 127566 + }, + { + "epoch": 19.000595770032767, + "grad_norm": 4.634096057998249e-06, + "learning_rate": 3.794676958946142e-07, + "loss": 0.0, + "num_input_tokens_seen": 74011400, + "step": 127570 + }, + { + "epoch": 19.001340482573728, + "grad_norm": 2.288994210175588e-06, + "learning_rate": 3.7890389731308486e-07, + "loss": 0.0, + "num_input_tokens_seen": 74014120, + "step": 127575 + }, + { + "epoch": 19.002085195114685, + "grad_norm": 4.9093287088908255e-06, + "learning_rate": 3.783405146799479e-07, + "loss": 0.0, + "num_input_tokens_seen": 74016744, + "step": 127580 + }, + { + "epoch": 19.002829907655645, + "grad_norm": 9.277795470552519e-06, + "learning_rate": 3.777775480047263e-07, + "loss": 0.0, + "num_input_tokens_seen": 74019464, + "step": 127585 + }, + { + "epoch": 19.003574620196606, + "grad_norm": 6.727700565534178e-06, + "learning_rate": 3.772149972969291e-07, + "loss": 0.0, + "num_input_tokens_seen": 74022568, + "step": 127590 + }, + { + "epoch": 19.004319332737563, + "grad_norm": 5.284116014081519e-06, + "learning_rate": 3.76652862566057e-07, + "loss": 0.0, + "num_input_tokens_seen": 74025320, + "step": 127595 + }, + { + "epoch": 19.005064045278523, + "grad_norm": 8.577504559070803e-06, + "learning_rate": 3.7609114382160803e-07, + "loss": 0.0, + "num_input_tokens_seen": 74028168, + "step": 127600 + }, + { + "epoch": 19.00580875781948, + "grad_norm": 2.5734661903697997e-06, + "learning_rate": 3.7552984107307177e-07, + "loss": 0.0, + "num_input_tokens_seen": 74031048, + "step": 127605 + }, + { + "epoch": 19.00655347036044, + "grad_norm": 1.84082928171847e-05, + "learning_rate": 3.7496895432993505e-07, + "loss": 0.0, + "num_input_tokens_seen": 74033736, + "step": 127610 + }, + { + "epoch": 19.0072981829014, + "grad_norm": 4.349210939835757e-06, + "learning_rate": 3.7440848360166813e-07, + "loss": 0.0, + "num_input_tokens_seen": 74036360, + "step": 127615 + }, + { + "epoch": 19.008042895442358, + "grad_norm": 2.1078603822388686e-06, + "learning_rate": 3.738484288977412e-07, + "loss": 0.0, + "num_input_tokens_seen": 74038952, + "step": 127620 + }, + { + "epoch": 19.00878760798332, + "grad_norm": 2.744872290350031e-06, + "learning_rate": 3.7328879022761886e-07, + "loss": 0.0, + "num_input_tokens_seen": 74041576, + "step": 127625 + }, + { + "epoch": 19.00953232052428, + "grad_norm": 5.636547939502634e-06, + "learning_rate": 3.7272956760075197e-07, + "loss": 0.0, + "num_input_tokens_seen": 74044488, + "step": 127630 + }, + { + "epoch": 19.010277033065236, + "grad_norm": 9.786905138753355e-06, + "learning_rate": 3.7217076102658845e-07, + "loss": 0.0, + "num_input_tokens_seen": 74047176, + "step": 127635 + }, + { + "epoch": 19.011021745606197, + "grad_norm": 6.795907211198937e-06, + "learning_rate": 3.7161237051456796e-07, + "loss": 0.0, + "num_input_tokens_seen": 74050024, + "step": 127640 + }, + { + "epoch": 19.011766458147154, + "grad_norm": 2.1200685296207666e-05, + "learning_rate": 3.710543960741275e-07, + "loss": 0.0, + "num_input_tokens_seen": 74052904, + "step": 127645 + }, + { + "epoch": 19.012511170688114, + "grad_norm": 9.191233402816579e-06, + "learning_rate": 3.7049683771468723e-07, + "loss": 0.0, + "num_input_tokens_seen": 74055592, + "step": 127650 + }, + { + "epoch": 19.013255883229075, + "grad_norm": 1.8327124053030275e-05, + "learning_rate": 3.6993969544567575e-07, + "loss": 0.0, + "num_input_tokens_seen": 74058760, + "step": 127655 + }, + { + "epoch": 19.01400059577003, + "grad_norm": 6.358267000905471e-06, + "learning_rate": 3.693829692764966e-07, + "loss": 0.0, + "num_input_tokens_seen": 74061864, + "step": 127660 + }, + { + "epoch": 19.014745308310992, + "grad_norm": 2.6914860427496023e-05, + "learning_rate": 3.68826659216559e-07, + "loss": 0.0, + "num_input_tokens_seen": 74064712, + "step": 127665 + }, + { + "epoch": 19.015490020851953, + "grad_norm": 1.136917489930056e-05, + "learning_rate": 3.682707652752637e-07, + "loss": 0.0, + "num_input_tokens_seen": 74067496, + "step": 127670 + }, + { + "epoch": 19.01623473339291, + "grad_norm": 1.1885374988196418e-05, + "learning_rate": 3.677152874619949e-07, + "loss": 0.0, + "num_input_tokens_seen": 74070376, + "step": 127675 + }, + { + "epoch": 19.01697944593387, + "grad_norm": 1.4363284208229743e-05, + "learning_rate": 3.671602257861451e-07, + "loss": 0.0, + "num_input_tokens_seen": 74073384, + "step": 127680 + }, + { + "epoch": 19.017724158474827, + "grad_norm": 2.4664739157742588e-06, + "learning_rate": 3.666055802570845e-07, + "loss": 0.0, + "num_input_tokens_seen": 74076424, + "step": 127685 + }, + { + "epoch": 19.018468871015788, + "grad_norm": 1.7681981262285262e-05, + "learning_rate": 3.6605135088418895e-07, + "loss": 0.0, + "num_input_tokens_seen": 74078952, + "step": 127690 + }, + { + "epoch": 19.01921358355675, + "grad_norm": 7.521979114244459e-06, + "learning_rate": 3.654975376768205e-07, + "loss": 0.0, + "num_input_tokens_seen": 74082312, + "step": 127695 + }, + { + "epoch": 19.019958296097705, + "grad_norm": 2.5380825263709994e-06, + "learning_rate": 3.649441406443327e-07, + "loss": 0.0, + "num_input_tokens_seen": 74085384, + "step": 127700 + }, + { + "epoch": 19.020703008638666, + "grad_norm": 3.824524173978716e-05, + "learning_rate": 3.643911597960736e-07, + "loss": 0.0, + "num_input_tokens_seen": 74088200, + "step": 127705 + }, + { + "epoch": 19.021447721179623, + "grad_norm": 1.7068152374122292e-05, + "learning_rate": 3.6383859514138864e-07, + "loss": 0.0, + "num_input_tokens_seen": 74090888, + "step": 127710 + }, + { + "epoch": 19.022192433720583, + "grad_norm": 4.100836576981237e-06, + "learning_rate": 3.6328644668961187e-07, + "loss": 0.0, + "num_input_tokens_seen": 74093736, + "step": 127715 + }, + { + "epoch": 19.022937146261544, + "grad_norm": 7.645016921742354e-06, + "learning_rate": 3.6273471445006923e-07, + "loss": 0.0, + "num_input_tokens_seen": 74096680, + "step": 127720 + }, + { + "epoch": 19.0236818588025, + "grad_norm": 0.0014775650342926383, + "learning_rate": 3.621833984320838e-07, + "loss": 0.0, + "num_input_tokens_seen": 74099720, + "step": 127725 + }, + { + "epoch": 19.02442657134346, + "grad_norm": 0.00011884622654179111, + "learning_rate": 3.616324986449676e-07, + "loss": 0.0, + "num_input_tokens_seen": 74102664, + "step": 127730 + }, + { + "epoch": 19.025171283884422, + "grad_norm": 0.023002905771136284, + "learning_rate": 3.6108201509803263e-07, + "loss": 0.0, + "num_input_tokens_seen": 74105640, + "step": 127735 + }, + { + "epoch": 19.02591599642538, + "grad_norm": 6.635349109274102e-06, + "learning_rate": 3.605319478005714e-07, + "loss": 0.0, + "num_input_tokens_seen": 74108424, + "step": 127740 + }, + { + "epoch": 19.02666070896634, + "grad_norm": 1.221903494297294e-05, + "learning_rate": 3.599822967618849e-07, + "loss": 0.0, + "num_input_tokens_seen": 74111400, + "step": 127745 + }, + { + "epoch": 19.027405421507297, + "grad_norm": 2.1820785605086712e-06, + "learning_rate": 3.594330619912517e-07, + "loss": 0.0, + "num_input_tokens_seen": 74114344, + "step": 127750 + }, + { + "epoch": 19.028150134048257, + "grad_norm": 2.1173973436816595e-06, + "learning_rate": 3.5888424349795615e-07, + "loss": 0.0, + "num_input_tokens_seen": 74117288, + "step": 127755 + }, + { + "epoch": 19.028894846589218, + "grad_norm": 5.066460744274082e-06, + "learning_rate": 3.5833584129126574e-07, + "loss": 0.0, + "num_input_tokens_seen": 74120168, + "step": 127760 + }, + { + "epoch": 19.029639559130175, + "grad_norm": 1.5967101489877678e-06, + "learning_rate": 3.5778785538044255e-07, + "loss": 0.0, + "num_input_tokens_seen": 74123368, + "step": 127765 + }, + { + "epoch": 19.030384271671135, + "grad_norm": 0.00015441504365298897, + "learning_rate": 3.572402857747542e-07, + "loss": 0.0, + "num_input_tokens_seen": 74126248, + "step": 127770 + }, + { + "epoch": 19.031128984212096, + "grad_norm": 2.1740827378380345e-06, + "learning_rate": 3.566931324834405e-07, + "loss": 0.0, + "num_input_tokens_seen": 74129320, + "step": 127775 + }, + { + "epoch": 19.031873696753053, + "grad_norm": 0.003589038271456957, + "learning_rate": 3.5614639551575235e-07, + "loss": 0.0, + "num_input_tokens_seen": 74132200, + "step": 127780 + }, + { + "epoch": 19.032618409294013, + "grad_norm": 3.2149673643289134e-06, + "learning_rate": 3.5560007488092404e-07, + "loss": 0.0, + "num_input_tokens_seen": 74134952, + "step": 127785 + }, + { + "epoch": 19.03336312183497, + "grad_norm": 2.1022924556746148e-05, + "learning_rate": 3.5505417058818437e-07, + "loss": 0.0, + "num_input_tokens_seen": 74137576, + "step": 127790 + }, + { + "epoch": 19.03410783437593, + "grad_norm": 3.3294525110250106e-06, + "learning_rate": 3.545086826467592e-07, + "loss": 0.0, + "num_input_tokens_seen": 74140360, + "step": 127795 + }, + { + "epoch": 19.03485254691689, + "grad_norm": 0.0006010639481246471, + "learning_rate": 3.5396361106585787e-07, + "loss": 0.0, + "num_input_tokens_seen": 74143592, + "step": 127800 + }, + { + "epoch": 19.035597259457848, + "grad_norm": 0.00023157290706876665, + "learning_rate": 3.534189558546924e-07, + "loss": 0.0, + "num_input_tokens_seen": 74146600, + "step": 127805 + }, + { + "epoch": 19.03634197199881, + "grad_norm": 8.277493179775774e-05, + "learning_rate": 3.5287471702246386e-07, + "loss": 0.0, + "num_input_tokens_seen": 74149512, + "step": 127810 + }, + { + "epoch": 19.03708668453977, + "grad_norm": 2.1443975128931925e-06, + "learning_rate": 3.5233089457837045e-07, + "loss": 0.0, + "num_input_tokens_seen": 74152680, + "step": 127815 + }, + { + "epoch": 19.037831397080726, + "grad_norm": 2.3152206267695874e-05, + "learning_rate": 3.517874885315936e-07, + "loss": 0.0, + "num_input_tokens_seen": 74155528, + "step": 127820 + }, + { + "epoch": 19.038576109621687, + "grad_norm": 3.538830060278997e-05, + "learning_rate": 3.5124449889131495e-07, + "loss": 0.0, + "num_input_tokens_seen": 74158472, + "step": 127825 + }, + { + "epoch": 19.039320822162644, + "grad_norm": 0.00018244131933897734, + "learning_rate": 3.5070192566671046e-07, + "loss": 0.0, + "num_input_tokens_seen": 74161352, + "step": 127830 + }, + { + "epoch": 19.040065534703604, + "grad_norm": 2.132644658559002e-05, + "learning_rate": 3.5015976886694226e-07, + "loss": 0.0, + "num_input_tokens_seen": 74164456, + "step": 127835 + }, + { + "epoch": 19.040810247244565, + "grad_norm": 0.018161555752158165, + "learning_rate": 3.496180285011724e-07, + "loss": 0.0, + "num_input_tokens_seen": 74167464, + "step": 127840 + }, + { + "epoch": 19.041554959785522, + "grad_norm": 1.2452757800929248e-05, + "learning_rate": 3.49076704578552e-07, + "loss": 0.0, + "num_input_tokens_seen": 74170408, + "step": 127845 + }, + { + "epoch": 19.042299672326482, + "grad_norm": 0.00041696825064718723, + "learning_rate": 3.4853579710822923e-07, + "loss": 0.0, + "num_input_tokens_seen": 74173160, + "step": 127850 + }, + { + "epoch": 19.043044384867443, + "grad_norm": 1.4823100173089188e-05, + "learning_rate": 3.4799530609933575e-07, + "loss": 0.0, + "num_input_tokens_seen": 74176136, + "step": 127855 + }, + { + "epoch": 19.0437890974084, + "grad_norm": 4.366304892755579e-06, + "learning_rate": 3.474552315610086e-07, + "loss": 0.0, + "num_input_tokens_seen": 74178856, + "step": 127860 + }, + { + "epoch": 19.04453380994936, + "grad_norm": 7.847593224141747e-06, + "learning_rate": 3.4691557350236827e-07, + "loss": 0.0, + "num_input_tokens_seen": 74181960, + "step": 127865 + }, + { + "epoch": 19.045278522490317, + "grad_norm": 9.13215808395762e-06, + "learning_rate": 3.4637633193253525e-07, + "loss": 0.0, + "num_input_tokens_seen": 74185000, + "step": 127870 + }, + { + "epoch": 19.046023235031278, + "grad_norm": 5.6873336689022835e-06, + "learning_rate": 3.458375068606162e-07, + "loss": 0.0, + "num_input_tokens_seen": 74188008, + "step": 127875 + }, + { + "epoch": 19.04676794757224, + "grad_norm": 5.258007149677724e-06, + "learning_rate": 3.4529909829571494e-07, + "loss": 0.0, + "num_input_tokens_seen": 74190856, + "step": 127880 + }, + { + "epoch": 19.047512660113195, + "grad_norm": 2.044975644821534e-06, + "learning_rate": 3.447611062469269e-07, + "loss": 0.0, + "num_input_tokens_seen": 74193480, + "step": 127885 + }, + { + "epoch": 19.048257372654156, + "grad_norm": 0.0009935441194102168, + "learning_rate": 3.442235307233449e-07, + "loss": 0.0, + "num_input_tokens_seen": 74196328, + "step": 127890 + }, + { + "epoch": 19.049002085195113, + "grad_norm": 0.0003107813827227801, + "learning_rate": 3.4368637173404494e-07, + "loss": 0.0, + "num_input_tokens_seen": 74199048, + "step": 127895 + }, + { + "epoch": 19.049746797736073, + "grad_norm": 1.8844957594410516e-06, + "learning_rate": 3.4314962928810315e-07, + "loss": 0.0, + "num_input_tokens_seen": 74201960, + "step": 127900 + }, + { + "epoch": 19.050491510277034, + "grad_norm": 2.7100240913568996e-06, + "learning_rate": 3.426133033945872e-07, + "loss": 0.0, + "num_input_tokens_seen": 74204680, + "step": 127905 + }, + { + "epoch": 19.05123622281799, + "grad_norm": 6.713375114486553e-06, + "learning_rate": 3.420773940625621e-07, + "loss": 0.0, + "num_input_tokens_seen": 74207656, + "step": 127910 + }, + { + "epoch": 19.05198093535895, + "grad_norm": 2.138049467248493e-06, + "learning_rate": 3.415419013010762e-07, + "loss": 0.0, + "num_input_tokens_seen": 74210568, + "step": 127915 + }, + { + "epoch": 19.052725647899912, + "grad_norm": 3.2030825423134957e-06, + "learning_rate": 3.410068251191806e-07, + "loss": 0.0, + "num_input_tokens_seen": 74213512, + "step": 127920 + }, + { + "epoch": 19.05347036044087, + "grad_norm": 5.7293429563287646e-05, + "learning_rate": 3.4047216552590687e-07, + "loss": 0.0, + "num_input_tokens_seen": 74216488, + "step": 127925 + }, + { + "epoch": 19.05421507298183, + "grad_norm": 2.4671155642863596e-06, + "learning_rate": 3.399379225302979e-07, + "loss": 0.0, + "num_input_tokens_seen": 74219432, + "step": 127930 + }, + { + "epoch": 19.054959785522787, + "grad_norm": 7.132161044864915e-06, + "learning_rate": 3.3940409614137135e-07, + "loss": 0.0, + "num_input_tokens_seen": 74221960, + "step": 127935 + }, + { + "epoch": 19.055704498063747, + "grad_norm": 8.967240319179837e-06, + "learning_rate": 3.3887068636815346e-07, + "loss": 0.0, + "num_input_tokens_seen": 74224776, + "step": 127940 + }, + { + "epoch": 19.056449210604708, + "grad_norm": 1.271046403417131e-05, + "learning_rate": 3.3833769321964527e-07, + "loss": 0.0, + "num_input_tokens_seen": 74227912, + "step": 127945 + }, + { + "epoch": 19.057193923145665, + "grad_norm": 3.01780050904199e-06, + "learning_rate": 3.378051167048618e-07, + "loss": 0.0, + "num_input_tokens_seen": 74230664, + "step": 127950 + }, + { + "epoch": 19.057938635686625, + "grad_norm": 2.3879904347268166e-06, + "learning_rate": 3.3727295683279314e-07, + "loss": 0.0, + "num_input_tokens_seen": 74233576, + "step": 127955 + }, + { + "epoch": 19.058683348227586, + "grad_norm": 2.9377927148743765e-06, + "learning_rate": 3.367412136124321e-07, + "loss": 0.0, + "num_input_tokens_seen": 74236296, + "step": 127960 + }, + { + "epoch": 19.059428060768543, + "grad_norm": 2.8338938591332408e-06, + "learning_rate": 3.3620988705276023e-07, + "loss": 0.0, + "num_input_tokens_seen": 74239016, + "step": 127965 + }, + { + "epoch": 19.060172773309503, + "grad_norm": 2.4716740881558508e-05, + "learning_rate": 3.3567897716275663e-07, + "loss": 0.0, + "num_input_tokens_seen": 74241992, + "step": 127970 + }, + { + "epoch": 19.06091748585046, + "grad_norm": 0.001827824511565268, + "learning_rate": 3.351484839513891e-07, + "loss": 0.0, + "num_input_tokens_seen": 74244584, + "step": 127975 + }, + { + "epoch": 19.06166219839142, + "grad_norm": 2.0001411940029357e-06, + "learning_rate": 3.3461840742761707e-07, + "loss": 0.0, + "num_input_tokens_seen": 74247464, + "step": 127980 + }, + { + "epoch": 19.06240691093238, + "grad_norm": 7.0055361902632285e-06, + "learning_rate": 3.340887476004001e-07, + "loss": 0.0, + "num_input_tokens_seen": 74250632, + "step": 127985 + }, + { + "epoch": 19.06315162347334, + "grad_norm": 2.553786544012837e-06, + "learning_rate": 3.3355950447868657e-07, + "loss": 0.0, + "num_input_tokens_seen": 74253704, + "step": 127990 + }, + { + "epoch": 19.0638963360143, + "grad_norm": 1.3874507203581743e-05, + "learning_rate": 3.3303067807141095e-07, + "loss": 0.0, + "num_input_tokens_seen": 74257000, + "step": 127995 + }, + { + "epoch": 19.06464104855526, + "grad_norm": 2.5445519895583857e-06, + "learning_rate": 3.325022683875162e-07, + "loss": 0.0, + "num_input_tokens_seen": 74259720, + "step": 128000 + }, + { + "epoch": 19.065385761096216, + "grad_norm": 2.2049589460948482e-05, + "learning_rate": 3.319742754359201e-07, + "loss": 0.0, + "num_input_tokens_seen": 74262600, + "step": 128005 + }, + { + "epoch": 19.066130473637177, + "grad_norm": 3.2950497370620724e-06, + "learning_rate": 3.314466992255516e-07, + "loss": 0.0, + "num_input_tokens_seen": 74265480, + "step": 128010 + }, + { + "epoch": 19.066875186178134, + "grad_norm": 6.007137471897295e-06, + "learning_rate": 3.309195397653148e-07, + "loss": 0.0, + "num_input_tokens_seen": 74268648, + "step": 128015 + }, + { + "epoch": 19.067619898719094, + "grad_norm": 2.346228711758158e-06, + "learning_rate": 3.3039279706412465e-07, + "loss": 0.0, + "num_input_tokens_seen": 74271656, + "step": 128020 + }, + { + "epoch": 19.068364611260055, + "grad_norm": 0.0001554804330226034, + "learning_rate": 3.2986647113087134e-07, + "loss": 0.0, + "num_input_tokens_seen": 74274536, + "step": 128025 + }, + { + "epoch": 19.069109323801012, + "grad_norm": 2.915144023063476e-06, + "learning_rate": 3.293405619744533e-07, + "loss": 0.0, + "num_input_tokens_seen": 74277256, + "step": 128030 + }, + { + "epoch": 19.069854036341972, + "grad_norm": 0.0002391910384176299, + "learning_rate": 3.288150696037523e-07, + "loss": 0.0, + "num_input_tokens_seen": 74280040, + "step": 128035 + }, + { + "epoch": 19.070598748882933, + "grad_norm": 6.08567916060565e-06, + "learning_rate": 3.282899940276418e-07, + "loss": 0.0, + "num_input_tokens_seen": 74282952, + "step": 128040 + }, + { + "epoch": 19.07134346142389, + "grad_norm": 1.2339558452367783e-05, + "learning_rate": 3.2776533525500085e-07, + "loss": 0.0, + "num_input_tokens_seen": 74285800, + "step": 128045 + }, + { + "epoch": 19.07208817396485, + "grad_norm": 3.981318059231853e-06, + "learning_rate": 3.272410932946862e-07, + "loss": 0.0, + "num_input_tokens_seen": 74288616, + "step": 128050 + }, + { + "epoch": 19.072832886505807, + "grad_norm": 0.00015190574049483985, + "learning_rate": 3.267172681555575e-07, + "loss": 0.0, + "num_input_tokens_seen": 74291720, + "step": 128055 + }, + { + "epoch": 19.073577599046768, + "grad_norm": 1.0738168954849243, + "learning_rate": 3.261938598464631e-07, + "loss": 0.0016, + "num_input_tokens_seen": 74294664, + "step": 128060 + }, + { + "epoch": 19.07432231158773, + "grad_norm": 2.8098402253817767e-06, + "learning_rate": 3.256708683762488e-07, + "loss": 0.0, + "num_input_tokens_seen": 74297480, + "step": 128065 + }, + { + "epoch": 19.075067024128685, + "grad_norm": 3.4445540677552344e-06, + "learning_rate": 3.2514829375374643e-07, + "loss": 0.0, + "num_input_tokens_seen": 74300328, + "step": 128070 + }, + { + "epoch": 19.075811736669646, + "grad_norm": 1.5419310557263088e-06, + "learning_rate": 3.24626135987785e-07, + "loss": 0.0, + "num_input_tokens_seen": 74303048, + "step": 128075 + }, + { + "epoch": 19.076556449210603, + "grad_norm": 1.1731337508535944e-05, + "learning_rate": 3.2410439508718527e-07, + "loss": 0.0, + "num_input_tokens_seen": 74305736, + "step": 128080 + }, + { + "epoch": 19.077301161751564, + "grad_norm": 2.8324921004241332e-05, + "learning_rate": 3.2358307106076234e-07, + "loss": 0.0, + "num_input_tokens_seen": 74308488, + "step": 128085 + }, + { + "epoch": 19.078045874292524, + "grad_norm": 3.597711611291743e-06, + "learning_rate": 3.2306216391732593e-07, + "loss": 0.0, + "num_input_tokens_seen": 74311144, + "step": 128090 + }, + { + "epoch": 19.07879058683348, + "grad_norm": 1.0651113370840903e-05, + "learning_rate": 3.225416736656689e-07, + "loss": 0.0, + "num_input_tokens_seen": 74313864, + "step": 128095 + }, + { + "epoch": 19.07953529937444, + "grad_norm": 2.262580892420374e-06, + "learning_rate": 3.220216003145926e-07, + "loss": 0.0, + "num_input_tokens_seen": 74316904, + "step": 128100 + }, + { + "epoch": 19.080280011915402, + "grad_norm": 1.616999361431226e-05, + "learning_rate": 3.215019438728789e-07, + "loss": 0.0, + "num_input_tokens_seen": 74319464, + "step": 128105 + }, + { + "epoch": 19.08102472445636, + "grad_norm": 3.945960543205729e-06, + "learning_rate": 3.209827043493097e-07, + "loss": 0.0, + "num_input_tokens_seen": 74322216, + "step": 128110 + }, + { + "epoch": 19.08176943699732, + "grad_norm": 4.984462066204287e-06, + "learning_rate": 3.204638817526528e-07, + "loss": 0.0, + "num_input_tokens_seen": 74325064, + "step": 128115 + }, + { + "epoch": 19.082514149538277, + "grad_norm": 2.6908942345471587e-06, + "learning_rate": 3.1994547609167644e-07, + "loss": 0.0, + "num_input_tokens_seen": 74327880, + "step": 128120 + }, + { + "epoch": 19.083258862079237, + "grad_norm": 1.370002792100422e-05, + "learning_rate": 3.194274873751374e-07, + "loss": 0.0, + "num_input_tokens_seen": 74330888, + "step": 128125 + }, + { + "epoch": 19.084003574620198, + "grad_norm": 4.651948074751999e-06, + "learning_rate": 3.189099156117842e-07, + "loss": 0.0, + "num_input_tokens_seen": 74333704, + "step": 128130 + }, + { + "epoch": 19.084748287161155, + "grad_norm": 5.224212145549245e-06, + "learning_rate": 3.1839276081036816e-07, + "loss": 0.0284, + "num_input_tokens_seen": 74336424, + "step": 128135 + }, + { + "epoch": 19.085492999702115, + "grad_norm": 6.372964708134532e-05, + "learning_rate": 3.1787602297961574e-07, + "loss": 0.0, + "num_input_tokens_seen": 74339272, + "step": 128140 + }, + { + "epoch": 19.086237712243076, + "grad_norm": 5.103949661133811e-06, + "learning_rate": 3.1735970212826705e-07, + "loss": 0.0, + "num_input_tokens_seen": 74342280, + "step": 128145 + }, + { + "epoch": 19.086982424784033, + "grad_norm": 3.158265826641582e-06, + "learning_rate": 3.168437982650374e-07, + "loss": 0.0, + "num_input_tokens_seen": 74345224, + "step": 128150 + }, + { + "epoch": 19.087727137324993, + "grad_norm": 1.985715016417089e-06, + "learning_rate": 3.1632831139864763e-07, + "loss": 0.0, + "num_input_tokens_seen": 74348040, + "step": 128155 + }, + { + "epoch": 19.08847184986595, + "grad_norm": 2.6757979867397808e-05, + "learning_rate": 3.158132415378018e-07, + "loss": 0.0, + "num_input_tokens_seen": 74350760, + "step": 128160 + }, + { + "epoch": 19.08921656240691, + "grad_norm": 3.7627614801749587e-06, + "learning_rate": 3.152985886912013e-07, + "loss": 0.0, + "num_input_tokens_seen": 74353672, + "step": 128165 + }, + { + "epoch": 19.08996127494787, + "grad_norm": 5.253903964330675e-06, + "learning_rate": 3.1478435286754483e-07, + "loss": 0.0, + "num_input_tokens_seen": 74356424, + "step": 128170 + }, + { + "epoch": 19.09070598748883, + "grad_norm": 3.1841351301409304e-05, + "learning_rate": 3.14270534075517e-07, + "loss": 0.0, + "num_input_tokens_seen": 74359304, + "step": 128175 + }, + { + "epoch": 19.09145070002979, + "grad_norm": 1.6082346974144457e-06, + "learning_rate": 3.137571323237998e-07, + "loss": 0.0, + "num_input_tokens_seen": 74362120, + "step": 128180 + }, + { + "epoch": 19.09219541257075, + "grad_norm": 4.695561437983997e-05, + "learning_rate": 3.13244147621064e-07, + "loss": 0.0, + "num_input_tokens_seen": 74364872, + "step": 128185 + }, + { + "epoch": 19.092940125111706, + "grad_norm": 3.6731589716509916e-06, + "learning_rate": 3.1273157997598056e-07, + "loss": 0.0, + "num_input_tokens_seen": 74367624, + "step": 128190 + }, + { + "epoch": 19.093684837652667, + "grad_norm": 7.931688742246479e-06, + "learning_rate": 3.122194293972064e-07, + "loss": 0.0, + "num_input_tokens_seen": 74370440, + "step": 128195 + }, + { + "epoch": 19.094429550193624, + "grad_norm": 1.3911836504121311e-05, + "learning_rate": 3.117076958933901e-07, + "loss": 0.0, + "num_input_tokens_seen": 74373320, + "step": 128200 + }, + { + "epoch": 19.095174262734584, + "grad_norm": 5.4902920965105295e-05, + "learning_rate": 3.111963794731831e-07, + "loss": 0.0, + "num_input_tokens_seen": 74376520, + "step": 128205 + }, + { + "epoch": 19.095918975275545, + "grad_norm": 1.7536120139993727e-05, + "learning_rate": 3.106854801452175e-07, + "loss": 0.0, + "num_input_tokens_seen": 74379272, + "step": 128210 + }, + { + "epoch": 19.096663687816502, + "grad_norm": 1.8438936422171537e-06, + "learning_rate": 3.1017499791813067e-07, + "loss": 0.0, + "num_input_tokens_seen": 74382088, + "step": 128215 + }, + { + "epoch": 19.097408400357462, + "grad_norm": 3.1233828394761076e-06, + "learning_rate": 3.096649328005435e-07, + "loss": 0.0, + "num_input_tokens_seen": 74384968, + "step": 128220 + }, + { + "epoch": 19.098153112898423, + "grad_norm": 4.563776201393921e-06, + "learning_rate": 3.091552848010715e-07, + "loss": 0.0, + "num_input_tokens_seen": 74388072, + "step": 128225 + }, + { + "epoch": 19.09889782543938, + "grad_norm": 1.1123392141598742e-05, + "learning_rate": 3.08646053928327e-07, + "loss": 0.0, + "num_input_tokens_seen": 74390760, + "step": 128230 + }, + { + "epoch": 19.09964253798034, + "grad_norm": 4.11446308135055e-06, + "learning_rate": 3.081372401909116e-07, + "loss": 0.0, + "num_input_tokens_seen": 74393448, + "step": 128235 + }, + { + "epoch": 19.100387250521297, + "grad_norm": 8.419177902396768e-06, + "learning_rate": 3.076288435974239e-07, + "loss": 0.0, + "num_input_tokens_seen": 74396296, + "step": 128240 + }, + { + "epoch": 19.101131963062258, + "grad_norm": 4.1582757148717064e-06, + "learning_rate": 3.071208641564488e-07, + "loss": 0.0, + "num_input_tokens_seen": 74399432, + "step": 128245 + }, + { + "epoch": 19.10187667560322, + "grad_norm": 6.348310762405163e-06, + "learning_rate": 3.06613301876571e-07, + "loss": 0.0, + "num_input_tokens_seen": 74402184, + "step": 128250 + }, + { + "epoch": 19.102621388144176, + "grad_norm": 1.3021221093367785e-05, + "learning_rate": 3.0610615676636144e-07, + "loss": 0.0, + "num_input_tokens_seen": 74404904, + "step": 128255 + }, + { + "epoch": 19.103366100685136, + "grad_norm": 2.9708544388995506e-06, + "learning_rate": 3.0559942883439387e-07, + "loss": 0.0, + "num_input_tokens_seen": 74407848, + "step": 128260 + }, + { + "epoch": 19.104110813226093, + "grad_norm": 1.3736051187152043e-05, + "learning_rate": 3.0509311808922526e-07, + "loss": 0.0, + "num_input_tokens_seen": 74410664, + "step": 128265 + }, + { + "epoch": 19.104855525767054, + "grad_norm": 2.299170546393725e-06, + "learning_rate": 3.045872245394099e-07, + "loss": 0.0402, + "num_input_tokens_seen": 74413544, + "step": 128270 + }, + { + "epoch": 19.105600238308014, + "grad_norm": 2.473879476383445e-06, + "learning_rate": 3.0408174819349377e-07, + "loss": 0.0, + "num_input_tokens_seen": 74416872, + "step": 128275 + }, + { + "epoch": 19.10634495084897, + "grad_norm": 3.4205615520477295e-05, + "learning_rate": 3.035766890600145e-07, + "loss": 0.0034, + "num_input_tokens_seen": 74419720, + "step": 128280 + }, + { + "epoch": 19.10708966338993, + "grad_norm": 0.0023904494009912014, + "learning_rate": 3.030720471475096e-07, + "loss": 0.0, + "num_input_tokens_seen": 74422664, + "step": 128285 + }, + { + "epoch": 19.107834375930892, + "grad_norm": 2.728572553678532e-06, + "learning_rate": 3.025678224645001e-07, + "loss": 0.0, + "num_input_tokens_seen": 74425544, + "step": 128290 + }, + { + "epoch": 19.10857908847185, + "grad_norm": 1.7096737792599015e-05, + "learning_rate": 3.02064015019507e-07, + "loss": 0.0, + "num_input_tokens_seen": 74427880, + "step": 128295 + }, + { + "epoch": 19.10932380101281, + "grad_norm": 3.621730456870864e-06, + "learning_rate": 3.015606248210401e-07, + "loss": 0.0, + "num_input_tokens_seen": 74430568, + "step": 128300 + }, + { + "epoch": 19.110068513553767, + "grad_norm": 3.004392510774778e-06, + "learning_rate": 3.010576518776037e-07, + "loss": 0.0, + "num_input_tokens_seen": 74433416, + "step": 128305 + }, + { + "epoch": 19.110813226094727, + "grad_norm": 2.5912631826940924e-06, + "learning_rate": 3.005550961976938e-07, + "loss": 0.0, + "num_input_tokens_seen": 74436136, + "step": 128310 + }, + { + "epoch": 19.111557938635688, + "grad_norm": 0.001950386562384665, + "learning_rate": 3.0005295778980647e-07, + "loss": 0.0, + "num_input_tokens_seen": 74438824, + "step": 128315 + }, + { + "epoch": 19.112302651176645, + "grad_norm": 1.5931454981910065e-05, + "learning_rate": 2.9955123666241814e-07, + "loss": 0.0, + "num_input_tokens_seen": 74441896, + "step": 128320 + }, + { + "epoch": 19.113047363717605, + "grad_norm": 2.8310780635365518e-06, + "learning_rate": 2.990499328240054e-07, + "loss": 0.0, + "num_input_tokens_seen": 74444840, + "step": 128325 + }, + { + "epoch": 19.113792076258566, + "grad_norm": 3.167307340845582e-06, + "learning_rate": 2.9854904628304206e-07, + "loss": 0.0, + "num_input_tokens_seen": 74447720, + "step": 128330 + }, + { + "epoch": 19.114536788799523, + "grad_norm": 3.6879837352898903e-06, + "learning_rate": 2.980485770479824e-07, + "loss": 0.0, + "num_input_tokens_seen": 74450536, + "step": 128335 + }, + { + "epoch": 19.115281501340483, + "grad_norm": 1.419092859578086e-05, + "learning_rate": 2.975485251272919e-07, + "loss": 0.0, + "num_input_tokens_seen": 74453512, + "step": 128340 + }, + { + "epoch": 19.11602621388144, + "grad_norm": 2.303440624018549e-06, + "learning_rate": 2.970488905294083e-07, + "loss": 0.0, + "num_input_tokens_seen": 74456296, + "step": 128345 + }, + { + "epoch": 19.1167709264224, + "grad_norm": 2.6609586711856537e-06, + "learning_rate": 2.965496732627804e-07, + "loss": 0.0, + "num_input_tokens_seen": 74459464, + "step": 128350 + }, + { + "epoch": 19.11751563896336, + "grad_norm": 4.3657828427967615e-06, + "learning_rate": 2.960508733358375e-07, + "loss": 0.0, + "num_input_tokens_seen": 74462376, + "step": 128355 + }, + { + "epoch": 19.11826035150432, + "grad_norm": 2.4810833565425128e-06, + "learning_rate": 2.955524907570062e-07, + "loss": 0.0, + "num_input_tokens_seen": 74465352, + "step": 128360 + }, + { + "epoch": 19.11900506404528, + "grad_norm": 2.4187652343243826e-06, + "learning_rate": 2.950545255347076e-07, + "loss": 0.0, + "num_input_tokens_seen": 74468136, + "step": 128365 + }, + { + "epoch": 19.11974977658624, + "grad_norm": 2.806589009196614e-06, + "learning_rate": 2.9455697767735155e-07, + "loss": 0.0, + "num_input_tokens_seen": 74470856, + "step": 128370 + }, + { + "epoch": 19.120494489127196, + "grad_norm": 2.0539112028927775e-06, + "learning_rate": 2.9405984719334814e-07, + "loss": 0.0, + "num_input_tokens_seen": 74473960, + "step": 128375 + }, + { + "epoch": 19.121239201668157, + "grad_norm": 3.3133753731817706e-06, + "learning_rate": 2.935631340910933e-07, + "loss": 0.0, + "num_input_tokens_seen": 74476872, + "step": 128380 + }, + { + "epoch": 19.121983914209114, + "grad_norm": 4.481935320654884e-06, + "learning_rate": 2.930668383789775e-07, + "loss": 0.0, + "num_input_tokens_seen": 74479688, + "step": 128385 + }, + { + "epoch": 19.122728626750074, + "grad_norm": 1.9671932022902183e-05, + "learning_rate": 2.925709600653859e-07, + "loss": 0.0, + "num_input_tokens_seen": 74482536, + "step": 128390 + }, + { + "epoch": 19.123473339291035, + "grad_norm": 2.2297683699434856e-06, + "learning_rate": 2.9207549915870045e-07, + "loss": 0.0, + "num_input_tokens_seen": 74485384, + "step": 128395 + }, + { + "epoch": 19.124218051831992, + "grad_norm": 0.2119474709033966, + "learning_rate": 2.915804556672841e-07, + "loss": 0.0, + "num_input_tokens_seen": 74488392, + "step": 128400 + }, + { + "epoch": 19.124962764372953, + "grad_norm": 2.1498992737178924e-06, + "learning_rate": 2.9108582959950504e-07, + "loss": 0.0, + "num_input_tokens_seen": 74491144, + "step": 128405 + }, + { + "epoch": 19.12570747691391, + "grad_norm": 2.1146559447515756e-05, + "learning_rate": 2.9059162096371773e-07, + "loss": 0.0, + "num_input_tokens_seen": 74494056, + "step": 128410 + }, + { + "epoch": 19.12645218945487, + "grad_norm": 4.161362085142173e-06, + "learning_rate": 2.9009782976827106e-07, + "loss": 0.0, + "num_input_tokens_seen": 74496968, + "step": 128415 + }, + { + "epoch": 19.12719690199583, + "grad_norm": 3.472368234724854e-06, + "learning_rate": 2.896044560215083e-07, + "loss": 0.0, + "num_input_tokens_seen": 74499688, + "step": 128420 + }, + { + "epoch": 19.127941614536788, + "grad_norm": 1.3664039215655066e-05, + "learning_rate": 2.891114997317618e-07, + "loss": 0.0, + "num_input_tokens_seen": 74502536, + "step": 128425 + }, + { + "epoch": 19.128686327077748, + "grad_norm": 2.2632486434304155e-05, + "learning_rate": 2.8861896090736365e-07, + "loss": 0.0, + "num_input_tokens_seen": 74505640, + "step": 128430 + }, + { + "epoch": 19.12943103961871, + "grad_norm": 1.5187240478553576e-06, + "learning_rate": 2.881268395566322e-07, + "loss": 0.0, + "num_input_tokens_seen": 74508520, + "step": 128435 + }, + { + "epoch": 19.130175752159666, + "grad_norm": 2.994857368321391e-06, + "learning_rate": 2.8763513568788036e-07, + "loss": 0.0, + "num_input_tokens_seen": 74511624, + "step": 128440 + }, + { + "epoch": 19.130920464700626, + "grad_norm": 6.46641501589329e-06, + "learning_rate": 2.871438493094153e-07, + "loss": 0.0, + "num_input_tokens_seen": 74514632, + "step": 128445 + }, + { + "epoch": 19.131665177241583, + "grad_norm": 8.37757033878006e-06, + "learning_rate": 2.866529804295387e-07, + "loss": 0.0, + "num_input_tokens_seen": 74517960, + "step": 128450 + }, + { + "epoch": 19.132409889782544, + "grad_norm": 2.2115847968962044e-05, + "learning_rate": 2.8616252905654393e-07, + "loss": 0.0, + "num_input_tokens_seen": 74520776, + "step": 128455 + }, + { + "epoch": 19.133154602323504, + "grad_norm": 0.0013441715855151415, + "learning_rate": 2.856724951987161e-07, + "loss": 0.0, + "num_input_tokens_seen": 74523720, + "step": 128460 + }, + { + "epoch": 19.13389931486446, + "grad_norm": 2.634249085531337e-06, + "learning_rate": 2.851828788643318e-07, + "loss": 0.0, + "num_input_tokens_seen": 74526696, + "step": 128465 + }, + { + "epoch": 19.13464402740542, + "grad_norm": 5.172092187422095e-06, + "learning_rate": 2.846936800616623e-07, + "loss": 0.0, + "num_input_tokens_seen": 74529576, + "step": 128470 + }, + { + "epoch": 19.135388739946382, + "grad_norm": 1.367740424029762e-05, + "learning_rate": 2.8420489879897595e-07, + "loss": 0.0, + "num_input_tokens_seen": 74532648, + "step": 128475 + }, + { + "epoch": 19.13613345248734, + "grad_norm": 5.8912584790959954e-05, + "learning_rate": 2.8371653508452725e-07, + "loss": 0.0, + "num_input_tokens_seen": 74535656, + "step": 128480 + }, + { + "epoch": 19.1368781650283, + "grad_norm": 8.307467942358926e-05, + "learning_rate": 2.832285889265651e-07, + "loss": 0.0, + "num_input_tokens_seen": 74538600, + "step": 128485 + }, + { + "epoch": 19.137622877569257, + "grad_norm": 2.9432510473270668e-06, + "learning_rate": 2.827410603333386e-07, + "loss": 0.0, + "num_input_tokens_seen": 74541384, + "step": 128490 + }, + { + "epoch": 19.138367590110217, + "grad_norm": 3.488618312985636e-05, + "learning_rate": 2.8225394931307715e-07, + "loss": 0.0, + "num_input_tokens_seen": 74544136, + "step": 128495 + }, + { + "epoch": 19.139112302651178, + "grad_norm": 5.775677891506348e-06, + "learning_rate": 2.817672558740131e-07, + "loss": 0.0, + "num_input_tokens_seen": 74547048, + "step": 128500 + }, + { + "epoch": 19.139857015192135, + "grad_norm": 4.671540409617592e-06, + "learning_rate": 2.812809800243704e-07, + "loss": 0.0, + "num_input_tokens_seen": 74549928, + "step": 128505 + }, + { + "epoch": 19.140601727733095, + "grad_norm": 2.4903256417019293e-06, + "learning_rate": 2.80795121772362e-07, + "loss": 0.0, + "num_input_tokens_seen": 74553064, + "step": 128510 + }, + { + "epoch": 19.141346440274056, + "grad_norm": 1.7592154108569957e-05, + "learning_rate": 2.803096811261979e-07, + "loss": 0.0, + "num_input_tokens_seen": 74556072, + "step": 128515 + }, + { + "epoch": 19.142091152815013, + "grad_norm": 5.840363428433193e-06, + "learning_rate": 2.7982465809407443e-07, + "loss": 0.0, + "num_input_tokens_seen": 74559016, + "step": 128520 + }, + { + "epoch": 19.142835865355973, + "grad_norm": 3.211847570128157e-06, + "learning_rate": 2.793400526841933e-07, + "loss": 0.0, + "num_input_tokens_seen": 74561672, + "step": 128525 + }, + { + "epoch": 19.14358057789693, + "grad_norm": 4.3946431105723605e-05, + "learning_rate": 2.7885586490473127e-07, + "loss": 0.0, + "num_input_tokens_seen": 74564712, + "step": 128530 + }, + { + "epoch": 19.14432529043789, + "grad_norm": 1.4585385542886797e-05, + "learning_rate": 2.7837209476387903e-07, + "loss": 0.0, + "num_input_tokens_seen": 74567624, + "step": 128535 + }, + { + "epoch": 19.14507000297885, + "grad_norm": 3.51852759195026e-06, + "learning_rate": 2.7788874226980233e-07, + "loss": 0.0, + "num_input_tokens_seen": 74570344, + "step": 128540 + }, + { + "epoch": 19.14581471551981, + "grad_norm": 1.9712988432729617e-05, + "learning_rate": 2.774058074306696e-07, + "loss": 0.0, + "num_input_tokens_seen": 74573160, + "step": 128545 + }, + { + "epoch": 19.14655942806077, + "grad_norm": 3.7201277791609755e-06, + "learning_rate": 2.7692329025463816e-07, + "loss": 0.0, + "num_input_tokens_seen": 74576072, + "step": 128550 + }, + { + "epoch": 19.14730414060173, + "grad_norm": 0.0011402575764805079, + "learning_rate": 2.7644119074986263e-07, + "loss": 0.0, + "num_input_tokens_seen": 74578760, + "step": 128555 + }, + { + "epoch": 19.148048853142686, + "grad_norm": 3.976809693995165e-06, + "learning_rate": 2.7595950892448374e-07, + "loss": 0.0, + "num_input_tokens_seen": 74581768, + "step": 128560 + }, + { + "epoch": 19.148793565683647, + "grad_norm": 3.065590499318205e-05, + "learning_rate": 2.754782447866394e-07, + "loss": 0.0, + "num_input_tokens_seen": 74584712, + "step": 128565 + }, + { + "epoch": 19.149538278224604, + "grad_norm": 2.583684590717894e-06, + "learning_rate": 2.7499739834446204e-07, + "loss": 0.0, + "num_input_tokens_seen": 74587720, + "step": 128570 + }, + { + "epoch": 19.150282990765565, + "grad_norm": 4.494183031056309e-06, + "learning_rate": 2.745169696060729e-07, + "loss": 0.0, + "num_input_tokens_seen": 74590632, + "step": 128575 + }, + { + "epoch": 19.151027703306525, + "grad_norm": 9.495973245066125e-06, + "learning_rate": 2.7403695857959046e-07, + "loss": 0.0, + "num_input_tokens_seen": 74593640, + "step": 128580 + }, + { + "epoch": 19.151772415847482, + "grad_norm": 2.570953256508801e-06, + "learning_rate": 2.735573652731249e-07, + "loss": 0.0, + "num_input_tokens_seen": 74596712, + "step": 128585 + }, + { + "epoch": 19.152517128388443, + "grad_norm": 2.9815619200235233e-06, + "learning_rate": 2.730781896947754e-07, + "loss": 0.0, + "num_input_tokens_seen": 74599944, + "step": 128590 + }, + { + "epoch": 19.1532618409294, + "grad_norm": 2.0563504222081974e-05, + "learning_rate": 2.7259943185263813e-07, + "loss": 0.0, + "num_input_tokens_seen": 74603080, + "step": 128595 + }, + { + "epoch": 19.15400655347036, + "grad_norm": 7.870693480072077e-06, + "learning_rate": 2.7212109175480114e-07, + "loss": 0.0, + "num_input_tokens_seen": 74606088, + "step": 128600 + }, + { + "epoch": 19.15475126601132, + "grad_norm": 2.9542201446020044e-05, + "learning_rate": 2.7164316940934966e-07, + "loss": 0.0, + "num_input_tokens_seen": 74609096, + "step": 128605 + }, + { + "epoch": 19.155495978552278, + "grad_norm": 0.00011688993981806561, + "learning_rate": 2.7116566482434936e-07, + "loss": 0.0, + "num_input_tokens_seen": 74612072, + "step": 128610 + }, + { + "epoch": 19.156240691093238, + "grad_norm": 1.2223085832374636e-05, + "learning_rate": 2.706885780078744e-07, + "loss": 0.0, + "num_input_tokens_seen": 74614952, + "step": 128615 + }, + { + "epoch": 19.1569854036342, + "grad_norm": 1.9954883100581355e-05, + "learning_rate": 2.7021190896798223e-07, + "loss": 0.0, + "num_input_tokens_seen": 74617960, + "step": 128620 + }, + { + "epoch": 19.157730116175156, + "grad_norm": 6.894698162795976e-05, + "learning_rate": 2.6973565771272746e-07, + "loss": 0.0, + "num_input_tokens_seen": 74620872, + "step": 128625 + }, + { + "epoch": 19.158474828716116, + "grad_norm": 2.058335894616903e-06, + "learning_rate": 2.6925982425015097e-07, + "loss": 0.0, + "num_input_tokens_seen": 74623848, + "step": 128630 + }, + { + "epoch": 19.159219541257073, + "grad_norm": 1.097950189432595e-05, + "learning_rate": 2.6878440858829626e-07, + "loss": 0.0, + "num_input_tokens_seen": 74626856, + "step": 128635 + }, + { + "epoch": 19.159964253798034, + "grad_norm": 3.668361159725464e-06, + "learning_rate": 2.68309410735193e-07, + "loss": 0.0, + "num_input_tokens_seen": 74629352, + "step": 128640 + }, + { + "epoch": 19.160708966338994, + "grad_norm": 3.7466006688191555e-06, + "learning_rate": 2.678348306988626e-07, + "loss": 0.0, + "num_input_tokens_seen": 74632072, + "step": 128645 + }, + { + "epoch": 19.16145367887995, + "grad_norm": 9.47735952649964e-06, + "learning_rate": 2.67360668487332e-07, + "loss": 0.0, + "num_input_tokens_seen": 74634760, + "step": 128650 + }, + { + "epoch": 19.16219839142091, + "grad_norm": 3.3515509585413383e-06, + "learning_rate": 2.6688692410860025e-07, + "loss": 0.0, + "num_input_tokens_seen": 74637672, + "step": 128655 + }, + { + "epoch": 19.162943103961872, + "grad_norm": 3.988474418292753e-06, + "learning_rate": 2.664135975706805e-07, + "loss": 0.0, + "num_input_tokens_seen": 74640680, + "step": 128660 + }, + { + "epoch": 19.16368781650283, + "grad_norm": 3.333231916258228e-06, + "learning_rate": 2.659406888815608e-07, + "loss": 0.0, + "num_input_tokens_seen": 74643464, + "step": 128665 + }, + { + "epoch": 19.16443252904379, + "grad_norm": 6.620759904762963e-06, + "learning_rate": 2.6546819804923737e-07, + "loss": 0.0, + "num_input_tokens_seen": 74646280, + "step": 128670 + }, + { + "epoch": 19.165177241584747, + "grad_norm": 3.3946048461075407e-06, + "learning_rate": 2.6499612508169016e-07, + "loss": 0.0, + "num_input_tokens_seen": 74649224, + "step": 128675 + }, + { + "epoch": 19.165921954125707, + "grad_norm": 0.003455881495028734, + "learning_rate": 2.645244699868932e-07, + "loss": 0.0, + "num_input_tokens_seen": 74651912, + "step": 128680 + }, + { + "epoch": 19.166666666666668, + "grad_norm": 4.319991148804547e-06, + "learning_rate": 2.6405323277281514e-07, + "loss": 0.0, + "num_input_tokens_seen": 74654824, + "step": 128685 + }, + { + "epoch": 19.167411379207625, + "grad_norm": 2.586589744169032e-06, + "learning_rate": 2.6358241344741906e-07, + "loss": 0.0, + "num_input_tokens_seen": 74657544, + "step": 128690 + }, + { + "epoch": 19.168156091748585, + "grad_norm": 1.729411565065675e-06, + "learning_rate": 2.6311201201865423e-07, + "loss": 0.0, + "num_input_tokens_seen": 74660392, + "step": 128695 + }, + { + "epoch": 19.168900804289546, + "grad_norm": 3.455942760410835e-06, + "learning_rate": 2.626420284944725e-07, + "loss": 0.0, + "num_input_tokens_seen": 74663368, + "step": 128700 + }, + { + "epoch": 19.169645516830503, + "grad_norm": 2.5945575998775894e-06, + "learning_rate": 2.6217246288281205e-07, + "loss": 0.0, + "num_input_tokens_seen": 74666312, + "step": 128705 + }, + { + "epoch": 19.170390229371463, + "grad_norm": 2.9695202101720497e-06, + "learning_rate": 2.6170331519160264e-07, + "loss": 0.0, + "num_input_tokens_seen": 74669256, + "step": 128710 + }, + { + "epoch": 19.17113494191242, + "grad_norm": 3.4805783798219636e-05, + "learning_rate": 2.61234585428774e-07, + "loss": 0.0, + "num_input_tokens_seen": 74672168, + "step": 128715 + }, + { + "epoch": 19.17187965445338, + "grad_norm": 2.8339240998320747e-06, + "learning_rate": 2.607662736022448e-07, + "loss": 0.0, + "num_input_tokens_seen": 74675176, + "step": 128720 + }, + { + "epoch": 19.17262436699434, + "grad_norm": 3.522162660374306e-05, + "learning_rate": 2.6029837971992545e-07, + "loss": 0.0, + "num_input_tokens_seen": 74678216, + "step": 128725 + }, + { + "epoch": 19.1733690795353, + "grad_norm": 1.9775075088546146e-06, + "learning_rate": 2.5983090378972064e-07, + "loss": 0.0, + "num_input_tokens_seen": 74681256, + "step": 128730 + }, + { + "epoch": 19.17411379207626, + "grad_norm": 6.561544432770461e-05, + "learning_rate": 2.5936384581952686e-07, + "loss": 0.0, + "num_input_tokens_seen": 74684136, + "step": 128735 + }, + { + "epoch": 19.17485850461722, + "grad_norm": 1.052115112543106e-05, + "learning_rate": 2.5889720581723506e-07, + "loss": 0.0, + "num_input_tokens_seen": 74687496, + "step": 128740 + }, + { + "epoch": 19.175603217158177, + "grad_norm": 3.3211445042979904e-06, + "learning_rate": 2.584309837907306e-07, + "loss": 0.0, + "num_input_tokens_seen": 74690312, + "step": 128745 + }, + { + "epoch": 19.176347929699137, + "grad_norm": 4.6118257159832865e-06, + "learning_rate": 2.5796517974789045e-07, + "loss": 0.0, + "num_input_tokens_seen": 74693256, + "step": 128750 + }, + { + "epoch": 19.177092642240094, + "grad_norm": 5.310519100021338e-06, + "learning_rate": 2.5749979369657783e-07, + "loss": 0.0, + "num_input_tokens_seen": 74696392, + "step": 128755 + }, + { + "epoch": 19.177837354781055, + "grad_norm": 1.6766463886597194e-05, + "learning_rate": 2.570348256446614e-07, + "loss": 0.0, + "num_input_tokens_seen": 74699432, + "step": 128760 + }, + { + "epoch": 19.178582067322015, + "grad_norm": 3.609705527196638e-05, + "learning_rate": 2.5657027559999327e-07, + "loss": 0.0, + "num_input_tokens_seen": 74702376, + "step": 128765 + }, + { + "epoch": 19.179326779862972, + "grad_norm": 3.335216661071172e-06, + "learning_rate": 2.561061435704226e-07, + "loss": 0.0, + "num_input_tokens_seen": 74705576, + "step": 128770 + }, + { + "epoch": 19.180071492403933, + "grad_norm": 1.3036494237894658e-05, + "learning_rate": 2.556424295637905e-07, + "loss": 0.0, + "num_input_tokens_seen": 74708552, + "step": 128775 + }, + { + "epoch": 19.18081620494489, + "grad_norm": 3.341528554301476e-06, + "learning_rate": 2.5517913358792945e-07, + "loss": 0.0, + "num_input_tokens_seen": 74711336, + "step": 128780 + }, + { + "epoch": 19.18156091748585, + "grad_norm": 0.00014167018525768071, + "learning_rate": 2.547162556506694e-07, + "loss": 0.0, + "num_input_tokens_seen": 74714280, + "step": 128785 + }, + { + "epoch": 19.18230563002681, + "grad_norm": 2.244086135760881e-05, + "learning_rate": 2.5425379575982343e-07, + "loss": 0.0, + "num_input_tokens_seen": 74717224, + "step": 128790 + }, + { + "epoch": 19.183050342567768, + "grad_norm": 7.67695928516332e-06, + "learning_rate": 2.537917539232132e-07, + "loss": 0.0, + "num_input_tokens_seen": 74719880, + "step": 128795 + }, + { + "epoch": 19.183795055108728, + "grad_norm": 4.400527814141242e-06, + "learning_rate": 2.5333013014864073e-07, + "loss": 0.0, + "num_input_tokens_seen": 74722760, + "step": 128800 + }, + { + "epoch": 19.18453976764969, + "grad_norm": 4.238681412971346e-06, + "learning_rate": 2.528689244439025e-07, + "loss": 0.0, + "num_input_tokens_seen": 74725512, + "step": 128805 + }, + { + "epoch": 19.185284480190646, + "grad_norm": 3.000288961629849e-05, + "learning_rate": 2.524081368167924e-07, + "loss": 0.0, + "num_input_tokens_seen": 74728232, + "step": 128810 + }, + { + "epoch": 19.186029192731606, + "grad_norm": 3.86085594072938e-06, + "learning_rate": 2.5194776727509584e-07, + "loss": 0.0, + "num_input_tokens_seen": 74730952, + "step": 128815 + }, + { + "epoch": 19.186773905272563, + "grad_norm": 2.7585642783378717e-06, + "learning_rate": 2.5148781582658986e-07, + "loss": 0.0, + "num_input_tokens_seen": 74733800, + "step": 128820 + }, + { + "epoch": 19.187518617813524, + "grad_norm": 3.432081939536147e-06, + "learning_rate": 2.5102828247904055e-07, + "loss": 0.0, + "num_input_tokens_seen": 74736712, + "step": 128825 + }, + { + "epoch": 19.188263330354484, + "grad_norm": 0.0002978324773721397, + "learning_rate": 2.5056916724021663e-07, + "loss": 0.0, + "num_input_tokens_seen": 74739464, + "step": 128830 + }, + { + "epoch": 19.18900804289544, + "grad_norm": 7.588501375721535e-06, + "learning_rate": 2.5011047011787026e-07, + "loss": 0.0, + "num_input_tokens_seen": 74742088, + "step": 128835 + }, + { + "epoch": 19.189752755436402, + "grad_norm": 2.929514721472515e-06, + "learning_rate": 2.4965219111975635e-07, + "loss": 0.0, + "num_input_tokens_seen": 74745064, + "step": 128840 + }, + { + "epoch": 19.190497467977362, + "grad_norm": 4.706332674686564e-06, + "learning_rate": 2.491943302536104e-07, + "loss": 0.0, + "num_input_tokens_seen": 74747848, + "step": 128845 + }, + { + "epoch": 19.19124218051832, + "grad_norm": 4.0976547097670846e-06, + "learning_rate": 2.487368875271706e-07, + "loss": 0.0, + "num_input_tokens_seen": 74750536, + "step": 128850 + }, + { + "epoch": 19.19198689305928, + "grad_norm": 2.9551533771154936e-06, + "learning_rate": 2.4827986294816696e-07, + "loss": 0.0, + "num_input_tokens_seen": 74753320, + "step": 128855 + }, + { + "epoch": 19.192731605600237, + "grad_norm": 5.846045041835168e-06, + "learning_rate": 2.478232565243183e-07, + "loss": 0.0, + "num_input_tokens_seen": 74755880, + "step": 128860 + }, + { + "epoch": 19.193476318141197, + "grad_norm": 4.161226570431609e-06, + "learning_rate": 2.4736706826333775e-07, + "loss": 0.0, + "num_input_tokens_seen": 74758568, + "step": 128865 + }, + { + "epoch": 19.194221030682158, + "grad_norm": 2.900415893236641e-06, + "learning_rate": 2.4691129817293324e-07, + "loss": 0.0, + "num_input_tokens_seen": 74761480, + "step": 128870 + }, + { + "epoch": 19.194965743223115, + "grad_norm": 2.2664034986519255e-06, + "learning_rate": 2.4645594626080405e-07, + "loss": 0.0, + "num_input_tokens_seen": 74764168, + "step": 128875 + }, + { + "epoch": 19.195710455764075, + "grad_norm": 5.415848136181012e-05, + "learning_rate": 2.460010125346468e-07, + "loss": 0.0, + "num_input_tokens_seen": 74766952, + "step": 128880 + }, + { + "epoch": 19.196455168305036, + "grad_norm": 0.00012071229866705835, + "learning_rate": 2.455464970021415e-07, + "loss": 0.0, + "num_input_tokens_seen": 74769800, + "step": 128885 + }, + { + "epoch": 19.197199880845993, + "grad_norm": 1.3818113984598313e-05, + "learning_rate": 2.450923996709681e-07, + "loss": 0.0, + "num_input_tokens_seen": 74772392, + "step": 128890 + }, + { + "epoch": 19.197944593386953, + "grad_norm": 6.842038146714913e-06, + "learning_rate": 2.446387205487982e-07, + "loss": 0.0, + "num_input_tokens_seen": 74775272, + "step": 128895 + }, + { + "epoch": 19.19868930592791, + "grad_norm": 2.455701178405434e-06, + "learning_rate": 2.44185459643298e-07, + "loss": 0.0, + "num_input_tokens_seen": 74778088, + "step": 128900 + }, + { + "epoch": 19.19943401846887, + "grad_norm": 1.7471951423431165e-06, + "learning_rate": 2.4373261696212237e-07, + "loss": 0.0, + "num_input_tokens_seen": 74780904, + "step": 128905 + }, + { + "epoch": 19.20017873100983, + "grad_norm": 2.646784878379549e-06, + "learning_rate": 2.4328019251292355e-07, + "loss": 0.0, + "num_input_tokens_seen": 74784168, + "step": 128910 + }, + { + "epoch": 19.20092344355079, + "grad_norm": 4.433909907675115e-06, + "learning_rate": 2.4282818630334547e-07, + "loss": 0.0, + "num_input_tokens_seen": 74787176, + "step": 128915 + }, + { + "epoch": 19.20166815609175, + "grad_norm": 4.27806571678957e-06, + "learning_rate": 2.4237659834102364e-07, + "loss": 0.0, + "num_input_tokens_seen": 74790184, + "step": 128920 + }, + { + "epoch": 19.202412868632706, + "grad_norm": 1.1608175555011258e-05, + "learning_rate": 2.4192542863358534e-07, + "loss": 0.0, + "num_input_tokens_seen": 74792936, + "step": 128925 + }, + { + "epoch": 19.203157581173667, + "grad_norm": 1.046496254275553e-05, + "learning_rate": 2.4147467718865227e-07, + "loss": 0.0, + "num_input_tokens_seen": 74795976, + "step": 128930 + }, + { + "epoch": 19.203902293714627, + "grad_norm": 2.8068218398402678e-06, + "learning_rate": 2.410243440138432e-07, + "loss": 0.0, + "num_input_tokens_seen": 74798760, + "step": 128935 + }, + { + "epoch": 19.204647006255584, + "grad_norm": 2.7689188755175564e-06, + "learning_rate": 2.405744291167633e-07, + "loss": 0.0, + "num_input_tokens_seen": 74801800, + "step": 128940 + }, + { + "epoch": 19.205391718796545, + "grad_norm": 0.0002479605027474463, + "learning_rate": 2.4012493250501476e-07, + "loss": 0.0, + "num_input_tokens_seen": 74804712, + "step": 128945 + }, + { + "epoch": 19.206136431337505, + "grad_norm": 3.392888538655825e-05, + "learning_rate": 2.3967585418619153e-07, + "loss": 0.0, + "num_input_tokens_seen": 74807720, + "step": 128950 + }, + { + "epoch": 19.206881143878462, + "grad_norm": 2.4635784939164296e-05, + "learning_rate": 2.392271941678792e-07, + "loss": 0.0, + "num_input_tokens_seen": 74810504, + "step": 128955 + }, + { + "epoch": 19.207625856419423, + "grad_norm": 6.485033736680634e-06, + "learning_rate": 2.387789524576578e-07, + "loss": 0.0, + "num_input_tokens_seen": 74813256, + "step": 128960 + }, + { + "epoch": 19.20837056896038, + "grad_norm": 3.4309005059185438e-06, + "learning_rate": 2.383311290630963e-07, + "loss": 0.0, + "num_input_tokens_seen": 74816104, + "step": 128965 + }, + { + "epoch": 19.20911528150134, + "grad_norm": 2.540319883337361e-06, + "learning_rate": 2.3788372399176638e-07, + "loss": 0.0, + "num_input_tokens_seen": 74819400, + "step": 128970 + }, + { + "epoch": 19.2098599940423, + "grad_norm": 2.4698908873688197e-06, + "learning_rate": 2.3743673725122318e-07, + "loss": 0.0, + "num_input_tokens_seen": 74822152, + "step": 128975 + }, + { + "epoch": 19.210604706583258, + "grad_norm": 2.991564542753622e-06, + "learning_rate": 2.3699016884901893e-07, + "loss": 0.0, + "num_input_tokens_seen": 74824968, + "step": 128980 + }, + { + "epoch": 19.21134941912422, + "grad_norm": 2.735818952714908e-06, + "learning_rate": 2.365440187926976e-07, + "loss": 0.0, + "num_input_tokens_seen": 74828136, + "step": 128985 + }, + { + "epoch": 19.21209413166518, + "grad_norm": 0.0003799860132858157, + "learning_rate": 2.3609828708979765e-07, + "loss": 0.0, + "num_input_tokens_seen": 74830952, + "step": 128990 + }, + { + "epoch": 19.212838844206136, + "grad_norm": 1.426644575985847e-05, + "learning_rate": 2.3565297374784635e-07, + "loss": 0.0, + "num_input_tokens_seen": 74833928, + "step": 128995 + }, + { + "epoch": 19.213583556747096, + "grad_norm": 2.8123060928919585e-06, + "learning_rate": 2.35208078774371e-07, + "loss": 0.0, + "num_input_tokens_seen": 74836968, + "step": 129000 + }, + { + "epoch": 19.214328269288053, + "grad_norm": 3.9207803638419136e-05, + "learning_rate": 2.3476360217688508e-07, + "loss": 0.0, + "num_input_tokens_seen": 74840296, + "step": 129005 + }, + { + "epoch": 19.215072981829014, + "grad_norm": 3.996581654064357e-06, + "learning_rate": 2.3431954396289645e-07, + "loss": 0.0, + "num_input_tokens_seen": 74843048, + "step": 129010 + }, + { + "epoch": 19.215817694369974, + "grad_norm": 3.7655063351849094e-06, + "learning_rate": 2.3387590413991022e-07, + "loss": 0.0, + "num_input_tokens_seen": 74846120, + "step": 129015 + }, + { + "epoch": 19.21656240691093, + "grad_norm": 2.618541202537017e-06, + "learning_rate": 2.3343268271541764e-07, + "loss": 0.0, + "num_input_tokens_seen": 74849064, + "step": 129020 + }, + { + "epoch": 19.217307119451892, + "grad_norm": 1.6719735867809504e-05, + "learning_rate": 2.329898796969099e-07, + "loss": 0.0, + "num_input_tokens_seen": 74851816, + "step": 129025 + }, + { + "epoch": 19.218051831992852, + "grad_norm": 6.237006346054841e-06, + "learning_rate": 2.3254749509186434e-07, + "loss": 0.0, + "num_input_tokens_seen": 74854696, + "step": 129030 + }, + { + "epoch": 19.21879654453381, + "grad_norm": 2.1393871065811254e-06, + "learning_rate": 2.321055289077584e-07, + "loss": 0.0, + "num_input_tokens_seen": 74857576, + "step": 129035 + }, + { + "epoch": 19.21954125707477, + "grad_norm": 1.8523446669860277e-06, + "learning_rate": 2.3166398115205545e-07, + "loss": 0.0, + "num_input_tokens_seen": 74860392, + "step": 129040 + }, + { + "epoch": 19.220285969615727, + "grad_norm": 3.3882333809742704e-05, + "learning_rate": 2.3122285183221627e-07, + "loss": 0.0, + "num_input_tokens_seen": 74863432, + "step": 129045 + }, + { + "epoch": 19.221030682156687, + "grad_norm": 4.118699507671408e-06, + "learning_rate": 2.3078214095569318e-07, + "loss": 0.0, + "num_input_tokens_seen": 74866472, + "step": 129050 + }, + { + "epoch": 19.221775394697648, + "grad_norm": 3.738529585461947e-06, + "learning_rate": 2.3034184852993025e-07, + "loss": 0.0, + "num_input_tokens_seen": 74869608, + "step": 129055 + }, + { + "epoch": 19.222520107238605, + "grad_norm": 3.636193696365808e-06, + "learning_rate": 2.2990197456236873e-07, + "loss": 0.0, + "num_input_tokens_seen": 74872264, + "step": 129060 + }, + { + "epoch": 19.223264819779565, + "grad_norm": 3.288845982751809e-05, + "learning_rate": 2.2946251906043604e-07, + "loss": 0.0, + "num_input_tokens_seen": 74874856, + "step": 129065 + }, + { + "epoch": 19.224009532320526, + "grad_norm": 4.621249445335707e-06, + "learning_rate": 2.2902348203155955e-07, + "loss": 0.0, + "num_input_tokens_seen": 74877768, + "step": 129070 + }, + { + "epoch": 19.224754244861483, + "grad_norm": 0.00010815123823704198, + "learning_rate": 2.2858486348315555e-07, + "loss": 0.0, + "num_input_tokens_seen": 74880840, + "step": 129075 + }, + { + "epoch": 19.225498957402444, + "grad_norm": 8.922111737774685e-05, + "learning_rate": 2.281466634226348e-07, + "loss": 0.0, + "num_input_tokens_seen": 74883848, + "step": 129080 + }, + { + "epoch": 19.2262436699434, + "grad_norm": 1.8632767933013383e-06, + "learning_rate": 2.277088818573969e-07, + "loss": 0.0, + "num_input_tokens_seen": 74886568, + "step": 129085 + }, + { + "epoch": 19.22698838248436, + "grad_norm": 1.934004558279412e-06, + "learning_rate": 2.2727151879484155e-07, + "loss": 0.0, + "num_input_tokens_seen": 74889320, + "step": 129090 + }, + { + "epoch": 19.22773309502532, + "grad_norm": 2.4572805159550626e-06, + "learning_rate": 2.2683457424235722e-07, + "loss": 0.0, + "num_input_tokens_seen": 74892104, + "step": 129095 + }, + { + "epoch": 19.22847780756628, + "grad_norm": 3.955884039896773e-06, + "learning_rate": 2.2639804820732135e-07, + "loss": 0.0, + "num_input_tokens_seen": 74895208, + "step": 129100 + }, + { + "epoch": 19.22922252010724, + "grad_norm": 1.8912121504399693e-06, + "learning_rate": 2.259619406971142e-07, + "loss": 0.0, + "num_input_tokens_seen": 74898280, + "step": 129105 + }, + { + "epoch": 19.229967232648196, + "grad_norm": 7.56376266508596e-06, + "learning_rate": 2.2552625171909925e-07, + "loss": 0.0, + "num_input_tokens_seen": 74901160, + "step": 129110 + }, + { + "epoch": 19.230711945189157, + "grad_norm": 5.518745183508145e-06, + "learning_rate": 2.250909812806401e-07, + "loss": 0.0, + "num_input_tokens_seen": 74904008, + "step": 129115 + }, + { + "epoch": 19.231456657730117, + "grad_norm": 2.662056203917018e-06, + "learning_rate": 2.246561293890892e-07, + "loss": 0.0, + "num_input_tokens_seen": 74906856, + "step": 129120 + }, + { + "epoch": 19.232201370271074, + "grad_norm": 3.731060814970988e-06, + "learning_rate": 2.2422169605178788e-07, + "loss": 0.0, + "num_input_tokens_seen": 74909576, + "step": 129125 + }, + { + "epoch": 19.232946082812035, + "grad_norm": 2.8971542178624077e-06, + "learning_rate": 2.2378768127608584e-07, + "loss": 0.0, + "num_input_tokens_seen": 74912776, + "step": 129130 + }, + { + "epoch": 19.233690795352995, + "grad_norm": 0.0005211894167587161, + "learning_rate": 2.23354085069305e-07, + "loss": 0.0, + "num_input_tokens_seen": 74915624, + "step": 129135 + }, + { + "epoch": 19.234435507893952, + "grad_norm": 3.0788455660513137e-06, + "learning_rate": 2.2292090743877836e-07, + "loss": 0.0, + "num_input_tokens_seen": 74918376, + "step": 129140 + }, + { + "epoch": 19.235180220434913, + "grad_norm": 9.459498869546223e-06, + "learning_rate": 2.2248814839181953e-07, + "loss": 0.0, + "num_input_tokens_seen": 74921320, + "step": 129145 + }, + { + "epoch": 19.23592493297587, + "grad_norm": 4.379526671982603e-06, + "learning_rate": 2.2205580793573932e-07, + "loss": 0.0, + "num_input_tokens_seen": 74924488, + "step": 129150 + }, + { + "epoch": 19.23666964551683, + "grad_norm": 2.8978604404983344e-06, + "learning_rate": 2.2162388607784578e-07, + "loss": 0.0, + "num_input_tokens_seen": 74927208, + "step": 129155 + }, + { + "epoch": 19.23741435805779, + "grad_norm": 5.756634436693275e-06, + "learning_rate": 2.2119238282543032e-07, + "loss": 0.0, + "num_input_tokens_seen": 74930056, + "step": 129160 + }, + { + "epoch": 19.238159070598748, + "grad_norm": 3.045949142688187e-06, + "learning_rate": 2.2076129818578706e-07, + "loss": 0.0, + "num_input_tokens_seen": 74933128, + "step": 129165 + }, + { + "epoch": 19.23890378313971, + "grad_norm": 9.927623978001066e-06, + "learning_rate": 2.203306321661963e-07, + "loss": 0.0, + "num_input_tokens_seen": 74936072, + "step": 129170 + }, + { + "epoch": 19.23964849568067, + "grad_norm": 3.5698503779713064e-06, + "learning_rate": 2.1990038477393559e-07, + "loss": 0.0, + "num_input_tokens_seen": 74938856, + "step": 129175 + }, + { + "epoch": 19.240393208221626, + "grad_norm": 2.3833699742681347e-06, + "learning_rate": 2.1947055601627132e-07, + "loss": 0.0, + "num_input_tokens_seen": 74941736, + "step": 129180 + }, + { + "epoch": 19.241137920762586, + "grad_norm": 2.3047055037750397e-06, + "learning_rate": 2.190411459004671e-07, + "loss": 0.0, + "num_input_tokens_seen": 74944552, + "step": 129185 + }, + { + "epoch": 19.241882633303543, + "grad_norm": 2.9733230348938378e-06, + "learning_rate": 2.1861215443377547e-07, + "loss": 0.0, + "num_input_tokens_seen": 74947336, + "step": 129190 + }, + { + "epoch": 19.242627345844504, + "grad_norm": 5.475579382618889e-05, + "learning_rate": 2.1818358162344622e-07, + "loss": 0.0, + "num_input_tokens_seen": 74950088, + "step": 129195 + }, + { + "epoch": 19.243372058385464, + "grad_norm": 7.628150342497975e-05, + "learning_rate": 2.1775542747671795e-07, + "loss": 0.0, + "num_input_tokens_seen": 74953064, + "step": 129200 + }, + { + "epoch": 19.24411677092642, + "grad_norm": 2.841193918357021e-06, + "learning_rate": 2.173276920008238e-07, + "loss": 0.0, + "num_input_tokens_seen": 74955880, + "step": 129205 + }, + { + "epoch": 19.244861483467382, + "grad_norm": 2.2509238988277502e-06, + "learning_rate": 2.1690037520299134e-07, + "loss": 0.0, + "num_input_tokens_seen": 74959048, + "step": 129210 + }, + { + "epoch": 19.245606196008342, + "grad_norm": 4.347009053162765e-06, + "learning_rate": 2.1647347709043696e-07, + "loss": 0.0, + "num_input_tokens_seen": 74961768, + "step": 129215 + }, + { + "epoch": 19.2463509085493, + "grad_norm": 0.0033134145196527243, + "learning_rate": 2.160469976703744e-07, + "loss": 0.0, + "num_input_tokens_seen": 74964712, + "step": 129220 + }, + { + "epoch": 19.24709562109026, + "grad_norm": 6.439865501306485e-06, + "learning_rate": 2.1562093695000897e-07, + "loss": 0.0, + "num_input_tokens_seen": 74967400, + "step": 129225 + }, + { + "epoch": 19.247840333631217, + "grad_norm": 1.8239330529468134e-05, + "learning_rate": 2.1519529493654045e-07, + "loss": 0.0, + "num_input_tokens_seen": 74970312, + "step": 129230 + }, + { + "epoch": 19.248585046172177, + "grad_norm": 4.788425030710641e-06, + "learning_rate": 2.14770071637152e-07, + "loss": 0.0, + "num_input_tokens_seen": 74973192, + "step": 129235 + }, + { + "epoch": 19.249329758713138, + "grad_norm": 6.902468157932162e-05, + "learning_rate": 2.143452670590379e-07, + "loss": 0.0, + "num_input_tokens_seen": 74976104, + "step": 129240 + }, + { + "epoch": 19.250074471254095, + "grad_norm": 2.6644955141819082e-05, + "learning_rate": 2.1392088120936737e-07, + "loss": 0.0, + "num_input_tokens_seen": 74979176, + "step": 129245 + }, + { + "epoch": 19.250819183795056, + "grad_norm": 8.669483941048384e-05, + "learning_rate": 2.1349691409530968e-07, + "loss": 0.0, + "num_input_tokens_seen": 74982120, + "step": 129250 + }, + { + "epoch": 19.251563896336016, + "grad_norm": 3.759604169317754e-06, + "learning_rate": 2.1307336572403415e-07, + "loss": 0.0, + "num_input_tokens_seen": 74984808, + "step": 129255 + }, + { + "epoch": 19.252308608876973, + "grad_norm": 6.642851076321676e-06, + "learning_rate": 2.1265023610268776e-07, + "loss": 0.0, + "num_input_tokens_seen": 74987752, + "step": 129260 + }, + { + "epoch": 19.253053321417934, + "grad_norm": 2.4009561911952915e-06, + "learning_rate": 2.1222752523842594e-07, + "loss": 0.0, + "num_input_tokens_seen": 74990312, + "step": 129265 + }, + { + "epoch": 19.25379803395889, + "grad_norm": 4.0212442399933934e-05, + "learning_rate": 2.1180523313838462e-07, + "loss": 0.0, + "num_input_tokens_seen": 74993352, + "step": 129270 + }, + { + "epoch": 19.25454274649985, + "grad_norm": 0.003305483143776655, + "learning_rate": 2.1138335980970258e-07, + "loss": 0.0, + "num_input_tokens_seen": 74996296, + "step": 129275 + }, + { + "epoch": 19.25528745904081, + "grad_norm": 7.551338967459742e-06, + "learning_rate": 2.1096190525950464e-07, + "loss": 0.0, + "num_input_tokens_seen": 74999016, + "step": 129280 + }, + { + "epoch": 19.25603217158177, + "grad_norm": 5.672622592101106e-06, + "learning_rate": 2.1054086949491013e-07, + "loss": 0.0, + "num_input_tokens_seen": 75001960, + "step": 129285 + }, + { + "epoch": 19.25677688412273, + "grad_norm": 8.062915003392845e-05, + "learning_rate": 2.101202525230328e-07, + "loss": 0.0, + "num_input_tokens_seen": 75004872, + "step": 129290 + }, + { + "epoch": 19.257521596663686, + "grad_norm": 4.3634036046569236e-06, + "learning_rate": 2.0970005435097807e-07, + "loss": 0.0, + "num_input_tokens_seen": 75007560, + "step": 129295 + }, + { + "epoch": 19.258266309204647, + "grad_norm": 1.2768542546837125e-05, + "learning_rate": 2.0928027498584579e-07, + "loss": 0.0, + "num_input_tokens_seen": 75010760, + "step": 129300 + }, + { + "epoch": 19.259011021745607, + "grad_norm": 1.2861110917583574e-05, + "learning_rate": 2.0886091443472477e-07, + "loss": 0.0, + "num_input_tokens_seen": 75013768, + "step": 129305 + }, + { + "epoch": 19.259755734286564, + "grad_norm": 7.113693300198065e-06, + "learning_rate": 2.084419727047038e-07, + "loss": 0.0, + "num_input_tokens_seen": 75016488, + "step": 129310 + }, + { + "epoch": 19.260500446827525, + "grad_norm": 2.504513304302236e-06, + "learning_rate": 2.0802344980285771e-07, + "loss": 0.0, + "num_input_tokens_seen": 75019528, + "step": 129315 + }, + { + "epoch": 19.261245159368485, + "grad_norm": 2.483533080521738e-06, + "learning_rate": 2.0760534573626144e-07, + "loss": 0.0, + "num_input_tokens_seen": 75022344, + "step": 129320 + }, + { + "epoch": 19.261989871909442, + "grad_norm": 8.659385457576718e-06, + "learning_rate": 2.0718766051197048e-07, + "loss": 0.0, + "num_input_tokens_seen": 75025384, + "step": 129325 + }, + { + "epoch": 19.262734584450403, + "grad_norm": 7.861452104407363e-06, + "learning_rate": 2.0677039413704857e-07, + "loss": 0.0, + "num_input_tokens_seen": 75028520, + "step": 129330 + }, + { + "epoch": 19.26347929699136, + "grad_norm": 4.40138364865561e-06, + "learning_rate": 2.063535466185429e-07, + "loss": 0.0, + "num_input_tokens_seen": 75031688, + "step": 129335 + }, + { + "epoch": 19.26422400953232, + "grad_norm": 2.256904281239258e-06, + "learning_rate": 2.0593711796349225e-07, + "loss": 0.0, + "num_input_tokens_seen": 75034568, + "step": 129340 + }, + { + "epoch": 19.26496872207328, + "grad_norm": 9.10474864213029e-06, + "learning_rate": 2.0552110817893544e-07, + "loss": 0.0, + "num_input_tokens_seen": 75037544, + "step": 129345 + }, + { + "epoch": 19.265713434614238, + "grad_norm": 2.029766619671136e-06, + "learning_rate": 2.051055172719002e-07, + "loss": 0.0, + "num_input_tokens_seen": 75040456, + "step": 129350 + }, + { + "epoch": 19.2664581471552, + "grad_norm": 5.827061613672413e-06, + "learning_rate": 2.0469034524940588e-07, + "loss": 0.0, + "num_input_tokens_seen": 75043240, + "step": 129355 + }, + { + "epoch": 19.26720285969616, + "grad_norm": 4.614900717569981e-06, + "learning_rate": 2.0427559211846915e-07, + "loss": 0.0, + "num_input_tokens_seen": 75046728, + "step": 129360 + }, + { + "epoch": 19.267947572237116, + "grad_norm": 6.948321697564097e-06, + "learning_rate": 2.0386125788609266e-07, + "loss": 0.0, + "num_input_tokens_seen": 75049480, + "step": 129365 + }, + { + "epoch": 19.268692284778076, + "grad_norm": 6.328168183244998e-06, + "learning_rate": 2.034473425592792e-07, + "loss": 0.0, + "num_input_tokens_seen": 75052360, + "step": 129370 + }, + { + "epoch": 19.269436997319033, + "grad_norm": 6.0372385632945225e-06, + "learning_rate": 2.0303384614502042e-07, + "loss": 0.0, + "num_input_tokens_seen": 75055304, + "step": 129375 + }, + { + "epoch": 19.270181709859994, + "grad_norm": 2.5592809834051877e-05, + "learning_rate": 2.0262076865030232e-07, + "loss": 0.0, + "num_input_tokens_seen": 75058248, + "step": 129380 + }, + { + "epoch": 19.270926422400954, + "grad_norm": 3.233969982829876e-05, + "learning_rate": 2.022081100821055e-07, + "loss": 0.0, + "num_input_tokens_seen": 75061480, + "step": 129385 + }, + { + "epoch": 19.27167113494191, + "grad_norm": 3.941450813726988e-06, + "learning_rate": 2.0179587044739655e-07, + "loss": 0.0, + "num_input_tokens_seen": 75064136, + "step": 129390 + }, + { + "epoch": 19.272415847482872, + "grad_norm": 4.434413767739898e-06, + "learning_rate": 2.0138404975314495e-07, + "loss": 0.0, + "num_input_tokens_seen": 75066952, + "step": 129395 + }, + { + "epoch": 19.273160560023832, + "grad_norm": 1.0387529073341284e-05, + "learning_rate": 2.0097264800630344e-07, + "loss": 0.0, + "num_input_tokens_seen": 75070120, + "step": 129400 + }, + { + "epoch": 19.27390527256479, + "grad_norm": 6.245048552955268e-06, + "learning_rate": 2.0056166521382759e-07, + "loss": 0.0, + "num_input_tokens_seen": 75073000, + "step": 129405 + }, + { + "epoch": 19.27464998510575, + "grad_norm": 2.885154572140891e-06, + "learning_rate": 2.0015110138265624e-07, + "loss": 0.0, + "num_input_tokens_seen": 75075784, + "step": 129410 + }, + { + "epoch": 19.275394697646707, + "grad_norm": 2.5511135390843265e-06, + "learning_rate": 1.997409565197228e-07, + "loss": 0.0, + "num_input_tokens_seen": 75078248, + "step": 129415 + }, + { + "epoch": 19.276139410187668, + "grad_norm": 4.425608494784683e-06, + "learning_rate": 1.9933123063196335e-07, + "loss": 0.0, + "num_input_tokens_seen": 75081128, + "step": 129420 + }, + { + "epoch": 19.276884122728628, + "grad_norm": 4.461712160264142e-05, + "learning_rate": 1.9892192372629737e-07, + "loss": 0.0, + "num_input_tokens_seen": 75084200, + "step": 129425 + }, + { + "epoch": 19.277628835269585, + "grad_norm": 1.5850202544243075e-05, + "learning_rate": 1.9851303580963599e-07, + "loss": 0.0, + "num_input_tokens_seen": 75086824, + "step": 129430 + }, + { + "epoch": 19.278373547810546, + "grad_norm": 3.0963938115746714e-06, + "learning_rate": 1.9810456688889313e-07, + "loss": 0.0, + "num_input_tokens_seen": 75089544, + "step": 129435 + }, + { + "epoch": 19.279118260351503, + "grad_norm": 9.457586202188395e-06, + "learning_rate": 1.9769651697096326e-07, + "loss": 0.0, + "num_input_tokens_seen": 75092808, + "step": 129440 + }, + { + "epoch": 19.279862972892463, + "grad_norm": 1.3978501556266565e-05, + "learning_rate": 1.9728888606274365e-07, + "loss": 0.0, + "num_input_tokens_seen": 75095688, + "step": 129445 + }, + { + "epoch": 19.280607685433424, + "grad_norm": 5.196179245103849e-06, + "learning_rate": 1.9688167417112047e-07, + "loss": 0.004, + "num_input_tokens_seen": 75098504, + "step": 129450 + }, + { + "epoch": 19.28135239797438, + "grad_norm": 3.3408762192266295e-06, + "learning_rate": 1.9647488130297154e-07, + "loss": 0.0, + "num_input_tokens_seen": 75101704, + "step": 129455 + }, + { + "epoch": 19.28209711051534, + "grad_norm": 8.071382580965292e-06, + "learning_rate": 1.960685074651719e-07, + "loss": 0.0, + "num_input_tokens_seen": 75104616, + "step": 129460 + }, + { + "epoch": 19.2828418230563, + "grad_norm": 1.3135411791154183e-05, + "learning_rate": 1.9566255266458278e-07, + "loss": 0.0, + "num_input_tokens_seen": 75107560, + "step": 129465 + }, + { + "epoch": 19.28358653559726, + "grad_norm": 2.9578798148577334e-06, + "learning_rate": 1.9525701690806807e-07, + "loss": 0.0, + "num_input_tokens_seen": 75110376, + "step": 129470 + }, + { + "epoch": 19.28433124813822, + "grad_norm": 4.639067810785491e-06, + "learning_rate": 1.948519002024751e-07, + "loss": 0.0, + "num_input_tokens_seen": 75113352, + "step": 129475 + }, + { + "epoch": 19.285075960679176, + "grad_norm": 5.4936459491727874e-05, + "learning_rate": 1.9444720255464844e-07, + "loss": 0.0, + "num_input_tokens_seen": 75116264, + "step": 129480 + }, + { + "epoch": 19.285820673220137, + "grad_norm": 0.00019652668561320752, + "learning_rate": 1.940429239714242e-07, + "loss": 0.0, + "num_input_tokens_seen": 75119176, + "step": 129485 + }, + { + "epoch": 19.286565385761097, + "grad_norm": 2.3903480723674875e-06, + "learning_rate": 1.936390644596303e-07, + "loss": 0.0, + "num_input_tokens_seen": 75122248, + "step": 129490 + }, + { + "epoch": 19.287310098302054, + "grad_norm": 3.4308875456190435e-06, + "learning_rate": 1.932356240260974e-07, + "loss": 0.0, + "num_input_tokens_seen": 75125000, + "step": 129495 + }, + { + "epoch": 19.288054810843015, + "grad_norm": 1.8877490219892934e-06, + "learning_rate": 1.9283260267763115e-07, + "loss": 0.0, + "num_input_tokens_seen": 75127720, + "step": 129500 + }, + { + "epoch": 19.288799523383975, + "grad_norm": 1.722315209917724e-05, + "learning_rate": 1.924300004210483e-07, + "loss": 0.0, + "num_input_tokens_seen": 75130440, + "step": 129505 + }, + { + "epoch": 19.289544235924932, + "grad_norm": 2.1460102743731113e-06, + "learning_rate": 1.9202781726314622e-07, + "loss": 0.0, + "num_input_tokens_seen": 75133256, + "step": 129510 + }, + { + "epoch": 19.290288948465893, + "grad_norm": 4.846765023103217e-06, + "learning_rate": 1.9162605321072224e-07, + "loss": 0.0, + "num_input_tokens_seen": 75135912, + "step": 129515 + }, + { + "epoch": 19.29103366100685, + "grad_norm": 5.659354883391643e-06, + "learning_rate": 1.9122470827055984e-07, + "loss": 0.0, + "num_input_tokens_seen": 75138952, + "step": 129520 + }, + { + "epoch": 19.29177837354781, + "grad_norm": 2.394032662778045e-06, + "learning_rate": 1.9082378244944242e-07, + "loss": 0.0, + "num_input_tokens_seen": 75141768, + "step": 129525 + }, + { + "epoch": 19.29252308608877, + "grad_norm": 2.0886456695734523e-05, + "learning_rate": 1.9042327575414242e-07, + "loss": 0.0, + "num_input_tokens_seen": 75144520, + "step": 129530 + }, + { + "epoch": 19.293267798629728, + "grad_norm": 2.2769745555706322e-06, + "learning_rate": 1.9002318819142661e-07, + "loss": 0.0, + "num_input_tokens_seen": 75147240, + "step": 129535 + }, + { + "epoch": 19.29401251117069, + "grad_norm": 2.1706415282096714e-05, + "learning_rate": 1.8962351976805348e-07, + "loss": 0.0, + "num_input_tokens_seen": 75150312, + "step": 129540 + }, + { + "epoch": 19.29475722371165, + "grad_norm": 2.299819016116089e-06, + "learning_rate": 1.892242704907732e-07, + "loss": 0.0, + "num_input_tokens_seen": 75153128, + "step": 129545 + }, + { + "epoch": 19.295501936252606, + "grad_norm": 7.316541541513288e-06, + "learning_rate": 1.888254403663331e-07, + "loss": 0.0, + "num_input_tokens_seen": 75156200, + "step": 129550 + }, + { + "epoch": 19.296246648793566, + "grad_norm": 1.5142276424739975e-05, + "learning_rate": 1.884270294014695e-07, + "loss": 0.0, + "num_input_tokens_seen": 75159208, + "step": 129555 + }, + { + "epoch": 19.296991361334523, + "grad_norm": 6.620295607717708e-06, + "learning_rate": 1.880290376029159e-07, + "loss": 0.0, + "num_input_tokens_seen": 75161928, + "step": 129560 + }, + { + "epoch": 19.297736073875484, + "grad_norm": 3.2527625535294646e-06, + "learning_rate": 1.8763146497739194e-07, + "loss": 0.0, + "num_input_tokens_seen": 75164776, + "step": 129565 + }, + { + "epoch": 19.298480786416444, + "grad_norm": 1.8986181657965062e-06, + "learning_rate": 1.872343115316144e-07, + "loss": 0.0, + "num_input_tokens_seen": 75167688, + "step": 129570 + }, + { + "epoch": 19.2992254989574, + "grad_norm": 1.8563627236289904e-05, + "learning_rate": 1.8683757727229745e-07, + "loss": 0.0, + "num_input_tokens_seen": 75170120, + "step": 129575 + }, + { + "epoch": 19.299970211498362, + "grad_norm": 1.969262939383043e-06, + "learning_rate": 1.864412622061412e-07, + "loss": 0.0, + "num_input_tokens_seen": 75173096, + "step": 129580 + }, + { + "epoch": 19.300714924039323, + "grad_norm": 1.0041124369308818e-05, + "learning_rate": 1.8604536633984037e-07, + "loss": 0.0, + "num_input_tokens_seen": 75175976, + "step": 129585 + }, + { + "epoch": 19.30145963658028, + "grad_norm": 0.0032798147294670343, + "learning_rate": 1.8564988968008124e-07, + "loss": 0.0, + "num_input_tokens_seen": 75178952, + "step": 129590 + }, + { + "epoch": 19.30220434912124, + "grad_norm": 6.6139118644059636e-06, + "learning_rate": 1.8525483223354734e-07, + "loss": 0.0, + "num_input_tokens_seen": 75181832, + "step": 129595 + }, + { + "epoch": 19.302949061662197, + "grad_norm": 4.090232323505916e-06, + "learning_rate": 1.848601940069139e-07, + "loss": 0.0, + "num_input_tokens_seen": 75184680, + "step": 129600 + }, + { + "epoch": 19.303693774203158, + "grad_norm": 3.544017545209499e-06, + "learning_rate": 1.8446597500684503e-07, + "loss": 0.0, + "num_input_tokens_seen": 75187496, + "step": 129605 + }, + { + "epoch": 19.304438486744118, + "grad_norm": 2.2917083697393537e-06, + "learning_rate": 1.8407217524000486e-07, + "loss": 0.0, + "num_input_tokens_seen": 75190216, + "step": 129610 + }, + { + "epoch": 19.305183199285075, + "grad_norm": 6.890932581882225e-06, + "learning_rate": 1.8367879471304084e-07, + "loss": 0.0, + "num_input_tokens_seen": 75193128, + "step": 129615 + }, + { + "epoch": 19.305927911826036, + "grad_norm": 5.8402447393746115e-06, + "learning_rate": 1.832858334326032e-07, + "loss": 0.0, + "num_input_tokens_seen": 75196232, + "step": 129620 + }, + { + "epoch": 19.306672624366993, + "grad_norm": 2.4534911062801257e-06, + "learning_rate": 1.828932914053255e-07, + "loss": 0.0012, + "num_input_tokens_seen": 75198952, + "step": 129625 + }, + { + "epoch": 19.307417336907953, + "grad_norm": 0.00015433186490554363, + "learning_rate": 1.8250116863784694e-07, + "loss": 0.0, + "num_input_tokens_seen": 75201768, + "step": 129630 + }, + { + "epoch": 19.308162049448914, + "grad_norm": 9.135912478086539e-06, + "learning_rate": 1.8210946513678439e-07, + "loss": 0.0, + "num_input_tokens_seen": 75204456, + "step": 129635 + }, + { + "epoch": 19.30890676198987, + "grad_norm": 2.2458678358816542e-05, + "learning_rate": 1.8171818090876037e-07, + "loss": 0.0, + "num_input_tokens_seen": 75207272, + "step": 129640 + }, + { + "epoch": 19.30965147453083, + "grad_norm": 2.7444350507721538e-06, + "learning_rate": 1.8132731596038345e-07, + "loss": 0.0, + "num_input_tokens_seen": 75210088, + "step": 129645 + }, + { + "epoch": 19.31039618707179, + "grad_norm": 3.93610162063851e-06, + "learning_rate": 1.8093687029825666e-07, + "loss": 0.0, + "num_input_tokens_seen": 75212744, + "step": 129650 + }, + { + "epoch": 19.31114089961275, + "grad_norm": 2.53580265052733e-06, + "learning_rate": 1.8054684392897758e-07, + "loss": 0.0, + "num_input_tokens_seen": 75215752, + "step": 129655 + }, + { + "epoch": 19.31188561215371, + "grad_norm": 2.7905844035558403e-06, + "learning_rate": 1.8015723685913255e-07, + "loss": 0.0, + "num_input_tokens_seen": 75218728, + "step": 129660 + }, + { + "epoch": 19.312630324694666, + "grad_norm": 0.0014891676837578416, + "learning_rate": 1.797680490953052e-07, + "loss": 0.0, + "num_input_tokens_seen": 75221384, + "step": 129665 + }, + { + "epoch": 19.313375037235627, + "grad_norm": 6.726789251843002e-06, + "learning_rate": 1.7937928064407085e-07, + "loss": 0.0, + "num_input_tokens_seen": 75224232, + "step": 129670 + }, + { + "epoch": 19.314119749776587, + "grad_norm": 2.9469597393472213e-06, + "learning_rate": 1.7899093151199643e-07, + "loss": 0.0, + "num_input_tokens_seen": 75227400, + "step": 129675 + }, + { + "epoch": 19.314864462317544, + "grad_norm": 8.239517774200067e-05, + "learning_rate": 1.7860300170564613e-07, + "loss": 0.0, + "num_input_tokens_seen": 75230504, + "step": 129680 + }, + { + "epoch": 19.315609174858505, + "grad_norm": 6.104781732574338e-06, + "learning_rate": 1.7821549123156755e-07, + "loss": 0.0, + "num_input_tokens_seen": 75233192, + "step": 129685 + }, + { + "epoch": 19.316353887399465, + "grad_norm": 6.913203378644539e-06, + "learning_rate": 1.7782840009631375e-07, + "loss": 0.0, + "num_input_tokens_seen": 75236168, + "step": 129690 + }, + { + "epoch": 19.317098599940422, + "grad_norm": 2.095859144901624e-06, + "learning_rate": 1.7744172830641835e-07, + "loss": 0.0, + "num_input_tokens_seen": 75238728, + "step": 129695 + }, + { + "epoch": 19.317843312481383, + "grad_norm": 0.08252038061618805, + "learning_rate": 1.7705547586841785e-07, + "loss": 0.0001, + "num_input_tokens_seen": 75241832, + "step": 129700 + }, + { + "epoch": 19.31858802502234, + "grad_norm": 1.993671503441874e-05, + "learning_rate": 1.7666964278883202e-07, + "loss": 0.0, + "num_input_tokens_seen": 75244520, + "step": 129705 + }, + { + "epoch": 19.3193327375633, + "grad_norm": 0.0009103096672333777, + "learning_rate": 1.7628422907418894e-07, + "loss": 0.0, + "num_input_tokens_seen": 75247368, + "step": 129710 + }, + { + "epoch": 19.32007745010426, + "grad_norm": 2.2341539533954347e-06, + "learning_rate": 1.7589923473098902e-07, + "loss": 0.0, + "num_input_tokens_seen": 75250152, + "step": 129715 + }, + { + "epoch": 19.320822162645218, + "grad_norm": 3.4712255001068115e-05, + "learning_rate": 1.7551465976574643e-07, + "loss": 0.0, + "num_input_tokens_seen": 75253000, + "step": 129720 + }, + { + "epoch": 19.32156687518618, + "grad_norm": 1.7368955013807863e-05, + "learning_rate": 1.7513050418495047e-07, + "loss": 0.0, + "num_input_tokens_seen": 75255944, + "step": 129725 + }, + { + "epoch": 19.32231158772714, + "grad_norm": 7.156087121984456e-06, + "learning_rate": 1.7474676799509314e-07, + "loss": 0.0, + "num_input_tokens_seen": 75258792, + "step": 129730 + }, + { + "epoch": 19.323056300268096, + "grad_norm": 1.1924955288122874e-05, + "learning_rate": 1.7436345120266095e-07, + "loss": 0.0, + "num_input_tokens_seen": 75261512, + "step": 129735 + }, + { + "epoch": 19.323801012809056, + "grad_norm": 0.004678468219935894, + "learning_rate": 1.739805538141237e-07, + "loss": 0.0, + "num_input_tokens_seen": 75264392, + "step": 129740 + }, + { + "epoch": 19.324545725350013, + "grad_norm": 2.7331777801009594e-06, + "learning_rate": 1.73598075835954e-07, + "loss": 0.0, + "num_input_tokens_seen": 75267176, + "step": 129745 + }, + { + "epoch": 19.325290437890974, + "grad_norm": 3.5474795367917977e-06, + "learning_rate": 1.7321601727461334e-07, + "loss": 0.0, + "num_input_tokens_seen": 75270056, + "step": 129750 + }, + { + "epoch": 19.326035150431935, + "grad_norm": 2.440357775412849e-06, + "learning_rate": 1.7283437813655489e-07, + "loss": 0.0, + "num_input_tokens_seen": 75272776, + "step": 129755 + }, + { + "epoch": 19.32677986297289, + "grad_norm": 2.374516952841077e-05, + "learning_rate": 1.7245315842822352e-07, + "loss": 0.0, + "num_input_tokens_seen": 75275656, + "step": 129760 + }, + { + "epoch": 19.327524575513852, + "grad_norm": 2.2726583210896933e-06, + "learning_rate": 1.720723581560668e-07, + "loss": 0.0, + "num_input_tokens_seen": 75278984, + "step": 129765 + }, + { + "epoch": 19.328269288054813, + "grad_norm": 3.482098009044421e-06, + "learning_rate": 1.716919773265102e-07, + "loss": 0.0, + "num_input_tokens_seen": 75282312, + "step": 129770 + }, + { + "epoch": 19.32901400059577, + "grad_norm": 3.5239665976405377e-06, + "learning_rate": 1.7131201594598468e-07, + "loss": 0.0, + "num_input_tokens_seen": 75285256, + "step": 129775 + }, + { + "epoch": 19.32975871313673, + "grad_norm": 5.702667294826824e-06, + "learning_rate": 1.709324740209073e-07, + "loss": 0.0, + "num_input_tokens_seen": 75288008, + "step": 129780 + }, + { + "epoch": 19.330503425677687, + "grad_norm": 4.593988251144765e-06, + "learning_rate": 1.7055335155769238e-07, + "loss": 0.0, + "num_input_tokens_seen": 75290888, + "step": 129785 + }, + { + "epoch": 19.331248138218648, + "grad_norm": 1.8396407313048257e-06, + "learning_rate": 1.7017464856274033e-07, + "loss": 0.0, + "num_input_tokens_seen": 75293736, + "step": 129790 + }, + { + "epoch": 19.331992850759608, + "grad_norm": 4.797984274773626e-06, + "learning_rate": 1.6979636504245445e-07, + "loss": 0.0, + "num_input_tokens_seen": 75296680, + "step": 129795 + }, + { + "epoch": 19.332737563300565, + "grad_norm": 3.8091070564405527e-06, + "learning_rate": 1.6941850100322122e-07, + "loss": 0.0, + "num_input_tokens_seen": 75299720, + "step": 129800 + }, + { + "epoch": 19.333482275841526, + "grad_norm": 6.273978215176612e-05, + "learning_rate": 1.6904105645142444e-07, + "loss": 0.0, + "num_input_tokens_seen": 75302472, + "step": 129805 + }, + { + "epoch": 19.334226988382483, + "grad_norm": 6.540183676406741e-06, + "learning_rate": 1.686640313934451e-07, + "loss": 0.0, + "num_input_tokens_seen": 75305512, + "step": 129810 + }, + { + "epoch": 19.334971700923443, + "grad_norm": 2.870155640266603e-06, + "learning_rate": 1.6828742583564762e-07, + "loss": 0.0, + "num_input_tokens_seen": 75308296, + "step": 129815 + }, + { + "epoch": 19.335716413464404, + "grad_norm": 9.06348395801615e-06, + "learning_rate": 1.6791123978439626e-07, + "loss": 0.0, + "num_input_tokens_seen": 75311272, + "step": 129820 + }, + { + "epoch": 19.33646112600536, + "grad_norm": 4.172309218120063e-06, + "learning_rate": 1.6753547324604713e-07, + "loss": 0.0, + "num_input_tokens_seen": 75314120, + "step": 129825 + }, + { + "epoch": 19.33720583854632, + "grad_norm": 4.316451395425247e-06, + "learning_rate": 1.671601262269451e-07, + "loss": 0.0, + "num_input_tokens_seen": 75317160, + "step": 129830 + }, + { + "epoch": 19.337950551087282, + "grad_norm": 1.9562588931876235e-05, + "learning_rate": 1.6678519873343789e-07, + "loss": 0.0, + "num_input_tokens_seen": 75320104, + "step": 129835 + }, + { + "epoch": 19.33869526362824, + "grad_norm": 6.8210551944503095e-06, + "learning_rate": 1.66410690771851e-07, + "loss": 0.0003, + "num_input_tokens_seen": 75323048, + "step": 129840 + }, + { + "epoch": 19.3394399761692, + "grad_norm": 3.658919013105333e-06, + "learning_rate": 1.6603660234851825e-07, + "loss": 0.0, + "num_input_tokens_seen": 75325512, + "step": 129845 + }, + { + "epoch": 19.340184688710156, + "grad_norm": 2.069206857413519e-06, + "learning_rate": 1.656629334697568e-07, + "loss": 0.0, + "num_input_tokens_seen": 75328392, + "step": 129850 + }, + { + "epoch": 19.340929401251117, + "grad_norm": 1.425041955371853e-05, + "learning_rate": 1.6528968414188107e-07, + "loss": 0.0, + "num_input_tokens_seen": 75331272, + "step": 129855 + }, + { + "epoch": 19.341674113792077, + "grad_norm": 2.445891823299462e-06, + "learning_rate": 1.6491685437119154e-07, + "loss": 0.0, + "num_input_tokens_seen": 75333896, + "step": 129860 + }, + { + "epoch": 19.342418826333034, + "grad_norm": 1.5651481589884497e-05, + "learning_rate": 1.6454444416399428e-07, + "loss": 0.0, + "num_input_tokens_seen": 75336872, + "step": 129865 + }, + { + "epoch": 19.343163538873995, + "grad_norm": 9.165464143734425e-05, + "learning_rate": 1.6417245352657317e-07, + "loss": 0.0, + "num_input_tokens_seen": 75339560, + "step": 129870 + }, + { + "epoch": 19.343908251414955, + "grad_norm": 3.6404485399543773e-06, + "learning_rate": 1.638008824652204e-07, + "loss": 0.0, + "num_input_tokens_seen": 75342440, + "step": 129875 + }, + { + "epoch": 19.344652963955912, + "grad_norm": 3.1761817353981314e-06, + "learning_rate": 1.6342973098620872e-07, + "loss": 0.0, + "num_input_tokens_seen": 75345288, + "step": 129880 + }, + { + "epoch": 19.345397676496873, + "grad_norm": 4.835407253267476e-06, + "learning_rate": 1.6305899909580814e-07, + "loss": 0.0, + "num_input_tokens_seen": 75348552, + "step": 129885 + }, + { + "epoch": 19.34614238903783, + "grad_norm": 2.8745291729137534e-06, + "learning_rate": 1.6268868680028026e-07, + "loss": 0.0, + "num_input_tokens_seen": 75351304, + "step": 129890 + }, + { + "epoch": 19.34688710157879, + "grad_norm": 3.951678081648424e-06, + "learning_rate": 1.623187941058868e-07, + "loss": 0.0, + "num_input_tokens_seen": 75354024, + "step": 129895 + }, + { + "epoch": 19.34763181411975, + "grad_norm": 3.7551881177932955e-06, + "learning_rate": 1.6194932101886995e-07, + "loss": 0.0, + "num_input_tokens_seen": 75357192, + "step": 129900 + }, + { + "epoch": 19.348376526660708, + "grad_norm": 5.476408659887966e-06, + "learning_rate": 1.615802675454775e-07, + "loss": 0.0009, + "num_input_tokens_seen": 75360872, + "step": 129905 + }, + { + "epoch": 19.34912123920167, + "grad_norm": 4.6260731323855e-05, + "learning_rate": 1.6121163369194335e-07, + "loss": 0.0, + "num_input_tokens_seen": 75363816, + "step": 129910 + }, + { + "epoch": 19.34986595174263, + "grad_norm": 1.485723078076262e-05, + "learning_rate": 1.6084341946449033e-07, + "loss": 0.0, + "num_input_tokens_seen": 75366696, + "step": 129915 + }, + { + "epoch": 19.350610664283586, + "grad_norm": 1.09223292383831e-05, + "learning_rate": 1.6047562486934398e-07, + "loss": 0.0, + "num_input_tokens_seen": 75369576, + "step": 129920 + }, + { + "epoch": 19.351355376824547, + "grad_norm": 0.00014637071581091732, + "learning_rate": 1.60108249912716e-07, + "loss": 0.0, + "num_input_tokens_seen": 75372712, + "step": 129925 + }, + { + "epoch": 19.352100089365504, + "grad_norm": 5.645372311846586e-06, + "learning_rate": 1.5974129460081255e-07, + "loss": 0.0, + "num_input_tokens_seen": 75375656, + "step": 129930 + }, + { + "epoch": 19.352844801906464, + "grad_norm": 0.00011928391904802993, + "learning_rate": 1.5937475893983423e-07, + "loss": 0.0, + "num_input_tokens_seen": 75378600, + "step": 129935 + }, + { + "epoch": 19.353589514447425, + "grad_norm": 1.1650214219116606e-05, + "learning_rate": 1.5900864293597328e-07, + "loss": 0.0, + "num_input_tokens_seen": 75381384, + "step": 129940 + }, + { + "epoch": 19.35433422698838, + "grad_norm": 4.240636553731747e-05, + "learning_rate": 1.5864294659541367e-07, + "loss": 0.0, + "num_input_tokens_seen": 75384104, + "step": 129945 + }, + { + "epoch": 19.355078939529342, + "grad_norm": 2.317825419595465e-06, + "learning_rate": 1.5827766992433378e-07, + "loss": 0.0, + "num_input_tokens_seen": 75386792, + "step": 129950 + }, + { + "epoch": 19.3558236520703, + "grad_norm": 2.1357320747483755e-06, + "learning_rate": 1.5791281292890093e-07, + "loss": 0.0, + "num_input_tokens_seen": 75389864, + "step": 129955 + }, + { + "epoch": 19.35656836461126, + "grad_norm": 1.6798290971564711e-06, + "learning_rate": 1.575483756152879e-07, + "loss": 0.0, + "num_input_tokens_seen": 75392904, + "step": 129960 + }, + { + "epoch": 19.35731307715222, + "grad_norm": 5.297439201967791e-06, + "learning_rate": 1.5718435798964538e-07, + "loss": 0.0, + "num_input_tokens_seen": 75395784, + "step": 129965 + }, + { + "epoch": 19.358057789693177, + "grad_norm": 0.0002086405293084681, + "learning_rate": 1.5682076005812118e-07, + "loss": 0.0, + "num_input_tokens_seen": 75398824, + "step": 129970 + }, + { + "epoch": 19.358802502234138, + "grad_norm": 2.879306975955842e-06, + "learning_rate": 1.564575818268632e-07, + "loss": 0.0, + "num_input_tokens_seen": 75401480, + "step": 129975 + }, + { + "epoch": 19.359547214775098, + "grad_norm": 1.0387730981165078e-05, + "learning_rate": 1.5609482330200265e-07, + "loss": 0.0, + "num_input_tokens_seen": 75404456, + "step": 129980 + }, + { + "epoch": 19.360291927316055, + "grad_norm": 1.7282295630138833e-06, + "learning_rate": 1.5573248448967072e-07, + "loss": 0.0, + "num_input_tokens_seen": 75407336, + "step": 129985 + }, + { + "epoch": 19.361036639857016, + "grad_norm": 1.1287858797004446e-05, + "learning_rate": 1.5537056539598748e-07, + "loss": 0.0, + "num_input_tokens_seen": 75410440, + "step": 129990 + }, + { + "epoch": 19.361781352397973, + "grad_norm": 0.00023023862740956247, + "learning_rate": 1.5500906602706756e-07, + "loss": 0.0, + "num_input_tokens_seen": 75413384, + "step": 129995 + }, + { + "epoch": 19.362526064938933, + "grad_norm": 3.7726904338342138e-06, + "learning_rate": 1.546479863890199e-07, + "loss": 0.0, + "num_input_tokens_seen": 75416296, + "step": 130000 + }, + { + "epoch": 19.363270777479894, + "grad_norm": 5.039437837695004e-06, + "learning_rate": 1.542873264879424e-07, + "loss": 0.0, + "num_input_tokens_seen": 75419112, + "step": 130005 + }, + { + "epoch": 19.36401549002085, + "grad_norm": 4.7703520067443606e-06, + "learning_rate": 1.5392708632992748e-07, + "loss": 0.0, + "num_input_tokens_seen": 75422280, + "step": 130010 + }, + { + "epoch": 19.36476020256181, + "grad_norm": 0.00018459834973327816, + "learning_rate": 1.5356726592106185e-07, + "loss": 0.0, + "num_input_tokens_seen": 75425544, + "step": 130015 + }, + { + "epoch": 19.365504915102772, + "grad_norm": 3.122895577689633e-05, + "learning_rate": 1.5320786526742682e-07, + "loss": 0.0, + "num_input_tokens_seen": 75428424, + "step": 130020 + }, + { + "epoch": 19.36624962764373, + "grad_norm": 3.468941486062249e-06, + "learning_rate": 1.5284888437508972e-07, + "loss": 0.0, + "num_input_tokens_seen": 75431304, + "step": 130025 + }, + { + "epoch": 19.36699434018469, + "grad_norm": 3.0277178666437976e-06, + "learning_rate": 1.5249032325011514e-07, + "loss": 0.0, + "num_input_tokens_seen": 75434184, + "step": 130030 + }, + { + "epoch": 19.367739052725646, + "grad_norm": 5.2101881919952575e-06, + "learning_rate": 1.5213218189856492e-07, + "loss": 0.0, + "num_input_tokens_seen": 75436936, + "step": 130035 + }, + { + "epoch": 19.368483765266607, + "grad_norm": 3.3267904200329212e-06, + "learning_rate": 1.5177446032648702e-07, + "loss": 0.0, + "num_input_tokens_seen": 75440168, + "step": 130040 + }, + { + "epoch": 19.369228477807567, + "grad_norm": 3.4865238376369234e-06, + "learning_rate": 1.5141715853992654e-07, + "loss": 0.0, + "num_input_tokens_seen": 75442760, + "step": 130045 + }, + { + "epoch": 19.369973190348524, + "grad_norm": 8.366065230802633e-06, + "learning_rate": 1.510602765449176e-07, + "loss": 0.0, + "num_input_tokens_seen": 75446024, + "step": 130050 + }, + { + "epoch": 19.370717902889485, + "grad_norm": 4.910278676106827e-06, + "learning_rate": 1.507038143474887e-07, + "loss": 0.0, + "num_input_tokens_seen": 75448712, + "step": 130055 + }, + { + "epoch": 19.371462615430445, + "grad_norm": 2.1562830170296365e-06, + "learning_rate": 1.5034777195366278e-07, + "loss": 0.0, + "num_input_tokens_seen": 75451464, + "step": 130060 + }, + { + "epoch": 19.372207327971402, + "grad_norm": 0.00014988044858910143, + "learning_rate": 1.4999214936945726e-07, + "loss": 0.0, + "num_input_tokens_seen": 75454504, + "step": 130065 + }, + { + "epoch": 19.372952040512363, + "grad_norm": 7.504713721573353e-06, + "learning_rate": 1.496369466008757e-07, + "loss": 0.0, + "num_input_tokens_seen": 75457640, + "step": 130070 + }, + { + "epoch": 19.37369675305332, + "grad_norm": 2.1048147118563065e-06, + "learning_rate": 1.4928216365392157e-07, + "loss": 0.0, + "num_input_tokens_seen": 75460584, + "step": 130075 + }, + { + "epoch": 19.37444146559428, + "grad_norm": 2.979758619403583e-06, + "learning_rate": 1.489278005345901e-07, + "loss": 0.0, + "num_input_tokens_seen": 75463240, + "step": 130080 + }, + { + "epoch": 19.37518617813524, + "grad_norm": 3.5261713492218405e-05, + "learning_rate": 1.485738572488654e-07, + "loss": 0.0, + "num_input_tokens_seen": 75466376, + "step": 130085 + }, + { + "epoch": 19.375930890676198, + "grad_norm": 3.2420655315945623e-06, + "learning_rate": 1.4822033380272603e-07, + "loss": 0.0, + "num_input_tokens_seen": 75469160, + "step": 130090 + }, + { + "epoch": 19.37667560321716, + "grad_norm": 1.1815334801212884e-05, + "learning_rate": 1.47867230202145e-07, + "loss": 0.0, + "num_input_tokens_seen": 75471720, + "step": 130095 + }, + { + "epoch": 19.37742031575812, + "grad_norm": 3.2112518510984955e-06, + "learning_rate": 1.4751454645309248e-07, + "loss": 0.0, + "num_input_tokens_seen": 75475048, + "step": 130100 + }, + { + "epoch": 19.378165028299076, + "grad_norm": 3.591560925997328e-06, + "learning_rate": 1.471622825615193e-07, + "loss": 0.0, + "num_input_tokens_seen": 75478216, + "step": 130105 + }, + { + "epoch": 19.378909740840037, + "grad_norm": 1.584968231327366e-05, + "learning_rate": 1.4681043853338184e-07, + "loss": 0.0, + "num_input_tokens_seen": 75481064, + "step": 130110 + }, + { + "epoch": 19.379654453380994, + "grad_norm": 1.3127200872986577e-05, + "learning_rate": 1.4645901437461972e-07, + "loss": 0.0, + "num_input_tokens_seen": 75483752, + "step": 130115 + }, + { + "epoch": 19.380399165921954, + "grad_norm": 1.711688241812226e-06, + "learning_rate": 1.4610801009117548e-07, + "loss": 0.0, + "num_input_tokens_seen": 75486472, + "step": 130120 + }, + { + "epoch": 19.381143878462915, + "grad_norm": 7.927357728476636e-06, + "learning_rate": 1.4575742568897488e-07, + "loss": 0.0, + "num_input_tokens_seen": 75489320, + "step": 130125 + }, + { + "epoch": 19.38188859100387, + "grad_norm": 1.2687090929830447e-05, + "learning_rate": 1.45407261173941e-07, + "loss": 0.0, + "num_input_tokens_seen": 75492392, + "step": 130130 + }, + { + "epoch": 19.382633303544832, + "grad_norm": 2.8298672987148166e-05, + "learning_rate": 1.4505751655199405e-07, + "loss": 0.0, + "num_input_tokens_seen": 75495080, + "step": 130135 + }, + { + "epoch": 19.38337801608579, + "grad_norm": 0.00046051491517573595, + "learning_rate": 1.4470819182903493e-07, + "loss": 0.0, + "num_input_tokens_seen": 75498024, + "step": 130140 + }, + { + "epoch": 19.38412272862675, + "grad_norm": 4.510625785769662e-06, + "learning_rate": 1.443592870109728e-07, + "loss": 0.0, + "num_input_tokens_seen": 75500872, + "step": 130145 + }, + { + "epoch": 19.38486744116771, + "grad_norm": 6.452011803048663e-06, + "learning_rate": 1.4401080210369454e-07, + "loss": 0.0, + "num_input_tokens_seen": 75503656, + "step": 130150 + }, + { + "epoch": 19.385612153708667, + "grad_norm": 5.33659549546428e-05, + "learning_rate": 1.4366273711309275e-07, + "loss": 0.0, + "num_input_tokens_seen": 75506600, + "step": 130155 + }, + { + "epoch": 19.386356866249628, + "grad_norm": 2.3957258235896006e-05, + "learning_rate": 1.43315092045046e-07, + "loss": 0.0, + "num_input_tokens_seen": 75509704, + "step": 130160 + }, + { + "epoch": 19.38710157879059, + "grad_norm": 1.9989238353446126e-06, + "learning_rate": 1.429678669054274e-07, + "loss": 0.0, + "num_input_tokens_seen": 75512872, + "step": 130165 + }, + { + "epoch": 19.387846291331545, + "grad_norm": 7.416649168590084e-06, + "learning_rate": 1.4262106170010447e-07, + "loss": 0.0001, + "num_input_tokens_seen": 75516328, + "step": 130170 + }, + { + "epoch": 19.388591003872506, + "grad_norm": 0.00020467114518396556, + "learning_rate": 1.4227467643493364e-07, + "loss": 0.0, + "num_input_tokens_seen": 75519304, + "step": 130175 + }, + { + "epoch": 19.389335716413463, + "grad_norm": 2.782529372780118e-05, + "learning_rate": 1.4192871111576856e-07, + "loss": 0.0, + "num_input_tokens_seen": 75522120, + "step": 130180 + }, + { + "epoch": 19.390080428954423, + "grad_norm": 1.1003728104697075e-05, + "learning_rate": 1.4158316574845175e-07, + "loss": 0.0, + "num_input_tokens_seen": 75524968, + "step": 130185 + }, + { + "epoch": 19.390825141495384, + "grad_norm": 3.292320343462052e-06, + "learning_rate": 1.4123804033882305e-07, + "loss": 0.0, + "num_input_tokens_seen": 75527976, + "step": 130190 + }, + { + "epoch": 19.39156985403634, + "grad_norm": 6.58910676065716e-06, + "learning_rate": 1.4089333489271384e-07, + "loss": 0.0, + "num_input_tokens_seen": 75530984, + "step": 130195 + }, + { + "epoch": 19.3923145665773, + "grad_norm": 2.33886316891585e-06, + "learning_rate": 1.405490494159445e-07, + "loss": 0.0, + "num_input_tokens_seen": 75534056, + "step": 130200 + }, + { + "epoch": 19.393059279118262, + "grad_norm": 0.00537893595173955, + "learning_rate": 1.4020518391433258e-07, + "loss": 0.0, + "num_input_tokens_seen": 75536936, + "step": 130205 + }, + { + "epoch": 19.39380399165922, + "grad_norm": 4.8955985221255105e-06, + "learning_rate": 1.398617383936901e-07, + "loss": 0.0, + "num_input_tokens_seen": 75540072, + "step": 130210 + }, + { + "epoch": 19.39454870420018, + "grad_norm": 2.7023831989936298e-06, + "learning_rate": 1.395187128598152e-07, + "loss": 0.0, + "num_input_tokens_seen": 75542888, + "step": 130215 + }, + { + "epoch": 19.395293416741136, + "grad_norm": 2.342413608857896e-06, + "learning_rate": 1.3917610731850328e-07, + "loss": 0.0, + "num_input_tokens_seen": 75545768, + "step": 130220 + }, + { + "epoch": 19.396038129282097, + "grad_norm": 1.7910057067638263e-05, + "learning_rate": 1.3883392177554688e-07, + "loss": 0.0, + "num_input_tokens_seen": 75548424, + "step": 130225 + }, + { + "epoch": 19.396782841823057, + "grad_norm": 1.1766976967919618e-05, + "learning_rate": 1.3849215623672197e-07, + "loss": 0.0, + "num_input_tokens_seen": 75551656, + "step": 130230 + }, + { + "epoch": 19.397527554364014, + "grad_norm": 4.178939889243338e-06, + "learning_rate": 1.3815081070780167e-07, + "loss": 0.0, + "num_input_tokens_seen": 75554536, + "step": 130235 + }, + { + "epoch": 19.398272266904975, + "grad_norm": 2.5665617613412905e-06, + "learning_rate": 1.378098851945564e-07, + "loss": 0.0, + "num_input_tokens_seen": 75557288, + "step": 130240 + }, + { + "epoch": 19.399016979445936, + "grad_norm": 1.0859245776373427e-05, + "learning_rate": 1.3746937970274543e-07, + "loss": 0.0021, + "num_input_tokens_seen": 75560424, + "step": 130245 + }, + { + "epoch": 19.399761691986892, + "grad_norm": 3.4348665849393e-06, + "learning_rate": 1.3712929423812247e-07, + "loss": 0.0, + "num_input_tokens_seen": 75563304, + "step": 130250 + }, + { + "epoch": 19.400506404527853, + "grad_norm": 7.5258190008753445e-06, + "learning_rate": 1.3678962880642465e-07, + "loss": 0.0, + "num_input_tokens_seen": 75566056, + "step": 130255 + }, + { + "epoch": 19.40125111706881, + "grad_norm": 3.452757482591551e-06, + "learning_rate": 1.3645038341340011e-07, + "loss": 0.0, + "num_input_tokens_seen": 75568968, + "step": 130260 + }, + { + "epoch": 19.40199582960977, + "grad_norm": 1.6569301806157455e-05, + "learning_rate": 1.361115580647748e-07, + "loss": 0.0, + "num_input_tokens_seen": 75571976, + "step": 130265 + }, + { + "epoch": 19.40274054215073, + "grad_norm": 0.0002355000760871917, + "learning_rate": 1.357731527662748e-07, + "loss": 0.0, + "num_input_tokens_seen": 75574792, + "step": 130270 + }, + { + "epoch": 19.403485254691688, + "grad_norm": 3.783533657042426e-06, + "learning_rate": 1.3543516752361763e-07, + "loss": 0.0, + "num_input_tokens_seen": 75578024, + "step": 130275 + }, + { + "epoch": 19.40422996723265, + "grad_norm": 2.0592181044776225e-06, + "learning_rate": 1.3509760234251267e-07, + "loss": 0.0, + "num_input_tokens_seen": 75580936, + "step": 130280 + }, + { + "epoch": 19.40497467977361, + "grad_norm": 2.7516594855114818e-05, + "learning_rate": 1.3476045722865815e-07, + "loss": 0.0, + "num_input_tokens_seen": 75583752, + "step": 130285 + }, + { + "epoch": 19.405719392314566, + "grad_norm": 1.2546915058919694e-05, + "learning_rate": 1.3442373218775784e-07, + "loss": 0.0, + "num_input_tokens_seen": 75586696, + "step": 130290 + }, + { + "epoch": 19.406464104855527, + "grad_norm": 1.1247157999605406e-05, + "learning_rate": 1.340874272254933e-07, + "loss": 0.0, + "num_input_tokens_seen": 75589736, + "step": 130295 + }, + { + "epoch": 19.407208817396484, + "grad_norm": 0.00020923760894220322, + "learning_rate": 1.3375154234755162e-07, + "loss": 0.0, + "num_input_tokens_seen": 75592680, + "step": 130300 + }, + { + "epoch": 19.407953529937444, + "grad_norm": 8.409910151385702e-06, + "learning_rate": 1.3341607755960327e-07, + "loss": 0.0, + "num_input_tokens_seen": 75595464, + "step": 130305 + }, + { + "epoch": 19.408698242478405, + "grad_norm": 2.426144192213542e-06, + "learning_rate": 1.3308103286731598e-07, + "loss": 0.0, + "num_input_tokens_seen": 75598344, + "step": 130310 + }, + { + "epoch": 19.40944295501936, + "grad_norm": 2.44995362663758e-06, + "learning_rate": 1.3274640827635187e-07, + "loss": 0.0, + "num_input_tokens_seen": 75601288, + "step": 130315 + }, + { + "epoch": 19.410187667560322, + "grad_norm": 4.651363269658759e-05, + "learning_rate": 1.3241220379236473e-07, + "loss": 0.0, + "num_input_tokens_seen": 75603912, + "step": 130320 + }, + { + "epoch": 19.41093238010128, + "grad_norm": 1.2198170225019567e-05, + "learning_rate": 1.320784194209973e-07, + "loss": 0.0, + "num_input_tokens_seen": 75606824, + "step": 130325 + }, + { + "epoch": 19.41167709264224, + "grad_norm": 8.515064109815285e-06, + "learning_rate": 1.3174505516789226e-07, + "loss": 0.0, + "num_input_tokens_seen": 75609512, + "step": 130330 + }, + { + "epoch": 19.4124218051832, + "grad_norm": 5.007164872949943e-06, + "learning_rate": 1.3141211103867845e-07, + "loss": 0.0, + "num_input_tokens_seen": 75612520, + "step": 130335 + }, + { + "epoch": 19.413166517724157, + "grad_norm": 8.08145159680862e-06, + "learning_rate": 1.3107958703898193e-07, + "loss": 0.0, + "num_input_tokens_seen": 75615688, + "step": 130340 + }, + { + "epoch": 19.413911230265118, + "grad_norm": 3.3462163173680892e-06, + "learning_rate": 1.3074748317442042e-07, + "loss": 0.0, + "num_input_tokens_seen": 75618504, + "step": 130345 + }, + { + "epoch": 19.41465594280608, + "grad_norm": 1.0751963600341696e-05, + "learning_rate": 1.3041579945060335e-07, + "loss": 0.0, + "num_input_tokens_seen": 75621384, + "step": 130350 + }, + { + "epoch": 19.415400655347035, + "grad_norm": 2.5892056783050066e-06, + "learning_rate": 1.3008453587313453e-07, + "loss": 0.0, + "num_input_tokens_seen": 75624328, + "step": 130355 + }, + { + "epoch": 19.416145367887996, + "grad_norm": 7.093217845977051e-06, + "learning_rate": 1.2975369244761226e-07, + "loss": 0.0, + "num_input_tokens_seen": 75627496, + "step": 130360 + }, + { + "epoch": 19.416890080428953, + "grad_norm": 5.200475698075024e-06, + "learning_rate": 1.2942326917962377e-07, + "loss": 0.0, + "num_input_tokens_seen": 75630440, + "step": 130365 + }, + { + "epoch": 19.417634792969913, + "grad_norm": 5.690760644938564e-06, + "learning_rate": 1.290932660747507e-07, + "loss": 0.0, + "num_input_tokens_seen": 75633192, + "step": 130370 + }, + { + "epoch": 19.418379505510874, + "grad_norm": 0.00011752753198379651, + "learning_rate": 1.287636831385719e-07, + "loss": 0.0, + "num_input_tokens_seen": 75636072, + "step": 130375 + }, + { + "epoch": 19.41912421805183, + "grad_norm": 0.0024022518191486597, + "learning_rate": 1.2843452037664962e-07, + "loss": 0.0, + "num_input_tokens_seen": 75638760, + "step": 130380 + }, + { + "epoch": 19.41986893059279, + "grad_norm": 5.756640348408837e-06, + "learning_rate": 1.281057777945488e-07, + "loss": 0.0, + "num_input_tokens_seen": 75641832, + "step": 130385 + }, + { + "epoch": 19.420613643133752, + "grad_norm": 2.4545513497287175e-06, + "learning_rate": 1.2777745539782337e-07, + "loss": 0.0, + "num_input_tokens_seen": 75644776, + "step": 130390 + }, + { + "epoch": 19.42135835567471, + "grad_norm": 2.6913578494713875e-06, + "learning_rate": 1.274495531920189e-07, + "loss": 0.0, + "num_input_tokens_seen": 75647976, + "step": 130395 + }, + { + "epoch": 19.42210306821567, + "grad_norm": 3.684039256768301e-06, + "learning_rate": 1.2712207118267262e-07, + "loss": 0.0, + "num_input_tokens_seen": 75650856, + "step": 130400 + }, + { + "epoch": 19.422847780756626, + "grad_norm": 7.1543295234732796e-06, + "learning_rate": 1.2679500937532173e-07, + "loss": 0.0, + "num_input_tokens_seen": 75653736, + "step": 130405 + }, + { + "epoch": 19.423592493297587, + "grad_norm": 6.092246621847153e-06, + "learning_rate": 1.2646836777548688e-07, + "loss": 0.0, + "num_input_tokens_seen": 75656840, + "step": 130410 + }, + { + "epoch": 19.424337205838548, + "grad_norm": 1.7159718481707387e-05, + "learning_rate": 1.2614214638869137e-07, + "loss": 0.0, + "num_input_tokens_seen": 75659848, + "step": 130415 + }, + { + "epoch": 19.425081918379504, + "grad_norm": 1.1223810361116193e-05, + "learning_rate": 1.2581634522044194e-07, + "loss": 0.0, + "num_input_tokens_seen": 75662792, + "step": 130420 + }, + { + "epoch": 19.425826630920465, + "grad_norm": 2.3292166133614955e-06, + "learning_rate": 1.254909642762453e-07, + "loss": 0.0, + "num_input_tokens_seen": 75665704, + "step": 130425 + }, + { + "epoch": 19.426571343461426, + "grad_norm": 9.93810135696549e-06, + "learning_rate": 1.2516600356159701e-07, + "loss": 0.0, + "num_input_tokens_seen": 75668456, + "step": 130430 + }, + { + "epoch": 19.427316056002383, + "grad_norm": 3.561372977856081e-06, + "learning_rate": 1.248414630819872e-07, + "loss": 0.0, + "num_input_tokens_seen": 75671144, + "step": 130435 + }, + { + "epoch": 19.428060768543343, + "grad_norm": 4.974998319084989e-06, + "learning_rate": 1.2451734284289752e-07, + "loss": 0.0, + "num_input_tokens_seen": 75673768, + "step": 130440 + }, + { + "epoch": 19.4288054810843, + "grad_norm": 2.5362066935485927e-06, + "learning_rate": 1.2419364284980696e-07, + "loss": 0.0, + "num_input_tokens_seen": 75676552, + "step": 130445 + }, + { + "epoch": 19.42955019362526, + "grad_norm": 1.337682806479279e-05, + "learning_rate": 1.2387036310818334e-07, + "loss": 0.0, + "num_input_tokens_seen": 75679432, + "step": 130450 + }, + { + "epoch": 19.43029490616622, + "grad_norm": 0.0014819984789937735, + "learning_rate": 1.2354750362348344e-07, + "loss": 0.0, + "num_input_tokens_seen": 75682152, + "step": 130455 + }, + { + "epoch": 19.431039618707178, + "grad_norm": 8.4706574853044e-05, + "learning_rate": 1.2322506440116676e-07, + "loss": 0.0, + "num_input_tokens_seen": 75685096, + "step": 130460 + }, + { + "epoch": 19.43178433124814, + "grad_norm": 4.527927558228839e-06, + "learning_rate": 1.2290304544668174e-07, + "loss": 0.0, + "num_input_tokens_seen": 75687880, + "step": 130465 + }, + { + "epoch": 19.432529043789096, + "grad_norm": 1.7548983350934577e-06, + "learning_rate": 1.2258144676546291e-07, + "loss": 0.0, + "num_input_tokens_seen": 75690888, + "step": 130470 + }, + { + "epoch": 19.433273756330056, + "grad_norm": 6.237288744159741e-06, + "learning_rate": 1.2226026836294756e-07, + "loss": 0.0, + "num_input_tokens_seen": 75693864, + "step": 130475 + }, + { + "epoch": 19.434018468871017, + "grad_norm": 7.714866114838514e-06, + "learning_rate": 1.2193951024455918e-07, + "loss": 0.0, + "num_input_tokens_seen": 75696552, + "step": 130480 + }, + { + "epoch": 19.434763181411974, + "grad_norm": 6.563539045600919e-06, + "learning_rate": 1.216191724157184e-07, + "loss": 0.0, + "num_input_tokens_seen": 75699592, + "step": 130485 + }, + { + "epoch": 19.435507893952934, + "grad_norm": 2.9218426789157093e-06, + "learning_rate": 1.212992548818348e-07, + "loss": 0.0, + "num_input_tokens_seen": 75702408, + "step": 130490 + }, + { + "epoch": 19.436252606493895, + "grad_norm": 1.097096628654981e-05, + "learning_rate": 1.2097975764831516e-07, + "loss": 0.0, + "num_input_tokens_seen": 75705448, + "step": 130495 + }, + { + "epoch": 19.43699731903485, + "grad_norm": 0.00014611153164878488, + "learning_rate": 1.206606807205579e-07, + "loss": 0.0, + "num_input_tokens_seen": 75708168, + "step": 130500 + }, + { + "epoch": 19.437742031575812, + "grad_norm": 3.1117890557652572e-06, + "learning_rate": 1.2034202410395324e-07, + "loss": 0.0, + "num_input_tokens_seen": 75710984, + "step": 130505 + }, + { + "epoch": 19.43848674411677, + "grad_norm": 5.263262573862448e-05, + "learning_rate": 1.200237878038829e-07, + "loss": 0.0, + "num_input_tokens_seen": 75714024, + "step": 130510 + }, + { + "epoch": 19.43923145665773, + "grad_norm": 2.13998714571062e-06, + "learning_rate": 1.197059718257204e-07, + "loss": 0.0, + "num_input_tokens_seen": 75717192, + "step": 130515 + }, + { + "epoch": 19.43997616919869, + "grad_norm": 4.21135155193042e-05, + "learning_rate": 1.19388576174842e-07, + "loss": 0.0, + "num_input_tokens_seen": 75720200, + "step": 130520 + }, + { + "epoch": 19.440720881739647, + "grad_norm": 6.8940921664761845e-06, + "learning_rate": 1.1907160085660451e-07, + "loss": 0.0, + "num_input_tokens_seen": 75723176, + "step": 130525 + }, + { + "epoch": 19.441465594280608, + "grad_norm": 6.44967121843365e-06, + "learning_rate": 1.1875504587636477e-07, + "loss": 0.0, + "num_input_tokens_seen": 75726056, + "step": 130530 + }, + { + "epoch": 19.44221030682157, + "grad_norm": 2.911064029831323e-06, + "learning_rate": 1.1843891123947126e-07, + "loss": 0.0, + "num_input_tokens_seen": 75728872, + "step": 130535 + }, + { + "epoch": 19.442955019362525, + "grad_norm": 2.33359205594752e-05, + "learning_rate": 1.1812319695126416e-07, + "loss": 0.0, + "num_input_tokens_seen": 75731944, + "step": 130540 + }, + { + "epoch": 19.443699731903486, + "grad_norm": 1.7714146451908164e-05, + "learning_rate": 1.1780790301707533e-07, + "loss": 0.0, + "num_input_tokens_seen": 75734824, + "step": 130545 + }, + { + "epoch": 19.444444444444443, + "grad_norm": 4.1911230255209375e-06, + "learning_rate": 1.1749302944223384e-07, + "loss": 0.0005, + "num_input_tokens_seen": 75737960, + "step": 130550 + }, + { + "epoch": 19.445189156985403, + "grad_norm": 8.552621693524998e-06, + "learning_rate": 1.1717857623205764e-07, + "loss": 0.0, + "num_input_tokens_seen": 75740648, + "step": 130555 + }, + { + "epoch": 19.445933869526364, + "grad_norm": 3.651490033007576e-06, + "learning_rate": 1.1686454339185915e-07, + "loss": 0.0, + "num_input_tokens_seen": 75743720, + "step": 130560 + }, + { + "epoch": 19.44667858206732, + "grad_norm": 1.8275813999935053e-05, + "learning_rate": 1.1655093092694525e-07, + "loss": 0.0, + "num_input_tokens_seen": 75746760, + "step": 130565 + }, + { + "epoch": 19.44742329460828, + "grad_norm": 4.307300696382299e-05, + "learning_rate": 1.1623773884261169e-07, + "loss": 0.0, + "num_input_tokens_seen": 75749768, + "step": 130570 + }, + { + "epoch": 19.448168007149242, + "grad_norm": 1.4856355846859515e-05, + "learning_rate": 1.1592496714415147e-07, + "loss": 0.0, + "num_input_tokens_seen": 75752552, + "step": 130575 + }, + { + "epoch": 19.4489127196902, + "grad_norm": 1.3440625480143353e-05, + "learning_rate": 1.1561261583684924e-07, + "loss": 0.0, + "num_input_tokens_seen": 75755240, + "step": 130580 + }, + { + "epoch": 19.44965743223116, + "grad_norm": 5.818193676532246e-05, + "learning_rate": 1.1530068492597856e-07, + "loss": 0.0, + "num_input_tokens_seen": 75758344, + "step": 130585 + }, + { + "epoch": 19.450402144772116, + "grad_norm": 2.0827956177527085e-06, + "learning_rate": 1.1498917441681023e-07, + "loss": 0.0, + "num_input_tokens_seen": 75761736, + "step": 130590 + }, + { + "epoch": 19.451146857313077, + "grad_norm": 1.6617634628346423e-06, + "learning_rate": 1.1467808431460947e-07, + "loss": 0.0, + "num_input_tokens_seen": 75764328, + "step": 130595 + }, + { + "epoch": 19.451891569854038, + "grad_norm": 4.017963419755688e-06, + "learning_rate": 1.143674146246304e-07, + "loss": 0.0, + "num_input_tokens_seen": 75767208, + "step": 130600 + }, + { + "epoch": 19.452636282394995, + "grad_norm": 1.2406937457853928e-05, + "learning_rate": 1.1405716535212163e-07, + "loss": 0.0, + "num_input_tokens_seen": 75770152, + "step": 130605 + }, + { + "epoch": 19.453380994935955, + "grad_norm": 1.0724490493885241e-05, + "learning_rate": 1.1374733650232338e-07, + "loss": 0.0, + "num_input_tokens_seen": 75772904, + "step": 130610 + }, + { + "epoch": 19.454125707476916, + "grad_norm": 1.4407526577997487e-05, + "learning_rate": 1.1343792808047038e-07, + "loss": 0.0, + "num_input_tokens_seen": 75775880, + "step": 130615 + }, + { + "epoch": 19.454870420017873, + "grad_norm": 3.2894761261559324e-06, + "learning_rate": 1.1312894009179176e-07, + "loss": 0.0, + "num_input_tokens_seen": 75778856, + "step": 130620 + }, + { + "epoch": 19.455615132558833, + "grad_norm": 0.0034859550651162863, + "learning_rate": 1.1282037254150279e-07, + "loss": 0.0, + "num_input_tokens_seen": 75781832, + "step": 130625 + }, + { + "epoch": 19.45635984509979, + "grad_norm": 6.638572085648775e-05, + "learning_rate": 1.1251222543482154e-07, + "loss": 0.0, + "num_input_tokens_seen": 75784936, + "step": 130630 + }, + { + "epoch": 19.45710455764075, + "grad_norm": 2.5128076686087297e-06, + "learning_rate": 1.1220449877694938e-07, + "loss": 0.0, + "num_input_tokens_seen": 75788200, + "step": 130635 + }, + { + "epoch": 19.45784927018171, + "grad_norm": 4.071570401720237e-06, + "learning_rate": 1.1189719257309051e-07, + "loss": 0.0, + "num_input_tokens_seen": 75791176, + "step": 130640 + }, + { + "epoch": 19.458593982722668, + "grad_norm": 5.482075266627362e-06, + "learning_rate": 1.1159030682843242e-07, + "loss": 0.0, + "num_input_tokens_seen": 75793832, + "step": 130645 + }, + { + "epoch": 19.45933869526363, + "grad_norm": 5.087029785499908e-05, + "learning_rate": 1.1128384154815984e-07, + "loss": 0.0, + "num_input_tokens_seen": 75797064, + "step": 130650 + }, + { + "epoch": 19.46008340780459, + "grad_norm": 4.779179562319769e-06, + "learning_rate": 1.1097779673745201e-07, + "loss": 0.0, + "num_input_tokens_seen": 75799592, + "step": 130655 + }, + { + "epoch": 19.460828120345546, + "grad_norm": 1.0903412658080924e-05, + "learning_rate": 1.1067217240147698e-07, + "loss": 0.0, + "num_input_tokens_seen": 75802600, + "step": 130660 + }, + { + "epoch": 19.461572832886507, + "grad_norm": 3.812377599388128e-06, + "learning_rate": 1.1036696854540007e-07, + "loss": 0.0, + "num_input_tokens_seen": 75805512, + "step": 130665 + }, + { + "epoch": 19.462317545427464, + "grad_norm": 5.290781700750813e-05, + "learning_rate": 1.100621851743755e-07, + "loss": 0.0, + "num_input_tokens_seen": 75808264, + "step": 130670 + }, + { + "epoch": 19.463062257968424, + "grad_norm": 1.0464082151884213e-05, + "learning_rate": 1.0975782229355469e-07, + "loss": 0.0, + "num_input_tokens_seen": 75811432, + "step": 130675 + }, + { + "epoch": 19.463806970509385, + "grad_norm": 8.716023330634926e-06, + "learning_rate": 1.0945387990807798e-07, + "loss": 0.0, + "num_input_tokens_seen": 75814216, + "step": 130680 + }, + { + "epoch": 19.464551683050342, + "grad_norm": 1.5727671325294068e-06, + "learning_rate": 1.0915035802308016e-07, + "loss": 0.0, + "num_input_tokens_seen": 75817672, + "step": 130685 + }, + { + "epoch": 19.465296395591302, + "grad_norm": 5.7000506785698235e-06, + "learning_rate": 1.0884725664368766e-07, + "loss": 0.0, + "num_input_tokens_seen": 75820840, + "step": 130690 + }, + { + "epoch": 19.46604110813226, + "grad_norm": 5.9047424656455405e-06, + "learning_rate": 1.0854457577502419e-07, + "loss": 0.0, + "num_input_tokens_seen": 75823720, + "step": 130695 + }, + { + "epoch": 19.46678582067322, + "grad_norm": 8.646532478451263e-06, + "learning_rate": 1.0824231542220232e-07, + "loss": 0.0, + "num_input_tokens_seen": 75826664, + "step": 130700 + }, + { + "epoch": 19.46753053321418, + "grad_norm": 3.507733254082268e-06, + "learning_rate": 1.0794047559032627e-07, + "loss": 0.0, + "num_input_tokens_seen": 75829416, + "step": 130705 + }, + { + "epoch": 19.468275245755137, + "grad_norm": 4.685166004492203e-06, + "learning_rate": 1.0763905628449478e-07, + "loss": 0.0, + "num_input_tokens_seen": 75832520, + "step": 130710 + }, + { + "epoch": 19.469019958296098, + "grad_norm": 0.0004428737156558782, + "learning_rate": 1.0733805750980653e-07, + "loss": 0.0, + "num_input_tokens_seen": 75835336, + "step": 130715 + }, + { + "epoch": 19.46976467083706, + "grad_norm": 0.00017683039186522365, + "learning_rate": 1.07037479271338e-07, + "loss": 0.0, + "num_input_tokens_seen": 75838152, + "step": 130720 + }, + { + "epoch": 19.470509383378015, + "grad_norm": 1.2650676580960862e-05, + "learning_rate": 1.0673732157417404e-07, + "loss": 0.0, + "num_input_tokens_seen": 75841096, + "step": 130725 + }, + { + "epoch": 19.471254095918976, + "grad_norm": 2.100439132846077e-06, + "learning_rate": 1.0643758442338004e-07, + "loss": 0.0, + "num_input_tokens_seen": 75843976, + "step": 130730 + }, + { + "epoch": 19.471998808459933, + "grad_norm": 2.5194251520588296e-06, + "learning_rate": 1.0613826782402414e-07, + "loss": 0.0, + "num_input_tokens_seen": 75846952, + "step": 130735 + }, + { + "epoch": 19.472743521000893, + "grad_norm": 5.65015534448321e-06, + "learning_rate": 1.0583937178116066e-07, + "loss": 0.0, + "num_input_tokens_seen": 75850152, + "step": 130740 + }, + { + "epoch": 19.473488233541854, + "grad_norm": 8.588791388319805e-05, + "learning_rate": 1.055408962998411e-07, + "loss": 0.0, + "num_input_tokens_seen": 75852872, + "step": 130745 + }, + { + "epoch": 19.47423294608281, + "grad_norm": 2.6170798719249433e-06, + "learning_rate": 1.0524284138510588e-07, + "loss": 0.0, + "num_input_tokens_seen": 75855912, + "step": 130750 + }, + { + "epoch": 19.47497765862377, + "grad_norm": 0.0006205245153978467, + "learning_rate": 1.0494520704198985e-07, + "loss": 0.0, + "num_input_tokens_seen": 75858568, + "step": 130755 + }, + { + "epoch": 19.475722371164732, + "grad_norm": 0.008636614307761192, + "learning_rate": 1.0464799327552232e-07, + "loss": 0.0, + "num_input_tokens_seen": 75861576, + "step": 130760 + }, + { + "epoch": 19.47646708370569, + "grad_norm": 2.1482558167917887e-06, + "learning_rate": 1.043512000907243e-07, + "loss": 0.0, + "num_input_tokens_seen": 75864744, + "step": 130765 + }, + { + "epoch": 19.47721179624665, + "grad_norm": 2.7082674932898954e-05, + "learning_rate": 1.040548274926112e-07, + "loss": 0.0, + "num_input_tokens_seen": 75867784, + "step": 130770 + }, + { + "epoch": 19.477956508787607, + "grad_norm": 5.804190004710108e-05, + "learning_rate": 1.0375887548618735e-07, + "loss": 0.0, + "num_input_tokens_seen": 75870888, + "step": 130775 + }, + { + "epoch": 19.478701221328567, + "grad_norm": 0.0004430029366631061, + "learning_rate": 1.034633440764543e-07, + "loss": 0.0, + "num_input_tokens_seen": 75873928, + "step": 130780 + }, + { + "epoch": 19.479445933869528, + "grad_norm": 1.5467061302842922e-06, + "learning_rate": 1.031682332684053e-07, + "loss": 0.0, + "num_input_tokens_seen": 75877096, + "step": 130785 + }, + { + "epoch": 19.480190646410485, + "grad_norm": 3.5293544442538405e-06, + "learning_rate": 1.0287354306702524e-07, + "loss": 0.0, + "num_input_tokens_seen": 75880104, + "step": 130790 + }, + { + "epoch": 19.480935358951445, + "grad_norm": 5.658137069985969e-06, + "learning_rate": 1.0257927347729068e-07, + "loss": 0.0, + "num_input_tokens_seen": 75882760, + "step": 130795 + }, + { + "epoch": 19.481680071492406, + "grad_norm": 4.694822564488277e-05, + "learning_rate": 1.0228542450417545e-07, + "loss": 0.0, + "num_input_tokens_seen": 75885608, + "step": 130800 + }, + { + "epoch": 19.482424784033363, + "grad_norm": 0.0005717771127820015, + "learning_rate": 1.0199199615264499e-07, + "loss": 0.0, + "num_input_tokens_seen": 75888424, + "step": 130805 + }, + { + "epoch": 19.483169496574323, + "grad_norm": 2.4040962216531625e-06, + "learning_rate": 1.0169898842765091e-07, + "loss": 0.0, + "num_input_tokens_seen": 75891272, + "step": 130810 + }, + { + "epoch": 19.48391420911528, + "grad_norm": 6.4388154896732885e-06, + "learning_rate": 1.0140640133415036e-07, + "loss": 0.0, + "num_input_tokens_seen": 75894120, + "step": 130815 + }, + { + "epoch": 19.48465892165624, + "grad_norm": 0.00014425758854486048, + "learning_rate": 1.0111423487708105e-07, + "loss": 0.0, + "num_input_tokens_seen": 75897256, + "step": 130820 + }, + { + "epoch": 19.4854036341972, + "grad_norm": 2.9695882403757423e-05, + "learning_rate": 1.008224890613807e-07, + "loss": 0.0, + "num_input_tokens_seen": 75899912, + "step": 130825 + }, + { + "epoch": 19.486148346738158, + "grad_norm": 1.5008120499260258e-05, + "learning_rate": 1.0053116389197592e-07, + "loss": 0.0, + "num_input_tokens_seen": 75902792, + "step": 130830 + }, + { + "epoch": 19.48689305927912, + "grad_norm": 6.032705186953535e-06, + "learning_rate": 1.0024025937379333e-07, + "loss": 0.0, + "num_input_tokens_seen": 75905704, + "step": 130835 + }, + { + "epoch": 19.487637771820076, + "grad_norm": 7.114312211342622e-06, + "learning_rate": 9.994977551174289e-08, + "loss": 0.0, + "num_input_tokens_seen": 75908744, + "step": 130840 + }, + { + "epoch": 19.488382484361036, + "grad_norm": 4.344032731751213e-06, + "learning_rate": 9.965971231073456e-08, + "loss": 0.0, + "num_input_tokens_seen": 75911784, + "step": 130845 + }, + { + "epoch": 19.489127196901997, + "grad_norm": 7.436323812726187e-06, + "learning_rate": 9.937006977566998e-08, + "loss": 0.0, + "num_input_tokens_seen": 75914760, + "step": 130850 + }, + { + "epoch": 19.489871909442954, + "grad_norm": 2.587265726106125e-06, + "learning_rate": 9.90808479114369e-08, + "loss": 0.0, + "num_input_tokens_seen": 75917544, + "step": 130855 + }, + { + "epoch": 19.490616621983914, + "grad_norm": 2.3166217033576686e-06, + "learning_rate": 9.879204672292586e-08, + "loss": 0.0, + "num_input_tokens_seen": 75920616, + "step": 130860 + }, + { + "epoch": 19.491361334524875, + "grad_norm": 7.112638741091359e-06, + "learning_rate": 9.850366621501628e-08, + "loss": 0.0, + "num_input_tokens_seen": 75923560, + "step": 130865 + }, + { + "epoch": 19.492106047065832, + "grad_norm": 9.402147406945005e-05, + "learning_rate": 9.82157063925765e-08, + "loss": 0.0, + "num_input_tokens_seen": 75926376, + "step": 130870 + }, + { + "epoch": 19.492850759606792, + "grad_norm": 2.1968173768982524e-06, + "learning_rate": 9.792816726047482e-08, + "loss": 0.0, + "num_input_tokens_seen": 75929064, + "step": 130875 + }, + { + "epoch": 19.49359547214775, + "grad_norm": 2.436581553411088e-06, + "learning_rate": 9.764104882356572e-08, + "loss": 0.0, + "num_input_tokens_seen": 75932040, + "step": 130880 + }, + { + "epoch": 19.49434018468871, + "grad_norm": 2.115009010594804e-05, + "learning_rate": 9.735435108670088e-08, + "loss": 0.0, + "num_input_tokens_seen": 75934888, + "step": 130885 + }, + { + "epoch": 19.49508489722967, + "grad_norm": 4.721672667074017e-06, + "learning_rate": 9.70680740547264e-08, + "loss": 0.0, + "num_input_tokens_seen": 75937640, + "step": 130890 + }, + { + "epoch": 19.495829609770627, + "grad_norm": 0.0001275793183594942, + "learning_rate": 9.67822177324773e-08, + "loss": 0.0, + "num_input_tokens_seen": 75940584, + "step": 130895 + }, + { + "epoch": 19.496574322311588, + "grad_norm": 4.788014848600142e-06, + "learning_rate": 9.64967821247803e-08, + "loss": 0.0, + "num_input_tokens_seen": 75944008, + "step": 130900 + }, + { + "epoch": 19.49731903485255, + "grad_norm": 4.716830881079659e-05, + "learning_rate": 9.621176723645931e-08, + "loss": 0.0, + "num_input_tokens_seen": 75946920, + "step": 130905 + }, + { + "epoch": 19.498063747393505, + "grad_norm": 3.610365183703834e-06, + "learning_rate": 9.59271730723299e-08, + "loss": 0.0, + "num_input_tokens_seen": 75949704, + "step": 130910 + }, + { + "epoch": 19.498808459934466, + "grad_norm": 1.8229906345368363e-05, + "learning_rate": 9.564299963719936e-08, + "loss": 0.0, + "num_input_tokens_seen": 75952392, + "step": 130915 + }, + { + "epoch": 19.499553172475423, + "grad_norm": 1.0801501957757864e-05, + "learning_rate": 9.53592469358694e-08, + "loss": 0.0, + "num_input_tokens_seen": 75955240, + "step": 130920 + }, + { + "epoch": 19.500297885016384, + "grad_norm": 5.131288162374403e-06, + "learning_rate": 9.507591497313063e-08, + "loss": 0.0, + "num_input_tokens_seen": 75958536, + "step": 130925 + }, + { + "epoch": 19.501042597557344, + "grad_norm": 1.6580271449129214e-06, + "learning_rate": 9.479300375377365e-08, + "loss": 0.0, + "num_input_tokens_seen": 75961544, + "step": 130930 + }, + { + "epoch": 19.5017873100983, + "grad_norm": 1.5404813893837854e-05, + "learning_rate": 9.451051328257799e-08, + "loss": 0.0, + "num_input_tokens_seen": 75964392, + "step": 130935 + }, + { + "epoch": 19.50253202263926, + "grad_norm": 2.106867896145559e-06, + "learning_rate": 9.422844356431481e-08, + "loss": 0.0, + "num_input_tokens_seen": 75967400, + "step": 130940 + }, + { + "epoch": 19.503276735180222, + "grad_norm": 3.327657850604737e-06, + "learning_rate": 9.3946794603747e-08, + "loss": 0.0, + "num_input_tokens_seen": 75970312, + "step": 130945 + }, + { + "epoch": 19.50402144772118, + "grad_norm": 8.869261364452541e-05, + "learning_rate": 9.366556640563462e-08, + "loss": 0.0, + "num_input_tokens_seen": 75973320, + "step": 130950 + }, + { + "epoch": 19.50476616026214, + "grad_norm": 0.0007601675461046398, + "learning_rate": 9.338475897472942e-08, + "loss": 0.0, + "num_input_tokens_seen": 75976296, + "step": 130955 + }, + { + "epoch": 19.505510872803097, + "grad_norm": 7.2375364652543794e-06, + "learning_rate": 9.310437231577207e-08, + "loss": 0.0, + "num_input_tokens_seen": 75979272, + "step": 130960 + }, + { + "epoch": 19.506255585344057, + "grad_norm": 1.3751129699812736e-05, + "learning_rate": 9.282440643350598e-08, + "loss": 0.0, + "num_input_tokens_seen": 75982024, + "step": 130965 + }, + { + "epoch": 19.507000297885018, + "grad_norm": 424.0516357421875, + "learning_rate": 9.254486133265517e-08, + "loss": 0.2875, + "num_input_tokens_seen": 75984648, + "step": 130970 + }, + { + "epoch": 19.507745010425975, + "grad_norm": 2.190755139963585e-06, + "learning_rate": 9.226573701794361e-08, + "loss": 0.0, + "num_input_tokens_seen": 75987688, + "step": 130975 + }, + { + "epoch": 19.508489722966935, + "grad_norm": 2.6974967113346793e-05, + "learning_rate": 9.198703349408977e-08, + "loss": 0.0, + "num_input_tokens_seen": 75990248, + "step": 130980 + }, + { + "epoch": 19.509234435507892, + "grad_norm": 2.695056718948763e-05, + "learning_rate": 9.170875076579821e-08, + "loss": 0.0, + "num_input_tokens_seen": 75992904, + "step": 130985 + }, + { + "epoch": 19.509979148048853, + "grad_norm": 3.5486630167724798e-06, + "learning_rate": 9.143088883777073e-08, + "loss": 0.0, + "num_input_tokens_seen": 75995880, + "step": 130990 + }, + { + "epoch": 19.510723860589813, + "grad_norm": 2.362778104725294e-06, + "learning_rate": 9.115344771470357e-08, + "loss": 0.0, + "num_input_tokens_seen": 75998600, + "step": 130995 + }, + { + "epoch": 19.51146857313077, + "grad_norm": 3.130110781057738e-05, + "learning_rate": 9.087642740128188e-08, + "loss": 0.0, + "num_input_tokens_seen": 76001448, + "step": 131000 + }, + { + "epoch": 19.51221328567173, + "grad_norm": 2.0601122741936706e-05, + "learning_rate": 9.059982790218801e-08, + "loss": 0.0, + "num_input_tokens_seen": 76004488, + "step": 131005 + }, + { + "epoch": 19.51295799821269, + "grad_norm": 3.7698807773267617e-06, + "learning_rate": 9.032364922209047e-08, + "loss": 0.0, + "num_input_tokens_seen": 76007304, + "step": 131010 + }, + { + "epoch": 19.51370271075365, + "grad_norm": 9.464096365263686e-06, + "learning_rate": 9.00478913656605e-08, + "loss": 0.0, + "num_input_tokens_seen": 76010344, + "step": 131015 + }, + { + "epoch": 19.51444742329461, + "grad_norm": 0.00029426749097183347, + "learning_rate": 8.977255433755272e-08, + "loss": 0.0, + "num_input_tokens_seen": 76013128, + "step": 131020 + }, + { + "epoch": 19.515192135835566, + "grad_norm": 2.5149483917630278e-05, + "learning_rate": 8.949763814242173e-08, + "loss": 0.0, + "num_input_tokens_seen": 76015784, + "step": 131025 + }, + { + "epoch": 19.515936848376526, + "grad_norm": 2.4765545276750345e-06, + "learning_rate": 8.922314278490829e-08, + "loss": 0.004, + "num_input_tokens_seen": 76019048, + "step": 131030 + }, + { + "epoch": 19.516681560917487, + "grad_norm": 4.828957571589854e-06, + "learning_rate": 8.89490682696531e-08, + "loss": 0.0, + "num_input_tokens_seen": 76021864, + "step": 131035 + }, + { + "epoch": 19.517426273458444, + "grad_norm": 3.7667821288778214e-06, + "learning_rate": 8.867541460128304e-08, + "loss": 0.0, + "num_input_tokens_seen": 76025160, + "step": 131040 + }, + { + "epoch": 19.518170985999404, + "grad_norm": 8.745824743527919e-06, + "learning_rate": 8.840218178442494e-08, + "loss": 0.0, + "num_input_tokens_seen": 76027912, + "step": 131045 + }, + { + "epoch": 19.518915698540365, + "grad_norm": 1.273039924853947e-05, + "learning_rate": 8.81293698236918e-08, + "loss": 0.0, + "num_input_tokens_seen": 76030952, + "step": 131050 + }, + { + "epoch": 19.519660411081322, + "grad_norm": 2.174897190343472e-06, + "learning_rate": 8.785697872369381e-08, + "loss": 0.0, + "num_input_tokens_seen": 76033896, + "step": 131055 + }, + { + "epoch": 19.520405123622282, + "grad_norm": 2.3151165805757046e-05, + "learning_rate": 8.758500848903283e-08, + "loss": 0.0, + "num_input_tokens_seen": 76036744, + "step": 131060 + }, + { + "epoch": 19.52114983616324, + "grad_norm": 2.608373324619606e-05, + "learning_rate": 8.731345912430245e-08, + "loss": 0.0, + "num_input_tokens_seen": 76040040, + "step": 131065 + }, + { + "epoch": 19.5218945487042, + "grad_norm": 4.643005468096817e-06, + "learning_rate": 8.704233063409339e-08, + "loss": 0.0, + "num_input_tokens_seen": 76043080, + "step": 131070 + }, + { + "epoch": 19.52263926124516, + "grad_norm": 5.543842689803569e-06, + "learning_rate": 8.677162302298258e-08, + "loss": 0.0, + "num_input_tokens_seen": 76045960, + "step": 131075 + }, + { + "epoch": 19.523383973786117, + "grad_norm": 2.769841557892505e-06, + "learning_rate": 8.650133629554413e-08, + "loss": 0.0, + "num_input_tokens_seen": 76048808, + "step": 131080 + }, + { + "epoch": 19.524128686327078, + "grad_norm": 1.9707051251316443e-05, + "learning_rate": 8.623147045634383e-08, + "loss": 0.0, + "num_input_tokens_seen": 76051720, + "step": 131085 + }, + { + "epoch": 19.52487339886804, + "grad_norm": 3.616991989474627e-06, + "learning_rate": 8.596202550994193e-08, + "loss": 0.0, + "num_input_tokens_seen": 76054696, + "step": 131090 + }, + { + "epoch": 19.525618111408996, + "grad_norm": 6.170138021843741e-06, + "learning_rate": 8.569300146089032e-08, + "loss": 0.0, + "num_input_tokens_seen": 76057672, + "step": 131095 + }, + { + "epoch": 19.526362823949956, + "grad_norm": 4.023360816063359e-06, + "learning_rate": 8.542439831373539e-08, + "loss": 0.0, + "num_input_tokens_seen": 76060520, + "step": 131100 + }, + { + "epoch": 19.527107536490913, + "grad_norm": 0.05362854525446892, + "learning_rate": 8.515621607301239e-08, + "loss": 0.0001, + "num_input_tokens_seen": 76063368, + "step": 131105 + }, + { + "epoch": 19.527852249031874, + "grad_norm": 1.6320856275342521e-06, + "learning_rate": 8.488845474325102e-08, + "loss": 0.0, + "num_input_tokens_seen": 76066056, + "step": 131110 + }, + { + "epoch": 19.528596961572834, + "grad_norm": 2.3251384391187457e-06, + "learning_rate": 8.462111432897823e-08, + "loss": 0.0, + "num_input_tokens_seen": 76069000, + "step": 131115 + }, + { + "epoch": 19.52934167411379, + "grad_norm": 5.4483458370668814e-05, + "learning_rate": 8.435419483470707e-08, + "loss": 0.0, + "num_input_tokens_seen": 76072264, + "step": 131120 + }, + { + "epoch": 19.53008638665475, + "grad_norm": 2.7936646347370697e-06, + "learning_rate": 8.408769626495061e-08, + "loss": 0.0001, + "num_input_tokens_seen": 76075400, + "step": 131125 + }, + { + "epoch": 19.530831099195712, + "grad_norm": 3.6474002627073787e-06, + "learning_rate": 8.382161862420801e-08, + "loss": 0.0, + "num_input_tokens_seen": 76078184, + "step": 131130 + }, + { + "epoch": 19.53157581173667, + "grad_norm": 0.00032185050076805055, + "learning_rate": 8.355596191697845e-08, + "loss": 0.0, + "num_input_tokens_seen": 76081000, + "step": 131135 + }, + { + "epoch": 19.53232052427763, + "grad_norm": 2.553558260842692e-06, + "learning_rate": 8.329072614774446e-08, + "loss": 0.0, + "num_input_tokens_seen": 76083816, + "step": 131140 + }, + { + "epoch": 19.533065236818587, + "grad_norm": 1.5265937690855935e-05, + "learning_rate": 8.302591132098857e-08, + "loss": 0.0, + "num_input_tokens_seen": 76086664, + "step": 131145 + }, + { + "epoch": 19.533809949359547, + "grad_norm": 1.8891288391387207e-06, + "learning_rate": 8.276151744118777e-08, + "loss": 0.0, + "num_input_tokens_seen": 76089352, + "step": 131150 + }, + { + "epoch": 19.534554661900508, + "grad_norm": 4.024457211926347e-06, + "learning_rate": 8.249754451280512e-08, + "loss": 0.0, + "num_input_tokens_seen": 76092072, + "step": 131155 + }, + { + "epoch": 19.535299374441465, + "grad_norm": 3.8341172512446065e-06, + "learning_rate": 8.223399254030095e-08, + "loss": 0.0, + "num_input_tokens_seen": 76095048, + "step": 131160 + }, + { + "epoch": 19.536044086982425, + "grad_norm": 2.493093006705749e-06, + "learning_rate": 8.197086152812728e-08, + "loss": 0.0, + "num_input_tokens_seen": 76097896, + "step": 131165 + }, + { + "epoch": 19.536788799523386, + "grad_norm": 2.407114379820996e-06, + "learning_rate": 8.17081514807333e-08, + "loss": 0.0, + "num_input_tokens_seen": 76100744, + "step": 131170 + }, + { + "epoch": 19.537533512064343, + "grad_norm": 2.8178940283396514e-06, + "learning_rate": 8.144586240255159e-08, + "loss": 0.0, + "num_input_tokens_seen": 76103560, + "step": 131175 + }, + { + "epoch": 19.538278224605303, + "grad_norm": 1.3195274732424878e-05, + "learning_rate": 8.118399429801749e-08, + "loss": 0.0, + "num_input_tokens_seen": 76106696, + "step": 131180 + }, + { + "epoch": 19.53902293714626, + "grad_norm": 2.9190150598878972e-06, + "learning_rate": 8.092254717155246e-08, + "loss": 0.0, + "num_input_tokens_seen": 76109672, + "step": 131185 + }, + { + "epoch": 19.53976764968722, + "grad_norm": 2.6848490506381495e-06, + "learning_rate": 8.066152102757518e-08, + "loss": 0.0, + "num_input_tokens_seen": 76112392, + "step": 131190 + }, + { + "epoch": 19.54051236222818, + "grad_norm": 2.3737673473078758e-05, + "learning_rate": 8.040091587049325e-08, + "loss": 0.0, + "num_input_tokens_seen": 76115528, + "step": 131195 + }, + { + "epoch": 19.54125707476914, + "grad_norm": 2.540061905165203e-05, + "learning_rate": 8.014073170471149e-08, + "loss": 0.0, + "num_input_tokens_seen": 76118568, + "step": 131200 + }, + { + "epoch": 19.5420017873101, + "grad_norm": 2.743158802331891e-06, + "learning_rate": 7.988096853462634e-08, + "loss": 0.0, + "num_input_tokens_seen": 76121768, + "step": 131205 + }, + { + "epoch": 19.542746499851056, + "grad_norm": 7.676879249629565e-06, + "learning_rate": 7.962162636462323e-08, + "loss": 0.0, + "num_input_tokens_seen": 76124456, + "step": 131210 + }, + { + "epoch": 19.543491212392016, + "grad_norm": 2.9947248094686074e-06, + "learning_rate": 7.936270519908473e-08, + "loss": 0.0, + "num_input_tokens_seen": 76127240, + "step": 131215 + }, + { + "epoch": 19.544235924932977, + "grad_norm": 1.5012618860055227e-05, + "learning_rate": 7.910420504238514e-08, + "loss": 0.0, + "num_input_tokens_seen": 76130056, + "step": 131220 + }, + { + "epoch": 19.544980637473934, + "grad_norm": 6.678641057078494e-06, + "learning_rate": 7.88461258988904e-08, + "loss": 0.0, + "num_input_tokens_seen": 76132776, + "step": 131225 + }, + { + "epoch": 19.545725350014894, + "grad_norm": 1.1253177945036441e-05, + "learning_rate": 7.858846777296369e-08, + "loss": 0.0, + "num_input_tokens_seen": 76135752, + "step": 131230 + }, + { + "epoch": 19.546470062555855, + "grad_norm": 5.799599421152379e-06, + "learning_rate": 7.833123066895432e-08, + "loss": 0.0, + "num_input_tokens_seen": 76138728, + "step": 131235 + }, + { + "epoch": 19.547214775096812, + "grad_norm": 2.8260587896511424e-06, + "learning_rate": 7.807441459121156e-08, + "loss": 0.0, + "num_input_tokens_seen": 76141800, + "step": 131240 + }, + { + "epoch": 19.547959487637772, + "grad_norm": 2.9770676519547123e-06, + "learning_rate": 7.781801954406809e-08, + "loss": 0.0, + "num_input_tokens_seen": 76144584, + "step": 131245 + }, + { + "epoch": 19.54870420017873, + "grad_norm": 0.0004564093833323568, + "learning_rate": 7.756204553186208e-08, + "loss": 0.0, + "num_input_tokens_seen": 76147560, + "step": 131250 + }, + { + "epoch": 19.54944891271969, + "grad_norm": 2.9889624784118496e-06, + "learning_rate": 7.730649255891509e-08, + "loss": 0.0, + "num_input_tokens_seen": 76150472, + "step": 131255 + }, + { + "epoch": 19.55019362526065, + "grad_norm": 4.561946752801305e-06, + "learning_rate": 7.705136062954587e-08, + "loss": 0.0, + "num_input_tokens_seen": 76153448, + "step": 131260 + }, + { + "epoch": 19.550938337801608, + "grad_norm": 6.853308877907693e-05, + "learning_rate": 7.679664974806212e-08, + "loss": 0.0, + "num_input_tokens_seen": 76156296, + "step": 131265 + }, + { + "epoch": 19.551683050342568, + "grad_norm": 2.4920593205024488e-05, + "learning_rate": 7.654235991876867e-08, + "loss": 0.0, + "num_input_tokens_seen": 76159176, + "step": 131270 + }, + { + "epoch": 19.55242776288353, + "grad_norm": 5.720314675272675e-06, + "learning_rate": 7.628849114596214e-08, + "loss": 0.0, + "num_input_tokens_seen": 76162120, + "step": 131275 + }, + { + "epoch": 19.553172475424486, + "grad_norm": 2.8129912607255392e-05, + "learning_rate": 7.603504343392798e-08, + "loss": 0.0, + "num_input_tokens_seen": 76164712, + "step": 131280 + }, + { + "epoch": 19.553917187965446, + "grad_norm": 1.9347055513208034e-06, + "learning_rate": 7.578201678694885e-08, + "loss": 0.0, + "num_input_tokens_seen": 76167560, + "step": 131285 + }, + { + "epoch": 19.554661900506403, + "grad_norm": 3.802897481364198e-06, + "learning_rate": 7.55294112093019e-08, + "loss": 0.0, + "num_input_tokens_seen": 76171048, + "step": 131290 + }, + { + "epoch": 19.555406613047364, + "grad_norm": 6.305349415924866e-06, + "learning_rate": 7.527722670525594e-08, + "loss": 0.0, + "num_input_tokens_seen": 76173992, + "step": 131295 + }, + { + "epoch": 19.556151325588324, + "grad_norm": 1.911815161292907e-06, + "learning_rate": 7.50254632790659e-08, + "loss": 0.0, + "num_input_tokens_seen": 76177160, + "step": 131300 + }, + { + "epoch": 19.55689603812928, + "grad_norm": 6.033320005371934e-06, + "learning_rate": 7.477412093498947e-08, + "loss": 0.0, + "num_input_tokens_seen": 76180104, + "step": 131305 + }, + { + "epoch": 19.55764075067024, + "grad_norm": 2.907233692894806e-06, + "learning_rate": 7.452319967727328e-08, + "loss": 0.0, + "num_input_tokens_seen": 76183304, + "step": 131310 + }, + { + "epoch": 19.558385463211202, + "grad_norm": 1.5644520317437127e-05, + "learning_rate": 7.427269951015004e-08, + "loss": 0.0, + "num_input_tokens_seen": 76186312, + "step": 131315 + }, + { + "epoch": 19.55913017575216, + "grad_norm": 2.4132089038175764e-06, + "learning_rate": 7.402262043785801e-08, + "loss": 0.0, + "num_input_tokens_seen": 76189128, + "step": 131320 + }, + { + "epoch": 19.55987488829312, + "grad_norm": 9.554160897096153e-06, + "learning_rate": 7.377296246462162e-08, + "loss": 0.0, + "num_input_tokens_seen": 76192264, + "step": 131325 + }, + { + "epoch": 19.560619600834077, + "grad_norm": 8.195197551685851e-06, + "learning_rate": 7.352372559465693e-08, + "loss": 0.0, + "num_input_tokens_seen": 76195208, + "step": 131330 + }, + { + "epoch": 19.561364313375037, + "grad_norm": 3.766615191125311e-05, + "learning_rate": 7.327490983217444e-08, + "loss": 0.0, + "num_input_tokens_seen": 76198024, + "step": 131335 + }, + { + "epoch": 19.562109025915998, + "grad_norm": 2.1647397261403967e-06, + "learning_rate": 7.302651518137638e-08, + "loss": 0.0, + "num_input_tokens_seen": 76200808, + "step": 131340 + }, + { + "epoch": 19.562853738456955, + "grad_norm": 2.829735603882e-05, + "learning_rate": 7.277854164646214e-08, + "loss": 0.0, + "num_input_tokens_seen": 76203560, + "step": 131345 + }, + { + "epoch": 19.563598450997915, + "grad_norm": 4.1981966205639765e-05, + "learning_rate": 7.253098923162005e-08, + "loss": 0.0, + "num_input_tokens_seen": 76206472, + "step": 131350 + }, + { + "epoch": 19.564343163538872, + "grad_norm": 2.6920006348518655e-06, + "learning_rate": 7.22838579410301e-08, + "loss": 0.0, + "num_input_tokens_seen": 76209416, + "step": 131355 + }, + { + "epoch": 19.565087876079833, + "grad_norm": 1.7768304587661987e-06, + "learning_rate": 7.20371477788695e-08, + "loss": 0.0, + "num_input_tokens_seen": 76212200, + "step": 131360 + }, + { + "epoch": 19.565832588620793, + "grad_norm": 5.925491223024437e-06, + "learning_rate": 7.179085874930713e-08, + "loss": 0.0001, + "num_input_tokens_seen": 76215080, + "step": 131365 + }, + { + "epoch": 19.56657730116175, + "grad_norm": 3.382763225090457e-06, + "learning_rate": 7.154499085650079e-08, + "loss": 0.0, + "num_input_tokens_seen": 76217960, + "step": 131370 + }, + { + "epoch": 19.56732201370271, + "grad_norm": 1.6883750504348427e-05, + "learning_rate": 7.129954410460548e-08, + "loss": 0.0, + "num_input_tokens_seen": 76221192, + "step": 131375 + }, + { + "epoch": 19.56806672624367, + "grad_norm": 4.5495009544538334e-05, + "learning_rate": 7.105451849777067e-08, + "loss": 0.0, + "num_input_tokens_seen": 76224200, + "step": 131380 + }, + { + "epoch": 19.56881143878463, + "grad_norm": 0.0004348405927885324, + "learning_rate": 7.080991404012915e-08, + "loss": 0.0, + "num_input_tokens_seen": 76227336, + "step": 131385 + }, + { + "epoch": 19.56955615132559, + "grad_norm": 2.8343999929347774e-06, + "learning_rate": 7.056573073581929e-08, + "loss": 0.0, + "num_input_tokens_seen": 76230184, + "step": 131390 + }, + { + "epoch": 19.570300863866546, + "grad_norm": 3.879747509927256e-06, + "learning_rate": 7.032196858896279e-08, + "loss": 0.0, + "num_input_tokens_seen": 76233096, + "step": 131395 + }, + { + "epoch": 19.571045576407506, + "grad_norm": 2.6334901122027077e-06, + "learning_rate": 7.007862760368133e-08, + "loss": 0.0, + "num_input_tokens_seen": 76236104, + "step": 131400 + }, + { + "epoch": 19.571790288948467, + "grad_norm": 2.1643052150466247e-06, + "learning_rate": 6.983570778408277e-08, + "loss": 0.0, + "num_input_tokens_seen": 76238984, + "step": 131405 + }, + { + "epoch": 19.572535001489424, + "grad_norm": 1.5125229765544645e-05, + "learning_rate": 6.959320913427492e-08, + "loss": 0.0, + "num_input_tokens_seen": 76242184, + "step": 131410 + }, + { + "epoch": 19.573279714030384, + "grad_norm": 0.00016989339201245457, + "learning_rate": 6.935113165834616e-08, + "loss": 0.0, + "num_input_tokens_seen": 76245288, + "step": 131415 + }, + { + "epoch": 19.574024426571345, + "grad_norm": 1.628322570468299e-05, + "learning_rate": 6.910947536039603e-08, + "loss": 0.0022, + "num_input_tokens_seen": 76248072, + "step": 131420 + }, + { + "epoch": 19.574769139112302, + "grad_norm": 5.9186286307522096e-06, + "learning_rate": 6.886824024450178e-08, + "loss": 0.0, + "num_input_tokens_seen": 76251304, + "step": 131425 + }, + { + "epoch": 19.575513851653263, + "grad_norm": 4.233964773447951e-06, + "learning_rate": 6.862742631473795e-08, + "loss": 0.0, + "num_input_tokens_seen": 76254312, + "step": 131430 + }, + { + "epoch": 19.57625856419422, + "grad_norm": 0.00034616290940903127, + "learning_rate": 6.838703357517628e-08, + "loss": 0.0, + "num_input_tokens_seen": 76257224, + "step": 131435 + }, + { + "epoch": 19.57700327673518, + "grad_norm": 4.089987669431139e-06, + "learning_rate": 6.814706202987465e-08, + "loss": 0.0, + "num_input_tokens_seen": 76259912, + "step": 131440 + }, + { + "epoch": 19.57774798927614, + "grad_norm": 4.187047579762293e-06, + "learning_rate": 6.79075116828909e-08, + "loss": 0.0, + "num_input_tokens_seen": 76263048, + "step": 131445 + }, + { + "epoch": 19.578492701817098, + "grad_norm": 2.5101555365836248e-05, + "learning_rate": 6.766838253826902e-08, + "loss": 0.0, + "num_input_tokens_seen": 76266088, + "step": 131450 + }, + { + "epoch": 19.579237414358058, + "grad_norm": 7.335805548791541e-06, + "learning_rate": 6.742967460005023e-08, + "loss": 0.0, + "num_input_tokens_seen": 76268744, + "step": 131455 + }, + { + "epoch": 19.57998212689902, + "grad_norm": 4.183122200629441e-06, + "learning_rate": 6.719138787226464e-08, + "loss": 0.0, + "num_input_tokens_seen": 76271752, + "step": 131460 + }, + { + "epoch": 19.580726839439976, + "grad_norm": 7.5781322266266216e-06, + "learning_rate": 6.695352235894237e-08, + "loss": 0.0, + "num_input_tokens_seen": 76274472, + "step": 131465 + }, + { + "epoch": 19.581471551980936, + "grad_norm": 2.216573875557515e-06, + "learning_rate": 6.671607806409963e-08, + "loss": 0.0, + "num_input_tokens_seen": 76277128, + "step": 131470 + }, + { + "epoch": 19.582216264521893, + "grad_norm": 3.3037408684322145e-06, + "learning_rate": 6.647905499174712e-08, + "loss": 0.0, + "num_input_tokens_seen": 76280136, + "step": 131475 + }, + { + "epoch": 19.582960977062854, + "grad_norm": 5.2212867558409926e-06, + "learning_rate": 6.624245314588994e-08, + "loss": 0.0, + "num_input_tokens_seen": 76283272, + "step": 131480 + }, + { + "epoch": 19.583705689603814, + "grad_norm": 3.042151001864113e-05, + "learning_rate": 6.600627253052216e-08, + "loss": 0.0, + "num_input_tokens_seen": 76286024, + "step": 131485 + }, + { + "epoch": 19.58445040214477, + "grad_norm": 4.2910878619295545e-06, + "learning_rate": 6.577051314964055e-08, + "loss": 0.0, + "num_input_tokens_seen": 76289000, + "step": 131490 + }, + { + "epoch": 19.58519511468573, + "grad_norm": 5.765175956184976e-05, + "learning_rate": 6.55351750072225e-08, + "loss": 0.0, + "num_input_tokens_seen": 76291720, + "step": 131495 + }, + { + "epoch": 19.58593982722669, + "grad_norm": 2.6817073376150802e-05, + "learning_rate": 6.530025810724539e-08, + "loss": 0.0, + "num_input_tokens_seen": 76294504, + "step": 131500 + }, + { + "epoch": 19.58668453976765, + "grad_norm": 0.0003141641791444272, + "learning_rate": 6.506576245367824e-08, + "loss": 0.0, + "num_input_tokens_seen": 76297160, + "step": 131505 + }, + { + "epoch": 19.58742925230861, + "grad_norm": 0.00022507183894049376, + "learning_rate": 6.483168805047901e-08, + "loss": 0.0, + "num_input_tokens_seen": 76300136, + "step": 131510 + }, + { + "epoch": 19.588173964849567, + "grad_norm": 6.519910584756872e-06, + "learning_rate": 6.459803490160843e-08, + "loss": 0.0, + "num_input_tokens_seen": 76303080, + "step": 131515 + }, + { + "epoch": 19.588918677390527, + "grad_norm": 3.229128196835518e-06, + "learning_rate": 6.436480301101055e-08, + "loss": 0.0, + "num_input_tokens_seen": 76305736, + "step": 131520 + }, + { + "epoch": 19.589663389931488, + "grad_norm": 0.0003103617927990854, + "learning_rate": 6.413199238262668e-08, + "loss": 0.0, + "num_input_tokens_seen": 76308616, + "step": 131525 + }, + { + "epoch": 19.590408102472445, + "grad_norm": 3.78429740521824e-06, + "learning_rate": 6.389960302038978e-08, + "loss": 0.0, + "num_input_tokens_seen": 76311336, + "step": 131530 + }, + { + "epoch": 19.591152815013405, + "grad_norm": 3.7700986013078364e-06, + "learning_rate": 6.366763492822448e-08, + "loss": 0.0, + "num_input_tokens_seen": 76313992, + "step": 131535 + }, + { + "epoch": 19.591897527554362, + "grad_norm": 2.2259640900301747e-05, + "learning_rate": 6.343608811004986e-08, + "loss": 0.0, + "num_input_tokens_seen": 76317064, + "step": 131540 + }, + { + "epoch": 19.592642240095323, + "grad_norm": 1.116235216613859e-05, + "learning_rate": 6.320496256977671e-08, + "loss": 0.0, + "num_input_tokens_seen": 76320200, + "step": 131545 + }, + { + "epoch": 19.593386952636283, + "grad_norm": 2.378464841967798e-06, + "learning_rate": 6.297425831131299e-08, + "loss": 0.0, + "num_input_tokens_seen": 76323208, + "step": 131550 + }, + { + "epoch": 19.59413166517724, + "grad_norm": 3.603088407544419e-05, + "learning_rate": 6.274397533855281e-08, + "loss": 0.0, + "num_input_tokens_seen": 76326152, + "step": 131555 + }, + { + "epoch": 19.5948763777182, + "grad_norm": 4.895908659818815e-06, + "learning_rate": 6.251411365539029e-08, + "loss": 0.0, + "num_input_tokens_seen": 76329032, + "step": 131560 + }, + { + "epoch": 19.59562109025916, + "grad_norm": 1.2906953998026438e-05, + "learning_rate": 6.228467326570286e-08, + "loss": 0.0, + "num_input_tokens_seen": 76331912, + "step": 131565 + }, + { + "epoch": 19.59636580280012, + "grad_norm": 2.1601797470793827e-06, + "learning_rate": 6.205565417337356e-08, + "loss": 0.0, + "num_input_tokens_seen": 76334696, + "step": 131570 + }, + { + "epoch": 19.59711051534108, + "grad_norm": 2.420564896965516e-06, + "learning_rate": 6.182705638226872e-08, + "loss": 0.0, + "num_input_tokens_seen": 76337544, + "step": 131575 + }, + { + "epoch": 19.597855227882036, + "grad_norm": 8.278456334664952e-06, + "learning_rate": 6.159887989624635e-08, + "loss": 0.0, + "num_input_tokens_seen": 76340296, + "step": 131580 + }, + { + "epoch": 19.598599940422996, + "grad_norm": 4.947549314238131e-06, + "learning_rate": 6.137112471916729e-08, + "loss": 0.0, + "num_input_tokens_seen": 76342984, + "step": 131585 + }, + { + "epoch": 19.599344652963957, + "grad_norm": 9.663270247983746e-06, + "learning_rate": 6.114379085487565e-08, + "loss": 0.0, + "num_input_tokens_seen": 76345672, + "step": 131590 + }, + { + "epoch": 19.600089365504914, + "grad_norm": 4.887766408501193e-06, + "learning_rate": 6.091687830721282e-08, + "loss": 0.0, + "num_input_tokens_seen": 76348296, + "step": 131595 + }, + { + "epoch": 19.600834078045875, + "grad_norm": 8.23331720312126e-05, + "learning_rate": 6.069038708001462e-08, + "loss": 0.0, + "num_input_tokens_seen": 76351528, + "step": 131600 + }, + { + "epoch": 19.601578790586835, + "grad_norm": 2.4852029127941933e-06, + "learning_rate": 6.046431717710299e-08, + "loss": 0.0, + "num_input_tokens_seen": 76354376, + "step": 131605 + }, + { + "epoch": 19.602323503127792, + "grad_norm": 3.872509296343196e-06, + "learning_rate": 6.023866860229988e-08, + "loss": 0.0012, + "num_input_tokens_seen": 76357320, + "step": 131610 + }, + { + "epoch": 19.603068215668753, + "grad_norm": 7.558685410913313e-06, + "learning_rate": 6.001344135941611e-08, + "loss": 0.0, + "num_input_tokens_seen": 76360072, + "step": 131615 + }, + { + "epoch": 19.60381292820971, + "grad_norm": 1.5986883227014914e-05, + "learning_rate": 5.9788635452257e-08, + "loss": 0.0, + "num_input_tokens_seen": 76362888, + "step": 131620 + }, + { + "epoch": 19.60455764075067, + "grad_norm": 0.00028703297721222043, + "learning_rate": 5.9564250884622255e-08, + "loss": 0.0, + "num_input_tokens_seen": 76365992, + "step": 131625 + }, + { + "epoch": 19.60530235329163, + "grad_norm": 3.155489139317069e-06, + "learning_rate": 5.934028766030053e-08, + "loss": 0.0, + "num_input_tokens_seen": 76368904, + "step": 131630 + }, + { + "epoch": 19.606047065832588, + "grad_norm": 2.4601624772913055e-06, + "learning_rate": 5.911674578307491e-08, + "loss": 0.0, + "num_input_tokens_seen": 76372008, + "step": 131635 + }, + { + "epoch": 19.606791778373548, + "grad_norm": 4.202391210128553e-06, + "learning_rate": 5.88936252567257e-08, + "loss": 0.0, + "num_input_tokens_seen": 76374600, + "step": 131640 + }, + { + "epoch": 19.60753649091451, + "grad_norm": 0.0006925089983269572, + "learning_rate": 5.8670926085016564e-08, + "loss": 0.0, + "num_input_tokens_seen": 76377384, + "step": 131645 + }, + { + "epoch": 19.608281203455466, + "grad_norm": 2.2432245714298915e-06, + "learning_rate": 5.8448648271713925e-08, + "loss": 0.0, + "num_input_tokens_seen": 76380424, + "step": 131650 + }, + { + "epoch": 19.609025915996426, + "grad_norm": 2.5893878046190366e-06, + "learning_rate": 5.822679182057311e-08, + "loss": 0.0, + "num_input_tokens_seen": 76383112, + "step": 131655 + }, + { + "epoch": 19.609770628537383, + "grad_norm": 4.324545443523675e-06, + "learning_rate": 5.8005356735341135e-08, + "loss": 0.0, + "num_input_tokens_seen": 76385864, + "step": 131660 + }, + { + "epoch": 19.610515341078344, + "grad_norm": 5.83091787120793e-05, + "learning_rate": 5.7784343019759436e-08, + "loss": 0.0, + "num_input_tokens_seen": 76388584, + "step": 131665 + }, + { + "epoch": 19.611260053619304, + "grad_norm": 3.970214947912609e-06, + "learning_rate": 5.756375067755837e-08, + "loss": 0.0, + "num_input_tokens_seen": 76391528, + "step": 131670 + }, + { + "epoch": 19.61200476616026, + "grad_norm": 6.435674094973365e-06, + "learning_rate": 5.7343579712468286e-08, + "loss": 0.0, + "num_input_tokens_seen": 76394280, + "step": 131675 + }, + { + "epoch": 19.61274947870122, + "grad_norm": 6.226855475688353e-06, + "learning_rate": 5.712383012820843e-08, + "loss": 0.0, + "num_input_tokens_seen": 76397736, + "step": 131680 + }, + { + "epoch": 19.613494191242182, + "grad_norm": 4.405144409247441e-06, + "learning_rate": 5.6904501928489726e-08, + "loss": 0.0, + "num_input_tokens_seen": 76401032, + "step": 131685 + }, + { + "epoch": 19.61423890378314, + "grad_norm": 1.0077156730403658e-05, + "learning_rate": 5.668559511702032e-08, + "loss": 0.0, + "num_input_tokens_seen": 76403528, + "step": 131690 + }, + { + "epoch": 19.6149836163241, + "grad_norm": 5.01357590110274e-06, + "learning_rate": 5.646710969749447e-08, + "loss": 0.0, + "num_input_tokens_seen": 76406344, + "step": 131695 + }, + { + "epoch": 19.615728328865057, + "grad_norm": 8.13589504105039e-05, + "learning_rate": 5.6249045673606446e-08, + "loss": 0.0, + "num_input_tokens_seen": 76409544, + "step": 131700 + }, + { + "epoch": 19.616473041406017, + "grad_norm": 7.732406629656907e-06, + "learning_rate": 5.603140304903942e-08, + "loss": 0.0, + "num_input_tokens_seen": 76412424, + "step": 131705 + }, + { + "epoch": 19.617217753946978, + "grad_norm": 1.0294498679286335e-05, + "learning_rate": 5.581418182746823e-08, + "loss": 0.0, + "num_input_tokens_seen": 76415272, + "step": 131710 + }, + { + "epoch": 19.617962466487935, + "grad_norm": 3.416933395783417e-05, + "learning_rate": 5.55973820125677e-08, + "loss": 0.0, + "num_input_tokens_seen": 76418056, + "step": 131715 + }, + { + "epoch": 19.618707179028895, + "grad_norm": 0.00024621328338980675, + "learning_rate": 5.538100360799325e-08, + "loss": 0.0, + "num_input_tokens_seen": 76421256, + "step": 131720 + }, + { + "epoch": 19.619451891569852, + "grad_norm": 4.529535726760514e-05, + "learning_rate": 5.516504661740585e-08, + "loss": 0.0, + "num_input_tokens_seen": 76423912, + "step": 131725 + }, + { + "epoch": 19.620196604110813, + "grad_norm": 0.00041364607750438154, + "learning_rate": 5.494951104445256e-08, + "loss": 0.0, + "num_input_tokens_seen": 76426760, + "step": 131730 + }, + { + "epoch": 19.620941316651773, + "grad_norm": 7.282409114850452e-06, + "learning_rate": 5.473439689277493e-08, + "loss": 0.0, + "num_input_tokens_seen": 76429736, + "step": 131735 + }, + { + "epoch": 19.62168602919273, + "grad_norm": 0.00011054707283619791, + "learning_rate": 5.451970416600338e-08, + "loss": 0.0, + "num_input_tokens_seen": 76432712, + "step": 131740 + }, + { + "epoch": 19.62243074173369, + "grad_norm": 9.200692147715017e-05, + "learning_rate": 5.430543286777112e-08, + "loss": 0.0, + "num_input_tokens_seen": 76435752, + "step": 131745 + }, + { + "epoch": 19.62317545427465, + "grad_norm": 1.923564695971436e-06, + "learning_rate": 5.4091583001691923e-08, + "loss": 0.0, + "num_input_tokens_seen": 76438664, + "step": 131750 + }, + { + "epoch": 19.62392016681561, + "grad_norm": 2.620063241920434e-06, + "learning_rate": 5.387815457138512e-08, + "loss": 0.0, + "num_input_tokens_seen": 76441544, + "step": 131755 + }, + { + "epoch": 19.62466487935657, + "grad_norm": 3.32130330207292e-05, + "learning_rate": 5.3665147580450604e-08, + "loss": 0.0, + "num_input_tokens_seen": 76444360, + "step": 131760 + }, + { + "epoch": 19.625409591897526, + "grad_norm": 1.0259492228215095e-05, + "learning_rate": 5.3452562032488275e-08, + "loss": 0.0, + "num_input_tokens_seen": 76447144, + "step": 131765 + }, + { + "epoch": 19.626154304438487, + "grad_norm": 1.6985854017548263e-05, + "learning_rate": 5.324039793109248e-08, + "loss": 0.0, + "num_input_tokens_seen": 76449928, + "step": 131770 + }, + { + "epoch": 19.626899016979447, + "grad_norm": 3.4105244139936985e-06, + "learning_rate": 5.302865527984369e-08, + "loss": 0.0, + "num_input_tokens_seen": 76452744, + "step": 131775 + }, + { + "epoch": 19.627643729520404, + "grad_norm": 4.424988674145425e-06, + "learning_rate": 5.281733408232237e-08, + "loss": 0.0, + "num_input_tokens_seen": 76455528, + "step": 131780 + }, + { + "epoch": 19.628388442061365, + "grad_norm": 8.859263471094891e-05, + "learning_rate": 5.2606434342095115e-08, + "loss": 0.0, + "num_input_tokens_seen": 76458216, + "step": 131785 + }, + { + "epoch": 19.629133154602325, + "grad_norm": 2.481654519215226e-06, + "learning_rate": 5.2395956062728515e-08, + "loss": 0.0, + "num_input_tokens_seen": 76461128, + "step": 131790 + }, + { + "epoch": 19.629877867143282, + "grad_norm": 2.887618938984815e-05, + "learning_rate": 5.218589924777528e-08, + "loss": 0.0, + "num_input_tokens_seen": 76464360, + "step": 131795 + }, + { + "epoch": 19.630622579684243, + "grad_norm": 1.8714283669396536e-06, + "learning_rate": 5.1976263900788136e-08, + "loss": 0.0, + "num_input_tokens_seen": 76467304, + "step": 131800 + }, + { + "epoch": 19.6313672922252, + "grad_norm": 9.03079126146622e-06, + "learning_rate": 5.176705002530313e-08, + "loss": 0.0, + "num_input_tokens_seen": 76470216, + "step": 131805 + }, + { + "epoch": 19.63211200476616, + "grad_norm": 2.0264610611775424e-06, + "learning_rate": 5.155825762485911e-08, + "loss": 0.0, + "num_input_tokens_seen": 76473096, + "step": 131810 + }, + { + "epoch": 19.63285671730712, + "grad_norm": 4.9246768867305946e-06, + "learning_rate": 5.134988670298102e-08, + "loss": 0.0, + "num_input_tokens_seen": 76475784, + "step": 131815 + }, + { + "epoch": 19.633601429848078, + "grad_norm": 3.2437681056762813e-06, + "learning_rate": 5.1141937263188276e-08, + "loss": 0.0, + "num_input_tokens_seen": 76478696, + "step": 131820 + }, + { + "epoch": 19.634346142389038, + "grad_norm": 1.3678298273589462e-05, + "learning_rate": 5.093440930899751e-08, + "loss": 0.0, + "num_input_tokens_seen": 76481448, + "step": 131825 + }, + { + "epoch": 19.63509085493, + "grad_norm": 2.168017817893997e-05, + "learning_rate": 5.072730284391425e-08, + "loss": 0.0, + "num_input_tokens_seen": 76484328, + "step": 131830 + }, + { + "epoch": 19.635835567470956, + "grad_norm": 4.226027158438228e-06, + "learning_rate": 5.0520617871432916e-08, + "loss": 0.0, + "num_input_tokens_seen": 76487208, + "step": 131835 + }, + { + "epoch": 19.636580280011916, + "grad_norm": 4.852030542679131e-06, + "learning_rate": 5.0314354395050724e-08, + "loss": 0.0, + "num_input_tokens_seen": 76490088, + "step": 131840 + }, + { + "epoch": 19.637324992552873, + "grad_norm": 2.2114329567557434e-06, + "learning_rate": 5.010851241824821e-08, + "loss": 0.0, + "num_input_tokens_seen": 76493064, + "step": 131845 + }, + { + "epoch": 19.638069705093834, + "grad_norm": 4.291989625926362e-06, + "learning_rate": 4.990309194450593e-08, + "loss": 0.0, + "num_input_tokens_seen": 76496008, + "step": 131850 + }, + { + "epoch": 19.638814417634794, + "grad_norm": 4.4959115257370286e-06, + "learning_rate": 4.9698092977290556e-08, + "loss": 0.0, + "num_input_tokens_seen": 76498664, + "step": 131855 + }, + { + "epoch": 19.63955913017575, + "grad_norm": 7.216710855573183e-06, + "learning_rate": 4.9493515520068754e-08, + "loss": 0.0, + "num_input_tokens_seen": 76501384, + "step": 131860 + }, + { + "epoch": 19.640303842716712, + "grad_norm": 3.0767919270147104e-06, + "learning_rate": 4.92893595762961e-08, + "loss": 0.0, + "num_input_tokens_seen": 76504168, + "step": 131865 + }, + { + "epoch": 19.64104855525767, + "grad_norm": 0.0009365396690554917, + "learning_rate": 4.908562514941983e-08, + "loss": 0.0, + "num_input_tokens_seen": 76506888, + "step": 131870 + }, + { + "epoch": 19.64179326779863, + "grad_norm": 4.2816072891582735e-06, + "learning_rate": 4.888231224288442e-08, + "loss": 0.0, + "num_input_tokens_seen": 76510056, + "step": 131875 + }, + { + "epoch": 19.64253798033959, + "grad_norm": 2.6025012630270794e-05, + "learning_rate": 4.867942086012045e-08, + "loss": 0.0, + "num_input_tokens_seen": 76512840, + "step": 131880 + }, + { + "epoch": 19.643282692880547, + "grad_norm": 4.084044121555053e-05, + "learning_rate": 4.847695100456129e-08, + "loss": 0.0, + "num_input_tokens_seen": 76515880, + "step": 131885 + }, + { + "epoch": 19.644027405421507, + "grad_norm": 4.989095486962469e-06, + "learning_rate": 4.8274902679623644e-08, + "loss": 0.0, + "num_input_tokens_seen": 76518952, + "step": 131890 + }, + { + "epoch": 19.644772117962468, + "grad_norm": 6.960705377423437e-06, + "learning_rate": 4.807327588871868e-08, + "loss": 0.0, + "num_input_tokens_seen": 76522312, + "step": 131895 + }, + { + "epoch": 19.645516830503425, + "grad_norm": 2.6807128961081617e-06, + "learning_rate": 4.7872070635260333e-08, + "loss": 0.0, + "num_input_tokens_seen": 76524936, + "step": 131900 + }, + { + "epoch": 19.646261543044385, + "grad_norm": 1.2762520782416686e-05, + "learning_rate": 4.7671286922640335e-08, + "loss": 0.0, + "num_input_tokens_seen": 76527656, + "step": 131905 + }, + { + "epoch": 19.647006255585342, + "grad_norm": 2.4249047783087008e-06, + "learning_rate": 4.7470924754253184e-08, + "loss": 0.0, + "num_input_tokens_seen": 76530600, + "step": 131910 + }, + { + "epoch": 19.647750968126303, + "grad_norm": 2.5725334126036614e-05, + "learning_rate": 4.727098413348785e-08, + "loss": 0.0, + "num_input_tokens_seen": 76533320, + "step": 131915 + }, + { + "epoch": 19.648495680667263, + "grad_norm": 0.00043900986202061176, + "learning_rate": 4.707146506371385e-08, + "loss": 0.0, + "num_input_tokens_seen": 76536296, + "step": 131920 + }, + { + "epoch": 19.64924039320822, + "grad_norm": 3.4285237688891357e-06, + "learning_rate": 4.6872367548309036e-08, + "loss": 0.0, + "num_input_tokens_seen": 76539304, + "step": 131925 + }, + { + "epoch": 19.64998510574918, + "grad_norm": 1.8215703221358126e-06, + "learning_rate": 4.6673691590634614e-08, + "loss": 0.0, + "num_input_tokens_seen": 76542056, + "step": 131930 + }, + { + "epoch": 19.65072981829014, + "grad_norm": 1.245191742782481e-05, + "learning_rate": 4.6475437194046237e-08, + "loss": 0.0, + "num_input_tokens_seen": 76545032, + "step": 131935 + }, + { + "epoch": 19.6514745308311, + "grad_norm": 0.00013357054558582604, + "learning_rate": 4.627760436189121e-08, + "loss": 0.0, + "num_input_tokens_seen": 76547816, + "step": 131940 + }, + { + "epoch": 19.65221924337206, + "grad_norm": 3.723517920661834e-06, + "learning_rate": 4.6080193097516877e-08, + "loss": 0.0, + "num_input_tokens_seen": 76550920, + "step": 131945 + }, + { + "epoch": 19.652963955913016, + "grad_norm": 5.666605375154177e-06, + "learning_rate": 4.588320340425667e-08, + "loss": 0.0, + "num_input_tokens_seen": 76553544, + "step": 131950 + }, + { + "epoch": 19.653708668453977, + "grad_norm": 1.8421125105305691e-06, + "learning_rate": 4.5686635285432934e-08, + "loss": 0.0, + "num_input_tokens_seen": 76556360, + "step": 131955 + }, + { + "epoch": 19.654453380994937, + "grad_norm": 1.940905349329114e-06, + "learning_rate": 4.5490488744376335e-08, + "loss": 0.0, + "num_input_tokens_seen": 76559112, + "step": 131960 + }, + { + "epoch": 19.655198093535894, + "grad_norm": 1.7647254253461142e-06, + "learning_rate": 4.529476378439257e-08, + "loss": 0.0, + "num_input_tokens_seen": 76561864, + "step": 131965 + }, + { + "epoch": 19.655942806076855, + "grad_norm": 2.9797342904203106e-06, + "learning_rate": 4.50994604087901e-08, + "loss": 0.0, + "num_input_tokens_seen": 76564648, + "step": 131970 + }, + { + "epoch": 19.656687518617815, + "grad_norm": 0.0009211835567839444, + "learning_rate": 4.490457862087183e-08, + "loss": 0.0, + "num_input_tokens_seen": 76567592, + "step": 131975 + }, + { + "epoch": 19.657432231158772, + "grad_norm": 8.260573849838693e-06, + "learning_rate": 4.471011842392403e-08, + "loss": 0.0, + "num_input_tokens_seen": 76570472, + "step": 131980 + }, + { + "epoch": 19.658176943699733, + "grad_norm": 5.82303300689091e-06, + "learning_rate": 4.451607982123851e-08, + "loss": 0.0, + "num_input_tokens_seen": 76573192, + "step": 131985 + }, + { + "epoch": 19.65892165624069, + "grad_norm": 7.770564843667671e-05, + "learning_rate": 4.432246281609042e-08, + "loss": 0.0, + "num_input_tokens_seen": 76576008, + "step": 131990 + }, + { + "epoch": 19.65966636878165, + "grad_norm": 2.463985538270208e-06, + "learning_rate": 4.4129267411749386e-08, + "loss": 0.0, + "num_input_tokens_seen": 76578856, + "step": 131995 + }, + { + "epoch": 19.66041108132261, + "grad_norm": 8.528682883479632e-06, + "learning_rate": 4.393649361147944e-08, + "loss": 0.0, + "num_input_tokens_seen": 76581928, + "step": 132000 + }, + { + "epoch": 19.661155793863568, + "grad_norm": 2.9101138352416456e-06, + "learning_rate": 4.374414141853911e-08, + "loss": 0.0, + "num_input_tokens_seen": 76585000, + "step": 132005 + }, + { + "epoch": 19.66190050640453, + "grad_norm": 0.00046806767932139337, + "learning_rate": 4.355221083617578e-08, + "loss": 0.0, + "num_input_tokens_seen": 76587816, + "step": 132010 + }, + { + "epoch": 19.662645218945485, + "grad_norm": 2.038032789641875e-06, + "learning_rate": 4.336070186763685e-08, + "loss": 0.0, + "num_input_tokens_seen": 76590728, + "step": 132015 + }, + { + "epoch": 19.663389931486446, + "grad_norm": 3.673048922792077e-06, + "learning_rate": 4.316961451615031e-08, + "loss": 0.0, + "num_input_tokens_seen": 76593640, + "step": 132020 + }, + { + "epoch": 19.664134644027406, + "grad_norm": 7.4004965426865965e-06, + "learning_rate": 4.297894878494968e-08, + "loss": 0.0, + "num_input_tokens_seen": 76596488, + "step": 132025 + }, + { + "epoch": 19.664879356568363, + "grad_norm": 1.0555810149526224e-05, + "learning_rate": 4.27887046772546e-08, + "loss": 0.0, + "num_input_tokens_seen": 76599336, + "step": 132030 + }, + { + "epoch": 19.665624069109324, + "grad_norm": 1.4157916666590609e-05, + "learning_rate": 4.2598882196279165e-08, + "loss": 0.0, + "num_input_tokens_seen": 76602216, + "step": 132035 + }, + { + "epoch": 19.666368781650284, + "grad_norm": 3.972579179389868e-06, + "learning_rate": 4.240948134522915e-08, + "loss": 0.0, + "num_input_tokens_seen": 76605384, + "step": 132040 + }, + { + "epoch": 19.66711349419124, + "grad_norm": 1.6261939890682697e-05, + "learning_rate": 4.2220502127304775e-08, + "loss": 0.0, + "num_input_tokens_seen": 76608392, + "step": 132045 + }, + { + "epoch": 19.667858206732202, + "grad_norm": 2.348088173675933e-06, + "learning_rate": 4.2031944545700696e-08, + "loss": 0.0, + "num_input_tokens_seen": 76611496, + "step": 132050 + }, + { + "epoch": 19.66860291927316, + "grad_norm": 5.117562977829948e-06, + "learning_rate": 4.184380860360049e-08, + "loss": 0.0, + "num_input_tokens_seen": 76614504, + "step": 132055 + }, + { + "epoch": 19.66934763181412, + "grad_norm": 2.6814006560016423e-05, + "learning_rate": 4.165609430418216e-08, + "loss": 0.0, + "num_input_tokens_seen": 76617352, + "step": 132060 + }, + { + "epoch": 19.67009234435508, + "grad_norm": 0.05989944562315941, + "learning_rate": 4.1468801650618175e-08, + "loss": 0.0, + "num_input_tokens_seen": 76620264, + "step": 132065 + }, + { + "epoch": 19.670837056896037, + "grad_norm": 7.962209565448575e-06, + "learning_rate": 4.128193064606989e-08, + "loss": 0.0, + "num_input_tokens_seen": 76623400, + "step": 132070 + }, + { + "epoch": 19.671581769436997, + "grad_norm": 4.421645826369058e-06, + "learning_rate": 4.1095481293698665e-08, + "loss": 0.0, + "num_input_tokens_seen": 76626184, + "step": 132075 + }, + { + "epoch": 19.672326481977958, + "grad_norm": 2.115607685482246e-06, + "learning_rate": 4.0909453596651995e-08, + "loss": 0.0, + "num_input_tokens_seen": 76629000, + "step": 132080 + }, + { + "epoch": 19.673071194518915, + "grad_norm": 2.6136992801184533e-06, + "learning_rate": 4.0723847558071795e-08, + "loss": 0.0, + "num_input_tokens_seen": 76632040, + "step": 132085 + }, + { + "epoch": 19.673815907059875, + "grad_norm": 0.0006343568675220013, + "learning_rate": 4.0538663181097224e-08, + "loss": 0.0, + "num_input_tokens_seen": 76635176, + "step": 132090 + }, + { + "epoch": 19.674560619600832, + "grad_norm": 1.137416620622389e-05, + "learning_rate": 4.035390046885079e-08, + "loss": 0.0, + "num_input_tokens_seen": 76638152, + "step": 132095 + }, + { + "epoch": 19.675305332141793, + "grad_norm": 0.00020225033222232014, + "learning_rate": 4.016955942446055e-08, + "loss": 0.0, + "num_input_tokens_seen": 76641096, + "step": 132100 + }, + { + "epoch": 19.676050044682754, + "grad_norm": 3.098715023952536e-05, + "learning_rate": 3.9985640051035114e-08, + "loss": 0.0, + "num_input_tokens_seen": 76644040, + "step": 132105 + }, + { + "epoch": 19.67679475722371, + "grad_norm": 2.508194484107662e-06, + "learning_rate": 3.980214235168589e-08, + "loss": 0.0, + "num_input_tokens_seen": 76647400, + "step": 132110 + }, + { + "epoch": 19.67753946976467, + "grad_norm": 2.401412530161906e-06, + "learning_rate": 3.96190663295104e-08, + "loss": 0.0, + "num_input_tokens_seen": 76650440, + "step": 132115 + }, + { + "epoch": 19.67828418230563, + "grad_norm": 6.2356621128856204e-06, + "learning_rate": 3.943641198760062e-08, + "loss": 0.0, + "num_input_tokens_seen": 76653384, + "step": 132120 + }, + { + "epoch": 19.67902889484659, + "grad_norm": 5.7020795793505386e-05, + "learning_rate": 3.925417932904574e-08, + "loss": 0.0, + "num_input_tokens_seen": 76656424, + "step": 132125 + }, + { + "epoch": 19.67977360738755, + "grad_norm": 8.957621503213886e-06, + "learning_rate": 3.907236835692385e-08, + "loss": 0.0, + "num_input_tokens_seen": 76659016, + "step": 132130 + }, + { + "epoch": 19.680518319928506, + "grad_norm": 7.14706548023969e-05, + "learning_rate": 3.8890979074301946e-08, + "loss": 0.0, + "num_input_tokens_seen": 76661832, + "step": 132135 + }, + { + "epoch": 19.681263032469467, + "grad_norm": 0.000665899773593992, + "learning_rate": 3.8710011484249795e-08, + "loss": 0.0, + "num_input_tokens_seen": 76665032, + "step": 132140 + }, + { + "epoch": 19.682007745010427, + "grad_norm": 5.324681114871055e-05, + "learning_rate": 3.8529465589820514e-08, + "loss": 0.0, + "num_input_tokens_seen": 76667656, + "step": 132145 + }, + { + "epoch": 19.682752457551384, + "grad_norm": 3.5968660085927695e-06, + "learning_rate": 3.8349341394067204e-08, + "loss": 0.0, + "num_input_tokens_seen": 76670344, + "step": 132150 + }, + { + "epoch": 19.683497170092345, + "grad_norm": 2.2556769181392156e-05, + "learning_rate": 3.816963890003189e-08, + "loss": 0.0, + "num_input_tokens_seen": 76673096, + "step": 132155 + }, + { + "epoch": 19.684241882633305, + "grad_norm": 2.907522457462619e-06, + "learning_rate": 3.799035811075102e-08, + "loss": 0.0, + "num_input_tokens_seen": 76676104, + "step": 132160 + }, + { + "epoch": 19.684986595174262, + "grad_norm": 1.814227107388433e-05, + "learning_rate": 3.7811499029252737e-08, + "loss": 0.0, + "num_input_tokens_seen": 76678760, + "step": 132165 + }, + { + "epoch": 19.685731307715223, + "grad_norm": 3.3920680380106205e-06, + "learning_rate": 3.763306165855962e-08, + "loss": 0.0, + "num_input_tokens_seen": 76681768, + "step": 132170 + }, + { + "epoch": 19.68647602025618, + "grad_norm": 2.1210826162132435e-06, + "learning_rate": 3.745504600168315e-08, + "loss": 0.0, + "num_input_tokens_seen": 76684520, + "step": 132175 + }, + { + "epoch": 19.68722073279714, + "grad_norm": 3.772974196181167e-06, + "learning_rate": 3.72774520616348e-08, + "loss": 0.0, + "num_input_tokens_seen": 76687528, + "step": 132180 + }, + { + "epoch": 19.6879654453381, + "grad_norm": 1.2721696293738205e-05, + "learning_rate": 3.7100279841412177e-08, + "loss": 0.0, + "num_input_tokens_seen": 76690184, + "step": 132185 + }, + { + "epoch": 19.688710157879058, + "grad_norm": 1.4715005818288773e-05, + "learning_rate": 3.6923529344007336e-08, + "loss": 0.0, + "num_input_tokens_seen": 76693128, + "step": 132190 + }, + { + "epoch": 19.68945487042002, + "grad_norm": 3.4904267522506416e-05, + "learning_rate": 3.674720057240955e-08, + "loss": 0.0, + "num_input_tokens_seen": 76696136, + "step": 132195 + }, + { + "epoch": 19.69019958296098, + "grad_norm": 1.8750084564089775e-05, + "learning_rate": 3.657129352959698e-08, + "loss": 0.0, + "num_input_tokens_seen": 76699080, + "step": 132200 + }, + { + "epoch": 19.690944295501936, + "grad_norm": 0.0001058164270943962, + "learning_rate": 3.63958082185395e-08, + "loss": 0.0, + "num_input_tokens_seen": 76701544, + "step": 132205 + }, + { + "epoch": 19.691689008042896, + "grad_norm": 2.3636375772184692e-05, + "learning_rate": 3.622074464220415e-08, + "loss": 0.0, + "num_input_tokens_seen": 76704648, + "step": 132210 + }, + { + "epoch": 19.692433720583853, + "grad_norm": 3.074819233006565e-06, + "learning_rate": 3.604610280354692e-08, + "loss": 0.0, + "num_input_tokens_seen": 76707912, + "step": 132215 + }, + { + "epoch": 19.693178433124814, + "grad_norm": 2.588203460618388e-05, + "learning_rate": 3.587188270551822e-08, + "loss": 0.0, + "num_input_tokens_seen": 76710408, + "step": 132220 + }, + { + "epoch": 19.693923145665774, + "grad_norm": 0.0015256995102390647, + "learning_rate": 3.569808435106292e-08, + "loss": 0.0, + "num_input_tokens_seen": 76713320, + "step": 132225 + }, + { + "epoch": 19.69466785820673, + "grad_norm": 6.5262797761533875e-06, + "learning_rate": 3.552470774311478e-08, + "loss": 0.0, + "num_input_tokens_seen": 76716040, + "step": 132230 + }, + { + "epoch": 19.695412570747692, + "grad_norm": 2.6794441509991884e-06, + "learning_rate": 3.535175288460479e-08, + "loss": 0.0, + "num_input_tokens_seen": 76718920, + "step": 132235 + }, + { + "epoch": 19.69615728328865, + "grad_norm": 2.1416442905319855e-05, + "learning_rate": 3.5179219778452846e-08, + "loss": 0.0, + "num_input_tokens_seen": 76722024, + "step": 132240 + }, + { + "epoch": 19.69690199582961, + "grad_norm": 2.8323331662249984e-06, + "learning_rate": 3.500710842757604e-08, + "loss": 0.0, + "num_input_tokens_seen": 76724936, + "step": 132245 + }, + { + "epoch": 19.69764670837057, + "grad_norm": 3.4600971048348583e-06, + "learning_rate": 3.483541883487762e-08, + "loss": 0.0, + "num_input_tokens_seen": 76727720, + "step": 132250 + }, + { + "epoch": 19.698391420911527, + "grad_norm": 2.499373113096226e-05, + "learning_rate": 3.466415100326359e-08, + "loss": 0.0, + "num_input_tokens_seen": 76731496, + "step": 132255 + }, + { + "epoch": 19.699136133452487, + "grad_norm": 5.485243036673637e-06, + "learning_rate": 3.449330493562608e-08, + "loss": 0.0, + "num_input_tokens_seen": 76734696, + "step": 132260 + }, + { + "epoch": 19.699880845993448, + "grad_norm": 1.3361122910282575e-05, + "learning_rate": 3.4322880634851674e-08, + "loss": 0.0, + "num_input_tokens_seen": 76737544, + "step": 132265 + }, + { + "epoch": 19.700625558534405, + "grad_norm": 2.112735501214047e-06, + "learning_rate": 3.415287810381584e-08, + "loss": 0.0, + "num_input_tokens_seen": 76740328, + "step": 132270 + }, + { + "epoch": 19.701370271075366, + "grad_norm": 2.4316977942362428e-06, + "learning_rate": 3.3983297345391296e-08, + "loss": 0.0, + "num_input_tokens_seen": 76743176, + "step": 132275 + }, + { + "epoch": 19.702114983616323, + "grad_norm": 8.059068022703286e-06, + "learning_rate": 3.381413836244796e-08, + "loss": 0.0, + "num_input_tokens_seen": 76746248, + "step": 132280 + }, + { + "epoch": 19.702859696157283, + "grad_norm": 2.23306392399536e-06, + "learning_rate": 3.36454011578391e-08, + "loss": 0.0, + "num_input_tokens_seen": 76749224, + "step": 132285 + }, + { + "epoch": 19.703604408698244, + "grad_norm": 1.1911084651947021, + "learning_rate": 3.347708573441521e-08, + "loss": 0.0014, + "num_input_tokens_seen": 76751976, + "step": 132290 + }, + { + "epoch": 19.7043491212392, + "grad_norm": 6.034040779923089e-06, + "learning_rate": 3.330919209502126e-08, + "loss": 0.0, + "num_input_tokens_seen": 76754664, + "step": 132295 + }, + { + "epoch": 19.70509383378016, + "grad_norm": 1.109878758143168e-05, + "learning_rate": 3.314172024249662e-08, + "loss": 0.0, + "num_input_tokens_seen": 76757576, + "step": 132300 + }, + { + "epoch": 19.70583854632112, + "grad_norm": 6.959390157135203e-05, + "learning_rate": 3.297467017966405e-08, + "loss": 0.0, + "num_input_tokens_seen": 76760168, + "step": 132305 + }, + { + "epoch": 19.70658325886208, + "grad_norm": 5.002639682061272e-06, + "learning_rate": 3.280804190935183e-08, + "loss": 0.0, + "num_input_tokens_seen": 76762888, + "step": 132310 + }, + { + "epoch": 19.70732797140304, + "grad_norm": 3.471805484878132e-06, + "learning_rate": 3.264183543436883e-08, + "loss": 0.0, + "num_input_tokens_seen": 76765416, + "step": 132315 + }, + { + "epoch": 19.708072683943996, + "grad_norm": 0.00018347421428188682, + "learning_rate": 3.2476050757529466e-08, + "loss": 0.0, + "num_input_tokens_seen": 76768264, + "step": 132320 + }, + { + "epoch": 19.708817396484957, + "grad_norm": 4.539774090517312e-05, + "learning_rate": 3.231068788162872e-08, + "loss": 0.0, + "num_input_tokens_seen": 76771496, + "step": 132325 + }, + { + "epoch": 19.709562109025917, + "grad_norm": 2.8034842216584366e-06, + "learning_rate": 3.214574680946436e-08, + "loss": 0.0, + "num_input_tokens_seen": 76774568, + "step": 132330 + }, + { + "epoch": 19.710306821566874, + "grad_norm": 3.443786408752203e-05, + "learning_rate": 3.198122754382305e-08, + "loss": 0.0, + "num_input_tokens_seen": 76777384, + "step": 132335 + }, + { + "epoch": 19.711051534107835, + "grad_norm": 6.246705652301898e-06, + "learning_rate": 3.181713008748033e-08, + "loss": 0.0, + "num_input_tokens_seen": 76780200, + "step": 132340 + }, + { + "epoch": 19.711796246648795, + "grad_norm": 6.27300760243088e-05, + "learning_rate": 3.1653454443211774e-08, + "loss": 0.0, + "num_input_tokens_seen": 76782984, + "step": 132345 + }, + { + "epoch": 19.712540959189752, + "grad_norm": 2.9882955914217746e-06, + "learning_rate": 3.1490200613779056e-08, + "loss": 0.0, + "num_input_tokens_seen": 76785832, + "step": 132350 + }, + { + "epoch": 19.713285671730713, + "grad_norm": 1.485851862526033e-05, + "learning_rate": 3.132736860194385e-08, + "loss": 0.0, + "num_input_tokens_seen": 76788840, + "step": 132355 + }, + { + "epoch": 19.71403038427167, + "grad_norm": 2.1890471089136554e-06, + "learning_rate": 3.116495841045675e-08, + "loss": 0.0, + "num_input_tokens_seen": 76791528, + "step": 132360 + }, + { + "epoch": 19.71477509681263, + "grad_norm": 7.704506060690619e-06, + "learning_rate": 3.1002970042059984e-08, + "loss": 0.0, + "num_input_tokens_seen": 76794664, + "step": 132365 + }, + { + "epoch": 19.71551980935359, + "grad_norm": 7.093121439538663e-06, + "learning_rate": 3.084140349949027e-08, + "loss": 0.0, + "num_input_tokens_seen": 76797384, + "step": 132370 + }, + { + "epoch": 19.716264521894548, + "grad_norm": 1.5182921742962208e-05, + "learning_rate": 3.0680258785478756e-08, + "loss": 0.0, + "num_input_tokens_seen": 76800328, + "step": 132375 + }, + { + "epoch": 19.71700923443551, + "grad_norm": 9.835014679993037e-06, + "learning_rate": 3.051953590274548e-08, + "loss": 0.0001, + "num_input_tokens_seen": 76802952, + "step": 132380 + }, + { + "epoch": 19.717753946976465, + "grad_norm": 5.982847142149694e-06, + "learning_rate": 3.035923485400771e-08, + "loss": 0.0, + "num_input_tokens_seen": 76805960, + "step": 132385 + }, + { + "epoch": 19.718498659517426, + "grad_norm": 2.9766313218715368e-06, + "learning_rate": 3.0199355641971626e-08, + "loss": 0.0, + "num_input_tokens_seen": 76808904, + "step": 132390 + }, + { + "epoch": 19.719243372058386, + "grad_norm": 2.178209570047329e-06, + "learning_rate": 3.003989826934062e-08, + "loss": 0.0, + "num_input_tokens_seen": 76811720, + "step": 132395 + }, + { + "epoch": 19.719988084599343, + "grad_norm": 6.478713203250663e-06, + "learning_rate": 2.9880862738804196e-08, + "loss": 0.0, + "num_input_tokens_seen": 76814472, + "step": 132400 + }, + { + "epoch": 19.720732797140304, + "grad_norm": 4.022809207526734e-06, + "learning_rate": 2.9722249053054653e-08, + "loss": 0.0, + "num_input_tokens_seen": 76817160, + "step": 132405 + }, + { + "epoch": 19.721477509681264, + "grad_norm": 0.00019820556917693466, + "learning_rate": 2.9564057214767627e-08, + "loss": 0.0, + "num_input_tokens_seen": 76819944, + "step": 132410 + }, + { + "epoch": 19.72222222222222, + "grad_norm": 2.8068659503333038e-06, + "learning_rate": 2.9406287226618756e-08, + "loss": 0.0, + "num_input_tokens_seen": 76823016, + "step": 132415 + }, + { + "epoch": 19.722966934763182, + "grad_norm": 3.227836486985325e-06, + "learning_rate": 2.92489390912698e-08, + "loss": 0.0, + "num_input_tokens_seen": 76826024, + "step": 132420 + }, + { + "epoch": 19.72371164730414, + "grad_norm": 4.990542947780341e-06, + "learning_rate": 2.909201281138252e-08, + "loss": 0.0, + "num_input_tokens_seen": 76828808, + "step": 132425 + }, + { + "epoch": 19.7244563598451, + "grad_norm": 3.223003295715898e-05, + "learning_rate": 2.8935508389607568e-08, + "loss": 0.0, + "num_input_tokens_seen": 76831816, + "step": 132430 + }, + { + "epoch": 19.72520107238606, + "grad_norm": 6.818518158979714e-06, + "learning_rate": 2.8779425828584506e-08, + "loss": 0.0, + "num_input_tokens_seen": 76834632, + "step": 132435 + }, + { + "epoch": 19.725945784927017, + "grad_norm": 8.944474220275879, + "learning_rate": 2.862376513095566e-08, + "loss": 0.0246, + "num_input_tokens_seen": 76837512, + "step": 132440 + }, + { + "epoch": 19.726690497467978, + "grad_norm": 4.07073048336315e-06, + "learning_rate": 2.846852629934671e-08, + "loss": 0.0, + "num_input_tokens_seen": 76840168, + "step": 132445 + }, + { + "epoch": 19.727435210008938, + "grad_norm": 6.5317390181007795e-06, + "learning_rate": 2.831370933638333e-08, + "loss": 0.0, + "num_input_tokens_seen": 76843176, + "step": 132450 + }, + { + "epoch": 19.728179922549895, + "grad_norm": 0.0001003804209176451, + "learning_rate": 2.8159314244680103e-08, + "loss": 0.0, + "num_input_tokens_seen": 76846216, + "step": 132455 + }, + { + "epoch": 19.728924635090856, + "grad_norm": 1.0002014278143179e-05, + "learning_rate": 2.800534102684327e-08, + "loss": 0.0, + "num_input_tokens_seen": 76849128, + "step": 132460 + }, + { + "epoch": 19.729669347631813, + "grad_norm": 3.5407736049819505e-06, + "learning_rate": 2.7851789685476304e-08, + "loss": 0.0, + "num_input_tokens_seen": 76852008, + "step": 132465 + }, + { + "epoch": 19.730414060172773, + "grad_norm": 2.155552692784113e-06, + "learning_rate": 2.7698660223174355e-08, + "loss": 0.0, + "num_input_tokens_seen": 76854920, + "step": 132470 + }, + { + "epoch": 19.731158772713734, + "grad_norm": 0.001136445440351963, + "learning_rate": 2.7545952642521466e-08, + "loss": 0.0, + "num_input_tokens_seen": 76857896, + "step": 132475 + }, + { + "epoch": 19.73190348525469, + "grad_norm": 3.6668254779215204e-06, + "learning_rate": 2.7393666946098906e-08, + "loss": 0.0, + "num_input_tokens_seen": 76860808, + "step": 132480 + }, + { + "epoch": 19.73264819779565, + "grad_norm": 1.073030216502957e-05, + "learning_rate": 2.7241803136479616e-08, + "loss": 0.0, + "num_input_tokens_seen": 76863656, + "step": 132485 + }, + { + "epoch": 19.73339291033661, + "grad_norm": 2.230197878816398e-06, + "learning_rate": 2.7090361216230987e-08, + "loss": 0.0, + "num_input_tokens_seen": 76866600, + "step": 132490 + }, + { + "epoch": 19.73413762287757, + "grad_norm": 3.134380449409946e-06, + "learning_rate": 2.693934118790653e-08, + "loss": 0.0, + "num_input_tokens_seen": 76869608, + "step": 132495 + }, + { + "epoch": 19.73488233541853, + "grad_norm": 2.6800726118381135e-06, + "learning_rate": 2.678874305405976e-08, + "loss": 0.0, + "num_input_tokens_seen": 76872264, + "step": 132500 + }, + { + "epoch": 19.735627047959486, + "grad_norm": 5.9294306993251666e-05, + "learning_rate": 2.663856681723864e-08, + "loss": 0.0, + "num_input_tokens_seen": 76875208, + "step": 132505 + }, + { + "epoch": 19.736371760500447, + "grad_norm": 0.012087803333997726, + "learning_rate": 2.6488812479974478e-08, + "loss": 0.0, + "num_input_tokens_seen": 76877960, + "step": 132510 + }, + { + "epoch": 19.737116473041407, + "grad_norm": 9.657104783400428e-06, + "learning_rate": 2.6339480044801355e-08, + "loss": 0.0, + "num_input_tokens_seen": 76880808, + "step": 132515 + }, + { + "epoch": 19.737861185582364, + "grad_norm": 3.453538283793023e-06, + "learning_rate": 2.6190569514239484e-08, + "loss": 0.0, + "num_input_tokens_seen": 76883688, + "step": 132520 + }, + { + "epoch": 19.738605898123325, + "grad_norm": 2.8075963200535625e-05, + "learning_rate": 2.6042080890806285e-08, + "loss": 0.0, + "num_input_tokens_seen": 76886536, + "step": 132525 + }, + { + "epoch": 19.73935061066428, + "grad_norm": 5.652931122313021e-06, + "learning_rate": 2.5894014177010872e-08, + "loss": 0.0, + "num_input_tokens_seen": 76889608, + "step": 132530 + }, + { + "epoch": 19.740095323205242, + "grad_norm": 1.0378435945312958e-05, + "learning_rate": 2.5746369375351242e-08, + "loss": 0.0, + "num_input_tokens_seen": 76892904, + "step": 132535 + }, + { + "epoch": 19.740840035746203, + "grad_norm": 7.525156433985103e-06, + "learning_rate": 2.55991464883254e-08, + "loss": 0.0, + "num_input_tokens_seen": 76895784, + "step": 132540 + }, + { + "epoch": 19.74158474828716, + "grad_norm": 3.7084812447574222e-06, + "learning_rate": 2.545234551842024e-08, + "loss": 0.0, + "num_input_tokens_seen": 76898792, + "step": 132545 + }, + { + "epoch": 19.74232946082812, + "grad_norm": 8.490353502565995e-05, + "learning_rate": 2.530596646811434e-08, + "loss": 0.0, + "num_input_tokens_seen": 76901768, + "step": 132550 + }, + { + "epoch": 19.74307417336908, + "grad_norm": 4.474369688978186e-06, + "learning_rate": 2.5160009339877944e-08, + "loss": 0.0, + "num_input_tokens_seen": 76904904, + "step": 132555 + }, + { + "epoch": 19.743818885910038, + "grad_norm": 1.5050824913487304e-05, + "learning_rate": 2.5014474136184075e-08, + "loss": 0.0, + "num_input_tokens_seen": 76908072, + "step": 132560 + }, + { + "epoch": 19.744563598451, + "grad_norm": 3.609019586292561e-06, + "learning_rate": 2.486936085948355e-08, + "loss": 0.0, + "num_input_tokens_seen": 76910664, + "step": 132565 + }, + { + "epoch": 19.74530831099196, + "grad_norm": 6.107838271418586e-05, + "learning_rate": 2.4724669512232734e-08, + "loss": 0.0, + "num_input_tokens_seen": 76913608, + "step": 132570 + }, + { + "epoch": 19.746053023532916, + "grad_norm": 0.00834971759468317, + "learning_rate": 2.45804000968769e-08, + "loss": 0.0, + "num_input_tokens_seen": 76916584, + "step": 132575 + }, + { + "epoch": 19.746797736073876, + "grad_norm": 6.705963642161805e-06, + "learning_rate": 2.4436552615850205e-08, + "loss": 0.0, + "num_input_tokens_seen": 76919720, + "step": 132580 + }, + { + "epoch": 19.747542448614833, + "grad_norm": 8.355612408195157e-06, + "learning_rate": 2.4293127071584043e-08, + "loss": 0.0, + "num_input_tokens_seen": 76922632, + "step": 132585 + }, + { + "epoch": 19.748287161155794, + "grad_norm": 2.394225475654821e-06, + "learning_rate": 2.4150123466498698e-08, + "loss": 0.0, + "num_input_tokens_seen": 76925640, + "step": 132590 + }, + { + "epoch": 19.749031873696755, + "grad_norm": 2.8241463496669894e-06, + "learning_rate": 2.400754180301723e-08, + "loss": 0.0, + "num_input_tokens_seen": 76928456, + "step": 132595 + }, + { + "epoch": 19.74977658623771, + "grad_norm": 1.237372725881869e-05, + "learning_rate": 2.38653820835405e-08, + "loss": 0.0, + "num_input_tokens_seen": 76931496, + "step": 132600 + }, + { + "epoch": 19.750521298778672, + "grad_norm": 5.216294539422961e-06, + "learning_rate": 2.3723644310474914e-08, + "loss": 0.0, + "num_input_tokens_seen": 76934312, + "step": 132605 + }, + { + "epoch": 19.75126601131963, + "grad_norm": 9.594878065399826e-06, + "learning_rate": 2.3582328486213e-08, + "loss": 0.0, + "num_input_tokens_seen": 76937128, + "step": 132610 + }, + { + "epoch": 19.75201072386059, + "grad_norm": 6.886782102810685e-06, + "learning_rate": 2.3441434613141743e-08, + "loss": 0.0, + "num_input_tokens_seen": 76940072, + "step": 132615 + }, + { + "epoch": 19.75275543640155, + "grad_norm": 1.5288227587006986e-05, + "learning_rate": 2.3300962693645345e-08, + "loss": 0.0, + "num_input_tokens_seen": 76943144, + "step": 132620 + }, + { + "epoch": 19.753500148942507, + "grad_norm": 4.7497749619651586e-05, + "learning_rate": 2.3160912730091357e-08, + "loss": 0.0, + "num_input_tokens_seen": 76945864, + "step": 132625 + }, + { + "epoch": 19.754244861483468, + "grad_norm": 3.3564285786269465e-06, + "learning_rate": 2.30212847248501e-08, + "loss": 0.0, + "num_input_tokens_seen": 76948968, + "step": 132630 + }, + { + "epoch": 19.754989574024428, + "grad_norm": 9.495577614870854e-06, + "learning_rate": 2.288207868027803e-08, + "loss": 0.0, + "num_input_tokens_seen": 76951848, + "step": 132635 + }, + { + "epoch": 19.755734286565385, + "grad_norm": 1.0138184734387323e-05, + "learning_rate": 2.2743294598726038e-08, + "loss": 0.0, + "num_input_tokens_seen": 76954952, + "step": 132640 + }, + { + "epoch": 19.756478999106346, + "grad_norm": 6.9639249886677135e-06, + "learning_rate": 2.260493248254225e-08, + "loss": 0.0, + "num_input_tokens_seen": 76957960, + "step": 132645 + }, + { + "epoch": 19.757223711647303, + "grad_norm": 2.6865307063417276e-06, + "learning_rate": 2.246699233406091e-08, + "loss": 0.0, + "num_input_tokens_seen": 76961032, + "step": 132650 + }, + { + "epoch": 19.757968424188263, + "grad_norm": 1.7538453676024801e-06, + "learning_rate": 2.2329474155616258e-08, + "loss": 0.0, + "num_input_tokens_seen": 76964008, + "step": 132655 + }, + { + "epoch": 19.758713136729224, + "grad_norm": 3.398274566279724e-05, + "learning_rate": 2.2192377949525888e-08, + "loss": 0.0, + "num_input_tokens_seen": 76966856, + "step": 132660 + }, + { + "epoch": 19.75945784927018, + "grad_norm": 4.0567545511294156e-05, + "learning_rate": 2.205570371811294e-08, + "loss": 0.0, + "num_input_tokens_seen": 76969704, + "step": 132665 + }, + { + "epoch": 19.76020256181114, + "grad_norm": 4.3357205868232995e-05, + "learning_rate": 2.1919451463678353e-08, + "loss": 0.0, + "num_input_tokens_seen": 76972840, + "step": 132670 + }, + { + "epoch": 19.7609472743521, + "grad_norm": 1.4942068446543999e-05, + "learning_rate": 2.178362118853139e-08, + "loss": 0.0, + "num_input_tokens_seen": 76975880, + "step": 132675 + }, + { + "epoch": 19.76169198689306, + "grad_norm": 3.249570454499917e-06, + "learning_rate": 2.1648212894961884e-08, + "loss": 0.0, + "num_input_tokens_seen": 76978664, + "step": 132680 + }, + { + "epoch": 19.76243669943402, + "grad_norm": 1.5411020285682753e-05, + "learning_rate": 2.1513226585256897e-08, + "loss": 0.0, + "num_input_tokens_seen": 76981352, + "step": 132685 + }, + { + "epoch": 19.763181411974976, + "grad_norm": 8.456153409497347e-06, + "learning_rate": 2.137866226170071e-08, + "loss": 0.0, + "num_input_tokens_seen": 76984072, + "step": 132690 + }, + { + "epoch": 19.763926124515937, + "grad_norm": 5.412666268966859e-06, + "learning_rate": 2.1244519926566507e-08, + "loss": 0.0, + "num_input_tokens_seen": 76986856, + "step": 132695 + }, + { + "epoch": 19.764670837056897, + "grad_norm": 2.4667031084391056e-06, + "learning_rate": 2.111079958211637e-08, + "loss": 0.0, + "num_input_tokens_seen": 76989864, + "step": 132700 + }, + { + "epoch": 19.765415549597854, + "grad_norm": 1.4792229194426909e-05, + "learning_rate": 2.0977501230612374e-08, + "loss": 0.0, + "num_input_tokens_seen": 76992840, + "step": 132705 + }, + { + "epoch": 19.766160262138815, + "grad_norm": 2.5054816433112137e-05, + "learning_rate": 2.0844624874305497e-08, + "loss": 0.0, + "num_input_tokens_seen": 76995592, + "step": 132710 + }, + { + "epoch": 19.766904974679775, + "grad_norm": 1.60324289026903e-05, + "learning_rate": 2.0712170515443942e-08, + "loss": 0.0, + "num_input_tokens_seen": 76998664, + "step": 132715 + }, + { + "epoch": 19.767649687220732, + "grad_norm": 0.0012511173263192177, + "learning_rate": 2.0580138156259256e-08, + "loss": 0.0, + "num_input_tokens_seen": 77001576, + "step": 132720 + }, + { + "epoch": 19.768394399761693, + "grad_norm": 0.000726177531760186, + "learning_rate": 2.0448527798985762e-08, + "loss": 0.0, + "num_input_tokens_seen": 77004392, + "step": 132725 + }, + { + "epoch": 19.76913911230265, + "grad_norm": 1.4922409718565177e-05, + "learning_rate": 2.0317339445849458e-08, + "loss": 0.0, + "num_input_tokens_seen": 77007400, + "step": 132730 + }, + { + "epoch": 19.76988382484361, + "grad_norm": 5.229502676229458e-06, + "learning_rate": 2.0186573099059693e-08, + "loss": 0.0, + "num_input_tokens_seen": 77010376, + "step": 132735 + }, + { + "epoch": 19.77062853738457, + "grad_norm": 1.9901146515621804e-05, + "learning_rate": 2.005622876083135e-08, + "loss": 0.0, + "num_input_tokens_seen": 77013288, + "step": 132740 + }, + { + "epoch": 19.771373249925528, + "grad_norm": 3.97486837755423e-06, + "learning_rate": 1.9926306433362683e-08, + "loss": 0.0, + "num_input_tokens_seen": 77016296, + "step": 132745 + }, + { + "epoch": 19.77211796246649, + "grad_norm": 4.130893648834899e-05, + "learning_rate": 1.979680611885193e-08, + "loss": 0.0, + "num_input_tokens_seen": 77019048, + "step": 132750 + }, + { + "epoch": 19.772862675007445, + "grad_norm": 3.838908924080897e-06, + "learning_rate": 1.9667727819486225e-08, + "loss": 0.0, + "num_input_tokens_seen": 77022216, + "step": 132755 + }, + { + "epoch": 19.773607387548406, + "grad_norm": 5.384806627262151e-06, + "learning_rate": 1.9539071537444387e-08, + "loss": 0.0, + "num_input_tokens_seen": 77025128, + "step": 132760 + }, + { + "epoch": 19.774352100089367, + "grad_norm": 7.000843652349431e-06, + "learning_rate": 1.9410837274899674e-08, + "loss": 0.0, + "num_input_tokens_seen": 77028072, + "step": 132765 + }, + { + "epoch": 19.775096812630323, + "grad_norm": 4.976598575012758e-06, + "learning_rate": 1.9283025034019797e-08, + "loss": 0.0, + "num_input_tokens_seen": 77031048, + "step": 132770 + }, + { + "epoch": 19.775841525171284, + "grad_norm": 5.056917416368378e-06, + "learning_rate": 1.9155634816966916e-08, + "loss": 0.0, + "num_input_tokens_seen": 77034440, + "step": 132775 + }, + { + "epoch": 19.776586237712245, + "grad_norm": 5.612100267171627e-06, + "learning_rate": 1.902866662588654e-08, + "loss": 0.0, + "num_input_tokens_seen": 77037480, + "step": 132780 + }, + { + "epoch": 19.7773309502532, + "grad_norm": 5.872206656931667e-06, + "learning_rate": 1.890212046292972e-08, + "loss": 0.0, + "num_input_tokens_seen": 77040488, + "step": 132785 + }, + { + "epoch": 19.778075662794162, + "grad_norm": 3.5747873425862053e-06, + "learning_rate": 1.877599633023086e-08, + "loss": 0.0, + "num_input_tokens_seen": 77043496, + "step": 132790 + }, + { + "epoch": 19.77882037533512, + "grad_norm": 2.5669942260719836e-06, + "learning_rate": 1.865029422992437e-08, + "loss": 0.0, + "num_input_tokens_seen": 77046408, + "step": 132795 + }, + { + "epoch": 19.77956508787608, + "grad_norm": 7.093102340149926e-06, + "learning_rate": 1.8525014164127996e-08, + "loss": 0.0, + "num_input_tokens_seen": 77049224, + "step": 132800 + }, + { + "epoch": 19.78030980041704, + "grad_norm": 2.435409896861529e-06, + "learning_rate": 1.8400156134962265e-08, + "loss": 0.0, + "num_input_tokens_seen": 77052168, + "step": 132805 + }, + { + "epoch": 19.781054512957997, + "grad_norm": 1.0441373888170347e-05, + "learning_rate": 1.82757201445366e-08, + "loss": 0.0, + "num_input_tokens_seen": 77054888, + "step": 132810 + }, + { + "epoch": 19.781799225498958, + "grad_norm": 2.848239319064305e-06, + "learning_rate": 1.8151706194952102e-08, + "loss": 0.0, + "num_input_tokens_seen": 77057800, + "step": 132815 + }, + { + "epoch": 19.782543938039918, + "grad_norm": 2.317115331607056e-06, + "learning_rate": 1.8028114288301535e-08, + "loss": 0.0, + "num_input_tokens_seen": 77060744, + "step": 132820 + }, + { + "epoch": 19.783288650580875, + "grad_norm": 9.467396739637479e-05, + "learning_rate": 1.7904944426677673e-08, + "loss": 0.0, + "num_input_tokens_seen": 77063784, + "step": 132825 + }, + { + "epoch": 19.784033363121836, + "grad_norm": 1.9531718862708658e-05, + "learning_rate": 1.7782196612159406e-08, + "loss": 0.0, + "num_input_tokens_seen": 77066792, + "step": 132830 + }, + { + "epoch": 19.784778075662793, + "grad_norm": 3.8708185456926e-06, + "learning_rate": 1.7659870846820083e-08, + "loss": 0.0, + "num_input_tokens_seen": 77069640, + "step": 132835 + }, + { + "epoch": 19.785522788203753, + "grad_norm": 4.1992123442469165e-05, + "learning_rate": 1.7537967132727483e-08, + "loss": 0.0, + "num_input_tokens_seen": 77072552, + "step": 132840 + }, + { + "epoch": 19.786267500744714, + "grad_norm": 3.306727194285486e-06, + "learning_rate": 1.7416485471938304e-08, + "loss": 0.0, + "num_input_tokens_seen": 77075464, + "step": 132845 + }, + { + "epoch": 19.78701221328567, + "grad_norm": 2.231021790066734e-05, + "learning_rate": 1.7295425866506453e-08, + "loss": 0.0, + "num_input_tokens_seen": 77078472, + "step": 132850 + }, + { + "epoch": 19.78775692582663, + "grad_norm": 5.3786843636771664e-05, + "learning_rate": 1.7174788318477518e-08, + "loss": 0.0, + "num_input_tokens_seen": 77081416, + "step": 132855 + }, + { + "epoch": 19.788501638367592, + "grad_norm": 1.4342170288728084e-05, + "learning_rate": 1.705457282988876e-08, + "loss": 0.0, + "num_input_tokens_seen": 77084360, + "step": 132860 + }, + { + "epoch": 19.78924635090855, + "grad_norm": 2.4272735572594684e-06, + "learning_rate": 1.6934779402771884e-08, + "loss": 0.0, + "num_input_tokens_seen": 77087336, + "step": 132865 + }, + { + "epoch": 19.78999106344951, + "grad_norm": 8.019550477911253e-06, + "learning_rate": 1.681540803915027e-08, + "loss": 0.0, + "num_input_tokens_seen": 77089992, + "step": 132870 + }, + { + "epoch": 19.790735775990466, + "grad_norm": 5.695677828043699e-06, + "learning_rate": 1.669645874103898e-08, + "loss": 0.0, + "num_input_tokens_seen": 77092456, + "step": 132875 + }, + { + "epoch": 19.791480488531427, + "grad_norm": 2.6394386622996535e-06, + "learning_rate": 1.6577931510450283e-08, + "loss": 0.0, + "num_input_tokens_seen": 77095400, + "step": 132880 + }, + { + "epoch": 19.792225201072387, + "grad_norm": 0.00022490399715024978, + "learning_rate": 1.6459826349385365e-08, + "loss": 0.0, + "num_input_tokens_seen": 77098184, + "step": 132885 + }, + { + "epoch": 19.792969913613344, + "grad_norm": 3.679898327391129e-06, + "learning_rate": 1.6342143259839848e-08, + "loss": 0.0, + "num_input_tokens_seen": 77100712, + "step": 132890 + }, + { + "epoch": 19.793714626154305, + "grad_norm": 2.7619796583167044e-06, + "learning_rate": 1.622488224380103e-08, + "loss": 0.0, + "num_input_tokens_seen": 77103592, + "step": 132895 + }, + { + "epoch": 19.794459338695262, + "grad_norm": 1.0737990123743657e-05, + "learning_rate": 1.6108043303250664e-08, + "loss": 0.0, + "num_input_tokens_seen": 77106472, + "step": 132900 + }, + { + "epoch": 19.795204051236222, + "grad_norm": 8.697155863046646e-06, + "learning_rate": 1.5991626440162165e-08, + "loss": 0.0, + "num_input_tokens_seen": 77109416, + "step": 132905 + }, + { + "epoch": 19.795948763777183, + "grad_norm": 3.1051085898070596e-06, + "learning_rate": 1.587563165650341e-08, + "loss": 0.0, + "num_input_tokens_seen": 77112200, + "step": 132910 + }, + { + "epoch": 19.79669347631814, + "grad_norm": 4.74946282338351e-05, + "learning_rate": 1.5760058954233935e-08, + "loss": 0.0, + "num_input_tokens_seen": 77114696, + "step": 132915 + }, + { + "epoch": 19.7974381888591, + "grad_norm": 3.3911415812326595e-05, + "learning_rate": 1.564490833530219e-08, + "loss": 0.0, + "num_input_tokens_seen": 77117832, + "step": 132920 + }, + { + "epoch": 19.79818290140006, + "grad_norm": 1.4142082363832742e-05, + "learning_rate": 1.5530179801659382e-08, + "loss": 0.0, + "num_input_tokens_seen": 77120584, + "step": 132925 + }, + { + "epoch": 19.798927613941018, + "grad_norm": 3.2916980217123637e-06, + "learning_rate": 1.5415873355240086e-08, + "loss": 0.0, + "num_input_tokens_seen": 77123272, + "step": 132930 + }, + { + "epoch": 19.79967232648198, + "grad_norm": 0.0002312347642146051, + "learning_rate": 1.5301988997978857e-08, + "loss": 0.0, + "num_input_tokens_seen": 77126056, + "step": 132935 + }, + { + "epoch": 19.800417039022935, + "grad_norm": 2.0691348254331388e-05, + "learning_rate": 1.5188526731793608e-08, + "loss": 0.0, + "num_input_tokens_seen": 77128744, + "step": 132940 + }, + { + "epoch": 19.801161751563896, + "grad_norm": 0.00039249591645784676, + "learning_rate": 1.50754865586078e-08, + "loss": 0.0, + "num_input_tokens_seen": 77131912, + "step": 132945 + }, + { + "epoch": 19.801906464104857, + "grad_norm": 2.751155307123554e-06, + "learning_rate": 1.4962868480325465e-08, + "loss": 0.0, + "num_input_tokens_seen": 77134856, + "step": 132950 + }, + { + "epoch": 19.802651176645814, + "grad_norm": 3.958442903240211e-06, + "learning_rate": 1.4850672498853413e-08, + "loss": 0.0, + "num_input_tokens_seen": 77137768, + "step": 132955 + }, + { + "epoch": 19.803395889186774, + "grad_norm": 2.9404745873762295e-06, + "learning_rate": 1.4738898616084573e-08, + "loss": 0.0, + "num_input_tokens_seen": 77140648, + "step": 132960 + }, + { + "epoch": 19.804140601727735, + "grad_norm": 2.561651626820094e-06, + "learning_rate": 1.4627546833909101e-08, + "loss": 0.0, + "num_input_tokens_seen": 77143656, + "step": 132965 + }, + { + "epoch": 19.80488531426869, + "grad_norm": 3.7963361592119327e-06, + "learning_rate": 1.4516617154206048e-08, + "loss": 0.0, + "num_input_tokens_seen": 77146536, + "step": 132970 + }, + { + "epoch": 19.805630026809652, + "grad_norm": 1.9955032257712446e-06, + "learning_rate": 1.440610957885169e-08, + "loss": 0.0, + "num_input_tokens_seen": 77149288, + "step": 132975 + }, + { + "epoch": 19.80637473935061, + "grad_norm": 5.065695859229891e-06, + "learning_rate": 1.4296024109711203e-08, + "loss": 0.0, + "num_input_tokens_seen": 77152200, + "step": 132980 + }, + { + "epoch": 19.80711945189157, + "grad_norm": 8.368207090825308e-06, + "learning_rate": 1.4186360748644212e-08, + "loss": 0.0, + "num_input_tokens_seen": 77155080, + "step": 132985 + }, + { + "epoch": 19.80786416443253, + "grad_norm": 1.945306848938344e-06, + "learning_rate": 1.4077119497507562e-08, + "loss": 0.0036, + "num_input_tokens_seen": 77157896, + "step": 132990 + }, + { + "epoch": 19.808608876973487, + "grad_norm": 6.406650209100917e-05, + "learning_rate": 1.3968300358138675e-08, + "loss": 0.0, + "num_input_tokens_seen": 77160616, + "step": 132995 + }, + { + "epoch": 19.809353589514448, + "grad_norm": 0.00018622128118295223, + "learning_rate": 1.3859903332383296e-08, + "loss": 0.0, + "num_input_tokens_seen": 77163528, + "step": 133000 + }, + { + "epoch": 19.81009830205541, + "grad_norm": 8.380767212656792e-06, + "learning_rate": 1.3751928422070515e-08, + "loss": 0.0, + "num_input_tokens_seen": 77166184, + "step": 133005 + }, + { + "epoch": 19.810843014596365, + "grad_norm": 4.895336132904049e-06, + "learning_rate": 1.3644375629023875e-08, + "loss": 0.0, + "num_input_tokens_seen": 77168968, + "step": 133010 + }, + { + "epoch": 19.811587727137326, + "grad_norm": 1.6202448023250327e-05, + "learning_rate": 1.3537244955061368e-08, + "loss": 0.0, + "num_input_tokens_seen": 77172104, + "step": 133015 + }, + { + "epoch": 19.812332439678283, + "grad_norm": 4.27101213062997e-06, + "learning_rate": 1.343053640198988e-08, + "loss": 0.0, + "num_input_tokens_seen": 77174824, + "step": 133020 + }, + { + "epoch": 19.813077152219243, + "grad_norm": 0.000103830847365316, + "learning_rate": 1.3324249971613523e-08, + "loss": 0.0, + "num_input_tokens_seen": 77177512, + "step": 133025 + }, + { + "epoch": 19.813821864760204, + "grad_norm": 3.077824430874898e-06, + "learning_rate": 1.3218385665730859e-08, + "loss": 0.0, + "num_input_tokens_seen": 77180520, + "step": 133030 + }, + { + "epoch": 19.81456657730116, + "grad_norm": 3.110635589109734e-05, + "learning_rate": 1.3112943486129347e-08, + "loss": 0.0, + "num_input_tokens_seen": 77183400, + "step": 133035 + }, + { + "epoch": 19.81531128984212, + "grad_norm": 0.0023746739607304335, + "learning_rate": 1.3007923434585345e-08, + "loss": 0.0, + "num_input_tokens_seen": 77186664, + "step": 133040 + }, + { + "epoch": 19.816056002383082, + "grad_norm": 7.402129995170981e-06, + "learning_rate": 1.290332551288076e-08, + "loss": 0.0, + "num_input_tokens_seen": 77189832, + "step": 133045 + }, + { + "epoch": 19.81680071492404, + "grad_norm": 4.298027306504082e-06, + "learning_rate": 1.2799149722775294e-08, + "loss": 0.0, + "num_input_tokens_seen": 77192712, + "step": 133050 + }, + { + "epoch": 19.817545427465, + "grad_norm": 3.2396274036727846e-06, + "learning_rate": 1.2695396066034205e-08, + "loss": 0.0, + "num_input_tokens_seen": 77195464, + "step": 133055 + }, + { + "epoch": 19.818290140005956, + "grad_norm": 1.5113748304429464e-05, + "learning_rate": 1.2592064544408866e-08, + "loss": 0.0, + "num_input_tokens_seen": 77198280, + "step": 133060 + }, + { + "epoch": 19.819034852546917, + "grad_norm": 5.830457666888833e-06, + "learning_rate": 1.248915515964233e-08, + "loss": 0.0, + "num_input_tokens_seen": 77201416, + "step": 133065 + }, + { + "epoch": 19.819779565087877, + "grad_norm": 6.385042979673017e-06, + "learning_rate": 1.2386667913477645e-08, + "loss": 0.0, + "num_input_tokens_seen": 77204424, + "step": 133070 + }, + { + "epoch": 19.820524277628834, + "grad_norm": 2.0752548152813688e-05, + "learning_rate": 1.228460280764121e-08, + "loss": 0.0, + "num_input_tokens_seen": 77207240, + "step": 133075 + }, + { + "epoch": 19.821268990169795, + "grad_norm": 1.011789936455898e-05, + "learning_rate": 1.2182959843862196e-08, + "loss": 0.0, + "num_input_tokens_seen": 77210056, + "step": 133080 + }, + { + "epoch": 19.822013702710755, + "grad_norm": 3.2962866498564836e-06, + "learning_rate": 1.2081739023855899e-08, + "loss": 0.0, + "num_input_tokens_seen": 77212712, + "step": 133085 + }, + { + "epoch": 19.822758415251712, + "grad_norm": 6.021041826897999e-06, + "learning_rate": 1.198094034933206e-08, + "loss": 0.0, + "num_input_tokens_seen": 77215400, + "step": 133090 + }, + { + "epoch": 19.823503127792673, + "grad_norm": 4.4555699787451886e-06, + "learning_rate": 1.1880563821992096e-08, + "loss": 0.0, + "num_input_tokens_seen": 77218344, + "step": 133095 + }, + { + "epoch": 19.82424784033363, + "grad_norm": 3.091710595981567e-06, + "learning_rate": 1.1780609443534652e-08, + "loss": 0.0, + "num_input_tokens_seen": 77220840, + "step": 133100 + }, + { + "epoch": 19.82499255287459, + "grad_norm": 2.624212811497273e-06, + "learning_rate": 1.1681077215644487e-08, + "loss": 0.0, + "num_input_tokens_seen": 77223560, + "step": 133105 + }, + { + "epoch": 19.82573726541555, + "grad_norm": 3.0390908705157926e-06, + "learning_rate": 1.1581967140009142e-08, + "loss": 0.0, + "num_input_tokens_seen": 77226152, + "step": 133110 + }, + { + "epoch": 19.826481977956508, + "grad_norm": 0.00012292280734982342, + "learning_rate": 1.1483279218296728e-08, + "loss": 0.0, + "num_input_tokens_seen": 77229000, + "step": 133115 + }, + { + "epoch": 19.82722669049747, + "grad_norm": 1.8708748029894195e-05, + "learning_rate": 1.1385013452178128e-08, + "loss": 0.0, + "num_input_tokens_seen": 77231880, + "step": 133120 + }, + { + "epoch": 19.827971403038426, + "grad_norm": 2.1293765257723862e-06, + "learning_rate": 1.1287169843313127e-08, + "loss": 0.0, + "num_input_tokens_seen": 77234856, + "step": 133125 + }, + { + "epoch": 19.828716115579386, + "grad_norm": 0.0003431244404055178, + "learning_rate": 1.1189748393353184e-08, + "loss": 0.0, + "num_input_tokens_seen": 77238120, + "step": 133130 + }, + { + "epoch": 19.829460828120347, + "grad_norm": 2.564691158113419e-06, + "learning_rate": 1.10927491039442e-08, + "loss": 0.0, + "num_input_tokens_seen": 77240872, + "step": 133135 + }, + { + "epoch": 19.830205540661304, + "grad_norm": 2.2776682726544095e-06, + "learning_rate": 1.0996171976726532e-08, + "loss": 0.0, + "num_input_tokens_seen": 77243720, + "step": 133140 + }, + { + "epoch": 19.830950253202264, + "grad_norm": 1.1799314961535856e-05, + "learning_rate": 1.0900017013329434e-08, + "loss": 0.0, + "num_input_tokens_seen": 77248008, + "step": 133145 + }, + { + "epoch": 19.831694965743225, + "grad_norm": 4.876675575360423e-06, + "learning_rate": 1.0804284215379379e-08, + "loss": 0.0, + "num_input_tokens_seen": 77250600, + "step": 133150 + }, + { + "epoch": 19.83243967828418, + "grad_norm": 5.86408759772894e-06, + "learning_rate": 1.070897358449452e-08, + "loss": 0.0, + "num_input_tokens_seen": 77253448, + "step": 133155 + }, + { + "epoch": 19.833184390825142, + "grad_norm": 6.529789061460178e-06, + "learning_rate": 1.0614085122281902e-08, + "loss": 0.0, + "num_input_tokens_seen": 77256488, + "step": 133160 + }, + { + "epoch": 19.8339291033661, + "grad_norm": 3.5579455470724497e-06, + "learning_rate": 1.0519618830348577e-08, + "loss": 0.0, + "num_input_tokens_seen": 77259368, + "step": 133165 + }, + { + "epoch": 19.83467381590706, + "grad_norm": 9.580593177815899e-05, + "learning_rate": 1.0425574710284936e-08, + "loss": 0.0, + "num_input_tokens_seen": 77262120, + "step": 133170 + }, + { + "epoch": 19.83541852844802, + "grad_norm": 5.4340052884072065e-05, + "learning_rate": 1.033195276368415e-08, + "loss": 0.0, + "num_input_tokens_seen": 77264552, + "step": 133175 + }, + { + "epoch": 19.836163240988977, + "grad_norm": 1.6086202094811597e-06, + "learning_rate": 1.0238752992128287e-08, + "loss": 0.0, + "num_input_tokens_seen": 77267400, + "step": 133180 + }, + { + "epoch": 19.836907953529938, + "grad_norm": 6.51942764307023e-06, + "learning_rate": 1.0145975397188311e-08, + "loss": 0.0, + "num_input_tokens_seen": 77270280, + "step": 133185 + }, + { + "epoch": 19.8376526660709, + "grad_norm": 2.4919279439927777e-06, + "learning_rate": 1.0053619980435191e-08, + "loss": 0.0, + "num_input_tokens_seen": 77273224, + "step": 133190 + }, + { + "epoch": 19.838397378611855, + "grad_norm": 0.0004855263396166265, + "learning_rate": 9.961686743426012e-09, + "loss": 0.0, + "num_input_tokens_seen": 77276104, + "step": 133195 + }, + { + "epoch": 19.839142091152816, + "grad_norm": 3.24415987051907e-06, + "learning_rate": 9.870175687715089e-09, + "loss": 0.0, + "num_input_tokens_seen": 77278632, + "step": 133200 + }, + { + "epoch": 19.839886803693773, + "grad_norm": 2.9314137464098167e-06, + "learning_rate": 9.77908681485118e-09, + "loss": 0.0, + "num_input_tokens_seen": 77281320, + "step": 133205 + }, + { + "epoch": 19.840631516234733, + "grad_norm": 2.58981526712887e-06, + "learning_rate": 9.688420126369168e-09, + "loss": 0.0, + "num_input_tokens_seen": 77284040, + "step": 133210 + }, + { + "epoch": 19.841376228775694, + "grad_norm": 8.2904334703926e-06, + "learning_rate": 9.598175623801165e-09, + "loss": 0.0, + "num_input_tokens_seen": 77286760, + "step": 133215 + }, + { + "epoch": 19.84212094131665, + "grad_norm": 2.2703411559632514e-06, + "learning_rate": 9.508353308673723e-09, + "loss": 0.0, + "num_input_tokens_seen": 77289576, + "step": 133220 + }, + { + "epoch": 19.84286565385761, + "grad_norm": 2.4402902454312425e-06, + "learning_rate": 9.4189531825023e-09, + "loss": 0.0, + "num_input_tokens_seen": 77292776, + "step": 133225 + }, + { + "epoch": 19.843610366398572, + "grad_norm": 9.24107080209069e-05, + "learning_rate": 9.329975246799571e-09, + "loss": 0.0, + "num_input_tokens_seen": 77295496, + "step": 133230 + }, + { + "epoch": 19.84435507893953, + "grad_norm": 3.464703013378312e-06, + "learning_rate": 9.241419503069892e-09, + "loss": 0.0, + "num_input_tokens_seen": 77298472, + "step": 133235 + }, + { + "epoch": 19.84509979148049, + "grad_norm": 6.187388862599619e-06, + "learning_rate": 9.153285952803736e-09, + "loss": 0.0, + "num_input_tokens_seen": 77301480, + "step": 133240 + }, + { + "epoch": 19.845844504021446, + "grad_norm": 2.245499445052701e-06, + "learning_rate": 9.065574597494352e-09, + "loss": 0.0001, + "num_input_tokens_seen": 77304200, + "step": 133245 + }, + { + "epoch": 19.846589216562407, + "grad_norm": 2.1949932488496415e-05, + "learning_rate": 8.978285438621115e-09, + "loss": 0.0, + "num_input_tokens_seen": 77306888, + "step": 133250 + }, + { + "epoch": 19.847333929103367, + "grad_norm": 2.4296112314914353e-05, + "learning_rate": 8.891418477660617e-09, + "loss": 0.0, + "num_input_tokens_seen": 77309480, + "step": 133255 + }, + { + "epoch": 19.848078641644324, + "grad_norm": 2.395935780441505e-06, + "learning_rate": 8.804973716081132e-09, + "loss": 0.0001, + "num_input_tokens_seen": 77312328, + "step": 133260 + }, + { + "epoch": 19.848823354185285, + "grad_norm": 9.1624962806236e-05, + "learning_rate": 8.718951155339827e-09, + "loss": 0.0, + "num_input_tokens_seen": 77315656, + "step": 133265 + }, + { + "epoch": 19.849568066726242, + "grad_norm": 2.1308242139639333e-06, + "learning_rate": 8.633350796893869e-09, + "loss": 0.0, + "num_input_tokens_seen": 77318600, + "step": 133270 + }, + { + "epoch": 19.850312779267203, + "grad_norm": 3.9066771932994016e-06, + "learning_rate": 8.548172642186547e-09, + "loss": 0.0, + "num_input_tokens_seen": 77321288, + "step": 133275 + }, + { + "epoch": 19.851057491808163, + "grad_norm": 4.325564077589661e-05, + "learning_rate": 8.463416692658377e-09, + "loss": 0.0, + "num_input_tokens_seen": 77324200, + "step": 133280 + }, + { + "epoch": 19.85180220434912, + "grad_norm": 2.49062350121676e-06, + "learning_rate": 8.37908294973877e-09, + "loss": 0.0, + "num_input_tokens_seen": 77327496, + "step": 133285 + }, + { + "epoch": 19.85254691689008, + "grad_norm": 1.5451243598363362e-05, + "learning_rate": 8.295171414854363e-09, + "loss": 0.0, + "num_input_tokens_seen": 77330248, + "step": 133290 + }, + { + "epoch": 19.85329162943104, + "grad_norm": 2.3604459329362726e-06, + "learning_rate": 8.211682089423467e-09, + "loss": 0.0, + "num_input_tokens_seen": 77333000, + "step": 133295 + }, + { + "epoch": 19.854036341971998, + "grad_norm": 1.5914069081190974e-05, + "learning_rate": 8.128614974856064e-09, + "loss": 0.0, + "num_input_tokens_seen": 77335944, + "step": 133300 + }, + { + "epoch": 19.85478105451296, + "grad_norm": 0.00013438657333608717, + "learning_rate": 8.04597007255381e-09, + "loss": 0.0, + "num_input_tokens_seen": 77338632, + "step": 133305 + }, + { + "epoch": 19.855525767053916, + "grad_norm": 3.1229217256623087e-06, + "learning_rate": 7.963747383915587e-09, + "loss": 0.0, + "num_input_tokens_seen": 77341480, + "step": 133310 + }, + { + "epoch": 19.856270479594876, + "grad_norm": 9.67698451859178e-06, + "learning_rate": 7.881946910329175e-09, + "loss": 0.0, + "num_input_tokens_seen": 77344328, + "step": 133315 + }, + { + "epoch": 19.857015192135837, + "grad_norm": 5.167113158677239e-06, + "learning_rate": 7.800568653174023e-09, + "loss": 0.0, + "num_input_tokens_seen": 77347432, + "step": 133320 + }, + { + "epoch": 19.857759904676794, + "grad_norm": 6.732301699230447e-06, + "learning_rate": 7.719612613829586e-09, + "loss": 0.0, + "num_input_tokens_seen": 77350312, + "step": 133325 + }, + { + "epoch": 19.858504617217754, + "grad_norm": 2.9057732717774343e-06, + "learning_rate": 7.639078793661436e-09, + "loss": 0.0, + "num_input_tokens_seen": 77353352, + "step": 133330 + }, + { + "epoch": 19.859249329758715, + "grad_norm": 1.2358970707282424e-05, + "learning_rate": 7.558967194029598e-09, + "loss": 0.0, + "num_input_tokens_seen": 77356104, + "step": 133335 + }, + { + "epoch": 19.85999404229967, + "grad_norm": 0.00030682794749736786, + "learning_rate": 7.479277816285768e-09, + "loss": 0.0, + "num_input_tokens_seen": 77358888, + "step": 133340 + }, + { + "epoch": 19.860738754840632, + "grad_norm": 2.707256271605729e-06, + "learning_rate": 7.400010661781642e-09, + "loss": 0.0, + "num_input_tokens_seen": 77361640, + "step": 133345 + }, + { + "epoch": 19.86148346738159, + "grad_norm": 9.775849321158603e-06, + "learning_rate": 7.321165731849488e-09, + "loss": 0.0, + "num_input_tokens_seen": 77364616, + "step": 133350 + }, + { + "epoch": 19.86222817992255, + "grad_norm": 4.1020766730071045e-06, + "learning_rate": 7.242743027827126e-09, + "loss": 0.0, + "num_input_tokens_seen": 77367272, + "step": 133355 + }, + { + "epoch": 19.86297289246351, + "grad_norm": 7.358406037383247e-06, + "learning_rate": 7.1647425510384944e-09, + "loss": 0.0, + "num_input_tokens_seen": 77370152, + "step": 133360 + }, + { + "epoch": 19.863717605004467, + "grad_norm": 3.765277824641089e-06, + "learning_rate": 7.087164302796434e-09, + "loss": 0.0, + "num_input_tokens_seen": 77373064, + "step": 133365 + }, + { + "epoch": 19.864462317545428, + "grad_norm": 7.262455710588256e-06, + "learning_rate": 7.010008284416558e-09, + "loss": 0.0, + "num_input_tokens_seen": 77375880, + "step": 133370 + }, + { + "epoch": 19.86520703008639, + "grad_norm": 5.719095497624949e-06, + "learning_rate": 6.933274497200604e-09, + "loss": 0.0, + "num_input_tokens_seen": 77378888, + "step": 133375 + }, + { + "epoch": 19.865951742627345, + "grad_norm": 2.323055241504335e-06, + "learning_rate": 6.856962942447531e-09, + "loss": 0.0, + "num_input_tokens_seen": 77381896, + "step": 133380 + }, + { + "epoch": 19.866696455168306, + "grad_norm": 7.159041160775814e-06, + "learning_rate": 6.781073621442424e-09, + "loss": 0.0, + "num_input_tokens_seen": 77384680, + "step": 133385 + }, + { + "epoch": 19.867441167709263, + "grad_norm": 1.825157823986956e-06, + "learning_rate": 6.70560653546759e-09, + "loss": 0.0, + "num_input_tokens_seen": 77387912, + "step": 133390 + }, + { + "epoch": 19.868185880250223, + "grad_norm": 7.50004937799531e-06, + "learning_rate": 6.6305616857997855e-09, + "loss": 0.0, + "num_input_tokens_seen": 77390824, + "step": 133395 + }, + { + "epoch": 19.868930592791184, + "grad_norm": 3.4188764402642846e-05, + "learning_rate": 6.55593907370744e-09, + "loss": 0.0, + "num_input_tokens_seen": 77393544, + "step": 133400 + }, + { + "epoch": 19.86967530533214, + "grad_norm": 2.1630933133565122e-06, + "learning_rate": 6.481738700450657e-09, + "loss": 0.0, + "num_input_tokens_seen": 77396520, + "step": 133405 + }, + { + "epoch": 19.8704200178731, + "grad_norm": 2.6821953724720515e-05, + "learning_rate": 6.4079605672839886e-09, + "loss": 0.0, + "num_input_tokens_seen": 77399560, + "step": 133410 + }, + { + "epoch": 19.87116473041406, + "grad_norm": 2.394369175817701e-06, + "learning_rate": 6.334604675450884e-09, + "loss": 0.0, + "num_input_tokens_seen": 77402216, + "step": 133415 + }, + { + "epoch": 19.87190944295502, + "grad_norm": 4.183088822173886e-05, + "learning_rate": 6.2616710261920176e-09, + "loss": 0.0, + "num_input_tokens_seen": 77405192, + "step": 133420 + }, + { + "epoch": 19.87265415549598, + "grad_norm": 3.821295649686363e-06, + "learning_rate": 6.189159620739737e-09, + "loss": 0.0, + "num_input_tokens_seen": 77407944, + "step": 133425 + }, + { + "epoch": 19.873398868036936, + "grad_norm": 3.00333158520516e-06, + "learning_rate": 6.117070460318064e-09, + "loss": 0.0, + "num_input_tokens_seen": 77410824, + "step": 133430 + }, + { + "epoch": 19.874143580577897, + "grad_norm": 4.626354711945169e-05, + "learning_rate": 6.045403546148243e-09, + "loss": 0.0, + "num_input_tokens_seen": 77413896, + "step": 133435 + }, + { + "epoch": 19.874888293118858, + "grad_norm": 8.221011739806272e-06, + "learning_rate": 5.974158879434866e-09, + "loss": 0.0, + "num_input_tokens_seen": 77416904, + "step": 133440 + }, + { + "epoch": 19.875633005659815, + "grad_norm": 7.077171630953671e-06, + "learning_rate": 5.903336461388076e-09, + "loss": 0.0, + "num_input_tokens_seen": 77419720, + "step": 133445 + }, + { + "epoch": 19.876377718200775, + "grad_norm": 3.201409299435909e-06, + "learning_rate": 5.832936293201363e-09, + "loss": 0.0, + "num_input_tokens_seen": 77422792, + "step": 133450 + }, + { + "epoch": 19.877122430741732, + "grad_norm": 1.4614918654842768e-05, + "learning_rate": 5.762958376065441e-09, + "loss": 0.0, + "num_input_tokens_seen": 77425480, + "step": 133455 + }, + { + "epoch": 19.877867143282693, + "grad_norm": 4.317099865147611e-06, + "learning_rate": 5.693402711159923e-09, + "loss": 0.0, + "num_input_tokens_seen": 77428840, + "step": 133460 + }, + { + "epoch": 19.878611855823653, + "grad_norm": 3.6583360270014964e-06, + "learning_rate": 5.6242692996616445e-09, + "loss": 0.0, + "num_input_tokens_seen": 77431784, + "step": 133465 + }, + { + "epoch": 19.87935656836461, + "grad_norm": 4.44912257080432e-06, + "learning_rate": 5.555558142736339e-09, + "loss": 0.0, + "num_input_tokens_seen": 77434536, + "step": 133470 + }, + { + "epoch": 19.88010128090557, + "grad_norm": 4.540507688943762e-06, + "learning_rate": 5.487269241549742e-09, + "loss": 0.0, + "num_input_tokens_seen": 77437288, + "step": 133475 + }, + { + "epoch": 19.88084599344653, + "grad_norm": 2.196668901888188e-05, + "learning_rate": 5.419402597250934e-09, + "loss": 0.0, + "num_input_tokens_seen": 77439976, + "step": 133480 + }, + { + "epoch": 19.881590705987488, + "grad_norm": 0.001398004125803709, + "learning_rate": 5.351958210986219e-09, + "loss": 0.0, + "num_input_tokens_seen": 77442760, + "step": 133485 + }, + { + "epoch": 19.88233541852845, + "grad_norm": 7.121307135093957e-06, + "learning_rate": 5.284936083899128e-09, + "loss": 0.0, + "num_input_tokens_seen": 77445512, + "step": 133490 + }, + { + "epoch": 19.883080131069406, + "grad_norm": 3.598929424697417e-06, + "learning_rate": 5.218336217119313e-09, + "loss": 0.0, + "num_input_tokens_seen": 77448392, + "step": 133495 + }, + { + "epoch": 19.883824843610366, + "grad_norm": 6.814763310103444e-06, + "learning_rate": 5.152158611770874e-09, + "loss": 0.0, + "num_input_tokens_seen": 77451304, + "step": 133500 + }, + { + "epoch": 19.884569556151327, + "grad_norm": 3.878111783706117e-06, + "learning_rate": 5.086403268975137e-09, + "loss": 0.0, + "num_input_tokens_seen": 77454376, + "step": 133505 + }, + { + "epoch": 19.885314268692284, + "grad_norm": 1.4733594071003608e-05, + "learning_rate": 5.0210701898395494e-09, + "loss": 0.0, + "num_input_tokens_seen": 77457160, + "step": 133510 + }, + { + "epoch": 19.886058981233244, + "grad_norm": 6.387493158399593e-06, + "learning_rate": 4.956159375468783e-09, + "loss": 0.0, + "num_input_tokens_seen": 77459816, + "step": 133515 + }, + { + "epoch": 19.886803693774205, + "grad_norm": 3.598476041588583e-06, + "learning_rate": 4.891670826959183e-09, + "loss": 0.0, + "num_input_tokens_seen": 77462920, + "step": 133520 + }, + { + "epoch": 19.88754840631516, + "grad_norm": 2.9820492272847332e-05, + "learning_rate": 4.8276045454043185e-09, + "loss": 0.0, + "num_input_tokens_seen": 77465960, + "step": 133525 + }, + { + "epoch": 19.888293118856122, + "grad_norm": 2.845285962393973e-06, + "learning_rate": 4.763960531878331e-09, + "loss": 0.0, + "num_input_tokens_seen": 77468424, + "step": 133530 + }, + { + "epoch": 19.88903783139708, + "grad_norm": 1.9743063603527844e-05, + "learning_rate": 4.700738787466463e-09, + "loss": 0.0, + "num_input_tokens_seen": 77471432, + "step": 133535 + }, + { + "epoch": 19.88978254393804, + "grad_norm": 4.817224180442281e-06, + "learning_rate": 4.637939313226203e-09, + "loss": 0.0, + "num_input_tokens_seen": 77474024, + "step": 133540 + }, + { + "epoch": 19.890527256479, + "grad_norm": 3.7143349800317083e-06, + "learning_rate": 4.575562110228915e-09, + "loss": 0.0, + "num_input_tokens_seen": 77476808, + "step": 133545 + }, + { + "epoch": 19.891271969019957, + "grad_norm": 1.7923623090609908e-05, + "learning_rate": 4.513607179520985e-09, + "loss": 0.0, + "num_input_tokens_seen": 77479912, + "step": 133550 + }, + { + "epoch": 19.892016681560918, + "grad_norm": 5.802807663712883e-06, + "learning_rate": 4.452074522148797e-09, + "loss": 0.0, + "num_input_tokens_seen": 77482568, + "step": 133555 + }, + { + "epoch": 19.89276139410188, + "grad_norm": 3.5388770811550785e-06, + "learning_rate": 4.390964139158737e-09, + "loss": 0.0, + "num_input_tokens_seen": 77485544, + "step": 133560 + }, + { + "epoch": 19.893506106642835, + "grad_norm": 0.00011276362783974037, + "learning_rate": 4.330276031577762e-09, + "loss": 0.0, + "num_input_tokens_seen": 77488456, + "step": 133565 + }, + { + "epoch": 19.894250819183796, + "grad_norm": 0.0027106688357889652, + "learning_rate": 4.270010200430052e-09, + "loss": 0.0, + "num_input_tokens_seen": 77491688, + "step": 133570 + }, + { + "epoch": 19.894995531724753, + "grad_norm": 7.630159234395251e-05, + "learning_rate": 4.210166646737013e-09, + "loss": 0.0, + "num_input_tokens_seen": 77494600, + "step": 133575 + }, + { + "epoch": 19.895740244265713, + "grad_norm": 4.243191597197438e-06, + "learning_rate": 4.150745371508946e-09, + "loss": 0.0, + "num_input_tokens_seen": 77497192, + "step": 133580 + }, + { + "epoch": 19.896484956806674, + "grad_norm": 6.682720140815945e-06, + "learning_rate": 4.0917463757506045e-09, + "loss": 0.0, + "num_input_tokens_seen": 77499976, + "step": 133585 + }, + { + "epoch": 19.89722966934763, + "grad_norm": 3.0061912639212096e-06, + "learning_rate": 4.033169660458413e-09, + "loss": 0.0, + "num_input_tokens_seen": 77502696, + "step": 133590 + }, + { + "epoch": 19.89797438188859, + "grad_norm": 2.7036460323870415e-06, + "learning_rate": 3.975015226617695e-09, + "loss": 0.0, + "num_input_tokens_seen": 77505704, + "step": 133595 + }, + { + "epoch": 19.898719094429552, + "grad_norm": 1.7025677152560093e-05, + "learning_rate": 3.917283075216549e-09, + "loss": 0.0, + "num_input_tokens_seen": 77508520, + "step": 133600 + }, + { + "epoch": 19.89946380697051, + "grad_norm": 1.8038124380836962e-06, + "learning_rate": 3.8599732072264195e-09, + "loss": 0.0, + "num_input_tokens_seen": 77511496, + "step": 133605 + }, + { + "epoch": 19.90020851951147, + "grad_norm": 1.2259637514944188e-05, + "learning_rate": 3.803085623618752e-09, + "loss": 0.0, + "num_input_tokens_seen": 77514312, + "step": 133610 + }, + { + "epoch": 19.900953232052427, + "grad_norm": 2.3388325644191355e-05, + "learning_rate": 3.746620325351113e-09, + "loss": 0.0, + "num_input_tokens_seen": 77517320, + "step": 133615 + }, + { + "epoch": 19.901697944593387, + "grad_norm": 0.0017192503437399864, + "learning_rate": 3.690577313381072e-09, + "loss": 0.0, + "num_input_tokens_seen": 77520392, + "step": 133620 + }, + { + "epoch": 19.902442657134348, + "grad_norm": 7.652379281353205e-06, + "learning_rate": 3.6349565886523163e-09, + "loss": 0.0, + "num_input_tokens_seen": 77523016, + "step": 133625 + }, + { + "epoch": 19.903187369675305, + "grad_norm": 2.5725286832312122e-05, + "learning_rate": 3.579758152105761e-09, + "loss": 0.0, + "num_input_tokens_seen": 77526024, + "step": 133630 + }, + { + "epoch": 19.903932082216265, + "grad_norm": 9.050376684172079e-05, + "learning_rate": 3.524982004676769e-09, + "loss": 0.0, + "num_input_tokens_seen": 77529160, + "step": 133635 + }, + { + "epoch": 19.904676794757222, + "grad_norm": 3.1455336284125224e-05, + "learning_rate": 3.4706281472840495e-09, + "loss": 0.0, + "num_input_tokens_seen": 77532232, + "step": 133640 + }, + { + "epoch": 19.905421507298183, + "grad_norm": 6.607264367630705e-06, + "learning_rate": 3.4166965808518637e-09, + "loss": 0.0, + "num_input_tokens_seen": 77535304, + "step": 133645 + }, + { + "epoch": 19.906166219839143, + "grad_norm": 4.2435003706486896e-06, + "learning_rate": 3.363187306287818e-09, + "loss": 0.0, + "num_input_tokens_seen": 77538280, + "step": 133650 + }, + { + "epoch": 19.9069109323801, + "grad_norm": 4.006737526651705e-06, + "learning_rate": 3.310100324499521e-09, + "loss": 0.0, + "num_input_tokens_seen": 77540936, + "step": 133655 + }, + { + "epoch": 19.90765564492106, + "grad_norm": 2.891487383749336e-05, + "learning_rate": 3.2574356363807013e-09, + "loss": 0.0, + "num_input_tokens_seen": 77543592, + "step": 133660 + }, + { + "epoch": 19.90840035746202, + "grad_norm": 1.8727331507761846e-06, + "learning_rate": 3.2051932428195375e-09, + "loss": 0.0, + "num_input_tokens_seen": 77546344, + "step": 133665 + }, + { + "epoch": 19.909145070002978, + "grad_norm": 1.51898948388407e-05, + "learning_rate": 3.153373144704208e-09, + "loss": 0.0, + "num_input_tokens_seen": 77549000, + "step": 133670 + }, + { + "epoch": 19.90988978254394, + "grad_norm": 0.00036273329169489443, + "learning_rate": 3.1019753429062383e-09, + "loss": 0.0, + "num_input_tokens_seen": 77551880, + "step": 133675 + }, + { + "epoch": 19.910634495084896, + "grad_norm": 4.056230409332784e-06, + "learning_rate": 3.050999838294377e-09, + "loss": 0.0, + "num_input_tokens_seen": 77554696, + "step": 133680 + }, + { + "epoch": 19.911379207625856, + "grad_norm": 0.0004371872346382588, + "learning_rate": 3.000446631729048e-09, + "loss": 0.0, + "num_input_tokens_seen": 77557704, + "step": 133685 + }, + { + "epoch": 19.912123920166817, + "grad_norm": 2.8856534299848136e-06, + "learning_rate": 2.9503157240651226e-09, + "loss": 0.0, + "num_input_tokens_seen": 77560648, + "step": 133690 + }, + { + "epoch": 19.912868632707774, + "grad_norm": 2.2847063974040793e-06, + "learning_rate": 2.900607116151921e-09, + "loss": 0.0, + "num_input_tokens_seen": 77563688, + "step": 133695 + }, + { + "epoch": 19.913613345248734, + "grad_norm": 3.782241765293293e-05, + "learning_rate": 2.8513208088248867e-09, + "loss": 0.0, + "num_input_tokens_seen": 77566600, + "step": 133700 + }, + { + "epoch": 19.914358057789695, + "grad_norm": 2.9177435862948187e-05, + "learning_rate": 2.802456802919462e-09, + "loss": 0.0, + "num_input_tokens_seen": 77569224, + "step": 133705 + }, + { + "epoch": 19.915102770330652, + "grad_norm": 9.680659786681645e-06, + "learning_rate": 2.7540150992627633e-09, + "loss": 0.0, + "num_input_tokens_seen": 77572008, + "step": 133710 + }, + { + "epoch": 19.915847482871612, + "grad_norm": 5.890231932426104e-06, + "learning_rate": 2.705995698668029e-09, + "loss": 0.0, + "num_input_tokens_seen": 77574792, + "step": 133715 + }, + { + "epoch": 19.91659219541257, + "grad_norm": 1.0489426131243818e-05, + "learning_rate": 2.658398601951273e-09, + "loss": 0.0, + "num_input_tokens_seen": 77577704, + "step": 133720 + }, + { + "epoch": 19.91733690795353, + "grad_norm": 0.0018145320937037468, + "learning_rate": 2.6112238099146315e-09, + "loss": 0.0, + "num_input_tokens_seen": 77580200, + "step": 133725 + }, + { + "epoch": 19.91808162049449, + "grad_norm": 0.00013435199798550457, + "learning_rate": 2.564471323354689e-09, + "loss": 0.0, + "num_input_tokens_seen": 77582984, + "step": 133730 + }, + { + "epoch": 19.918826333035447, + "grad_norm": 4.729800366476411e-06, + "learning_rate": 2.5181411430597045e-09, + "loss": 0.0, + "num_input_tokens_seen": 77585992, + "step": 133735 + }, + { + "epoch": 19.919571045576408, + "grad_norm": 4.617987542587798e-06, + "learning_rate": 2.472233269817936e-09, + "loss": 0.0, + "num_input_tokens_seen": 77589000, + "step": 133740 + }, + { + "epoch": 19.92031575811737, + "grad_norm": 2.6180105123785324e-06, + "learning_rate": 2.4267477043982134e-09, + "loss": 0.0, + "num_input_tokens_seen": 77591976, + "step": 133745 + }, + { + "epoch": 19.921060470658325, + "grad_norm": 3.956212822231464e-06, + "learning_rate": 2.3816844475749167e-09, + "loss": 0.0, + "num_input_tokens_seen": 77594824, + "step": 133750 + }, + { + "epoch": 19.921805183199286, + "grad_norm": 3.527999979269225e-06, + "learning_rate": 2.337043500102998e-09, + "loss": 0.0, + "num_input_tokens_seen": 77597992, + "step": 133755 + }, + { + "epoch": 19.922549895740243, + "grad_norm": 2.52060021921352e-06, + "learning_rate": 2.2928248627429595e-09, + "loss": 0.0, + "num_input_tokens_seen": 77600712, + "step": 133760 + }, + { + "epoch": 19.923294608281203, + "grad_norm": 2.7292510367260547e-06, + "learning_rate": 2.249028536238651e-09, + "loss": 0.0, + "num_input_tokens_seen": 77603720, + "step": 133765 + }, + { + "epoch": 19.924039320822164, + "grad_norm": 2.6460556910024025e-06, + "learning_rate": 2.205654521331146e-09, + "loss": 0.0, + "num_input_tokens_seen": 77606408, + "step": 133770 + }, + { + "epoch": 19.92478403336312, + "grad_norm": 3.0717703793925466e-06, + "learning_rate": 2.162702818753193e-09, + "loss": 0.0, + "num_input_tokens_seen": 77608968, + "step": 133775 + }, + { + "epoch": 19.92552874590408, + "grad_norm": 3.7542565678450046e-06, + "learning_rate": 2.120173429226435e-09, + "loss": 0.0, + "num_input_tokens_seen": 77611624, + "step": 133780 + }, + { + "epoch": 19.92627345844504, + "grad_norm": 7.110424576239893e-06, + "learning_rate": 2.0780663534752944e-09, + "loss": 0.0, + "num_input_tokens_seen": 77615080, + "step": 133785 + }, + { + "epoch": 19.927018170986, + "grad_norm": 0.0010678846156224608, + "learning_rate": 2.036381592207537e-09, + "loss": 0.0, + "num_input_tokens_seen": 77618120, + "step": 133790 + }, + { + "epoch": 19.92776288352696, + "grad_norm": 2.2462584183813306e-06, + "learning_rate": 1.9951191461281547e-09, + "loss": 0.0, + "num_input_tokens_seen": 77621096, + "step": 133795 + }, + { + "epoch": 19.928507596067917, + "grad_norm": 3.146005474263802e-05, + "learning_rate": 1.9542790159365887e-09, + "loss": 0.0, + "num_input_tokens_seen": 77623848, + "step": 133800 + }, + { + "epoch": 19.929252308608877, + "grad_norm": 0.001242562779225409, + "learning_rate": 1.913861202318401e-09, + "loss": 0.0, + "num_input_tokens_seen": 77626696, + "step": 133805 + }, + { + "epoch": 19.929997021149838, + "grad_norm": 1.0690954695746768e-05, + "learning_rate": 1.873865705959155e-09, + "loss": 0.0, + "num_input_tokens_seen": 77629704, + "step": 133810 + }, + { + "epoch": 19.930741733690795, + "grad_norm": 1.9376116142666433e-06, + "learning_rate": 1.83429252753331e-09, + "loss": 0.0, + "num_input_tokens_seen": 77632520, + "step": 133815 + }, + { + "epoch": 19.931486446231755, + "grad_norm": 2.2893430013937177e-06, + "learning_rate": 1.7951416677097766e-09, + "loss": 0.0, + "num_input_tokens_seen": 77635432, + "step": 133820 + }, + { + "epoch": 19.932231158772712, + "grad_norm": 4.957498276780825e-06, + "learning_rate": 1.7564131271519123e-09, + "loss": 0.0, + "num_input_tokens_seen": 77638376, + "step": 133825 + }, + { + "epoch": 19.932975871313673, + "grad_norm": 3.6292578897700878e-06, + "learning_rate": 1.7181069065119736e-09, + "loss": 0.0, + "num_input_tokens_seen": 77641448, + "step": 133830 + }, + { + "epoch": 19.933720583854633, + "grad_norm": 5.070062798040453e-06, + "learning_rate": 1.6802230064366653e-09, + "loss": 0.0, + "num_input_tokens_seen": 77644296, + "step": 133835 + }, + { + "epoch": 19.93446529639559, + "grad_norm": 9.411804967385251e-06, + "learning_rate": 1.642761427567141e-09, + "loss": 0.0, + "num_input_tokens_seen": 77647176, + "step": 133840 + }, + { + "epoch": 19.93521000893655, + "grad_norm": 3.955180545744952e-06, + "learning_rate": 1.605722170536228e-09, + "loss": 0.0, + "num_input_tokens_seen": 77650248, + "step": 133845 + }, + { + "epoch": 19.93595472147751, + "grad_norm": 1.6739204511395656e-05, + "learning_rate": 1.5691052359684266e-09, + "loss": 0.0, + "num_input_tokens_seen": 77652872, + "step": 133850 + }, + { + "epoch": 19.93669943401847, + "grad_norm": 5.268406312097795e-06, + "learning_rate": 1.5329106244854618e-09, + "loss": 0.0, + "num_input_tokens_seen": 77655880, + "step": 133855 + }, + { + "epoch": 19.93744414655943, + "grad_norm": 5.18910565006081e-06, + "learning_rate": 1.4971383366951807e-09, + "loss": 0.0, + "num_input_tokens_seen": 77658920, + "step": 133860 + }, + { + "epoch": 19.938188859100386, + "grad_norm": 4.493299911700888e-06, + "learning_rate": 1.4617883732026549e-09, + "loss": 0.0, + "num_input_tokens_seen": 77662120, + "step": 133865 + }, + { + "epoch": 19.938933571641346, + "grad_norm": 3.8055386539781466e-05, + "learning_rate": 1.4268607346074048e-09, + "loss": 0.0, + "num_input_tokens_seen": 77664904, + "step": 133870 + }, + { + "epoch": 19.939678284182307, + "grad_norm": 1.8614392729432439e-06, + "learning_rate": 1.3923554214978485e-09, + "loss": 0.0, + "num_input_tokens_seen": 77667560, + "step": 133875 + }, + { + "epoch": 19.940422996723264, + "grad_norm": 2.343216920053237e-06, + "learning_rate": 1.3582724344568532e-09, + "loss": 0.0, + "num_input_tokens_seen": 77670568, + "step": 133880 + }, + { + "epoch": 19.941167709264224, + "grad_norm": 3.5610526083473815e-06, + "learning_rate": 1.3246117740589592e-09, + "loss": 0.0, + "num_input_tokens_seen": 77673288, + "step": 133885 + }, + { + "epoch": 19.941912421805185, + "grad_norm": 6.8867184381815605e-06, + "learning_rate": 1.2913734408759314e-09, + "loss": 0.0002, + "num_input_tokens_seen": 77676264, + "step": 133890 + }, + { + "epoch": 19.942657134346142, + "grad_norm": 6.549127647303976e-06, + "learning_rate": 1.258557435465657e-09, + "loss": 0.0, + "num_input_tokens_seen": 77679144, + "step": 133895 + }, + { + "epoch": 19.943401846887102, + "grad_norm": 5.382431481848471e-06, + "learning_rate": 1.226163758386023e-09, + "loss": 0.0, + "num_input_tokens_seen": 77682408, + "step": 133900 + }, + { + "epoch": 19.94414655942806, + "grad_norm": 3.472256821623887e-06, + "learning_rate": 1.1941924101838142e-09, + "loss": 0.0, + "num_input_tokens_seen": 77685384, + "step": 133905 + }, + { + "epoch": 19.94489127196902, + "grad_norm": 4.951924893248361e-06, + "learning_rate": 1.1626433913947132e-09, + "loss": 0.0, + "num_input_tokens_seen": 77688488, + "step": 133910 + }, + { + "epoch": 19.94563598450998, + "grad_norm": 1.1417899258958641e-05, + "learning_rate": 1.1315167025571782e-09, + "loss": 0.0, + "num_input_tokens_seen": 77691144, + "step": 133915 + }, + { + "epoch": 19.946380697050937, + "grad_norm": 2.286688868480269e-06, + "learning_rate": 1.1008123441957896e-09, + "loss": 0.0, + "num_input_tokens_seen": 77694024, + "step": 133920 + }, + { + "epoch": 19.947125409591898, + "grad_norm": 2.6854024781641783e-06, + "learning_rate": 1.070530316826801e-09, + "loss": 0.0, + "num_input_tokens_seen": 77697064, + "step": 133925 + }, + { + "epoch": 19.947870122132855, + "grad_norm": 2.804596078931354e-06, + "learning_rate": 1.0406706209636908e-09, + "loss": 0.0, + "num_input_tokens_seen": 77699848, + "step": 133930 + }, + { + "epoch": 19.948614834673815, + "grad_norm": 9.166783456748817e-06, + "learning_rate": 1.01123325711161e-09, + "loss": 0.0, + "num_input_tokens_seen": 77702792, + "step": 133935 + }, + { + "epoch": 19.949359547214776, + "grad_norm": 0.00010834464046638459, + "learning_rate": 9.82218225767384e-10, + "loss": 0.0, + "num_input_tokens_seen": 77705608, + "step": 133940 + }, + { + "epoch": 19.950104259755733, + "grad_norm": 4.4473996240412816e-05, + "learning_rate": 9.536255274195105e-10, + "loss": 0.0, + "num_input_tokens_seen": 77708552, + "step": 133945 + }, + { + "epoch": 19.950848972296694, + "grad_norm": 0.00012111903924960643, + "learning_rate": 9.254551625509367e-10, + "loss": 0.0, + "num_input_tokens_seen": 77711496, + "step": 133950 + }, + { + "epoch": 19.951593684837654, + "grad_norm": 1.800696372811217e-05, + "learning_rate": 8.977071316418339e-10, + "loss": 0.0, + "num_input_tokens_seen": 77714440, + "step": 133955 + }, + { + "epoch": 19.95233839737861, + "grad_norm": 2.5581778118066723e-06, + "learning_rate": 8.703814351557205e-10, + "loss": 0.0, + "num_input_tokens_seen": 77717320, + "step": 133960 + }, + { + "epoch": 19.95308310991957, + "grad_norm": 0.002366063417866826, + "learning_rate": 8.434780735561143e-10, + "loss": 0.0, + "num_input_tokens_seen": 77720520, + "step": 133965 + }, + { + "epoch": 19.95382782246053, + "grad_norm": 2.0278243937355e-06, + "learning_rate": 8.169970473009825e-10, + "loss": 0.0532, + "num_input_tokens_seen": 77723784, + "step": 133970 + }, + { + "epoch": 19.95457253500149, + "grad_norm": 3.192587882949738e-06, + "learning_rate": 7.909383568316386e-10, + "loss": 0.0, + "num_input_tokens_seen": 77726760, + "step": 133975 + }, + { + "epoch": 19.95531724754245, + "grad_norm": 3.4664410577534e-06, + "learning_rate": 7.65302002592172e-10, + "loss": 0.0, + "num_input_tokens_seen": 77729512, + "step": 133980 + }, + { + "epoch": 19.956061960083407, + "grad_norm": 5.289528871799121e-06, + "learning_rate": 7.400879850155695e-10, + "loss": 0.0, + "num_input_tokens_seen": 77732296, + "step": 133985 + }, + { + "epoch": 19.956806672624367, + "grad_norm": 0.00012859459093306214, + "learning_rate": 7.152963045264915e-10, + "loss": 0.0, + "num_input_tokens_seen": 77735272, + "step": 133990 + }, + { + "epoch": 19.957551385165328, + "grad_norm": 7.944551725813653e-06, + "learning_rate": 6.909269615440472e-10, + "loss": 0.0, + "num_input_tokens_seen": 77738248, + "step": 133995 + }, + { + "epoch": 19.958296097706285, + "grad_norm": 2.479202294125571e-06, + "learning_rate": 6.669799564817947e-10, + "loss": 0.0, + "num_input_tokens_seen": 77741032, + "step": 134000 + }, + { + "epoch": 19.959040810247245, + "grad_norm": 1.1345866369083524e-05, + "learning_rate": 6.434552897421897e-10, + "loss": 0.0, + "num_input_tokens_seen": 77743944, + "step": 134005 + }, + { + "epoch": 19.959785522788202, + "grad_norm": 3.320821633678861e-06, + "learning_rate": 6.203529617221371e-10, + "loss": 0.0, + "num_input_tokens_seen": 77746728, + "step": 134010 + }, + { + "epoch": 19.960530235329163, + "grad_norm": 0.00010673551878426224, + "learning_rate": 5.976729728129904e-10, + "loss": 0.0, + "num_input_tokens_seen": 77749544, + "step": 134015 + }, + { + "epoch": 19.961274947870123, + "grad_norm": 2.104892337229103e-05, + "learning_rate": 5.754153233977766e-10, + "loss": 0.0, + "num_input_tokens_seen": 77752552, + "step": 134020 + }, + { + "epoch": 19.96201966041108, + "grad_norm": 2.891597205234575e-06, + "learning_rate": 5.535800138539715e-10, + "loss": 0.0, + "num_input_tokens_seen": 77755240, + "step": 134025 + }, + { + "epoch": 19.96276437295204, + "grad_norm": 1.6631028074698406e-06, + "learning_rate": 5.321670445479488e-10, + "loss": 0.0, + "num_input_tokens_seen": 77758280, + "step": 134030 + }, + { + "epoch": 19.963509085493, + "grad_norm": 6.549376394104911e-06, + "learning_rate": 5.111764158433063e-10, + "loss": 0.0, + "num_input_tokens_seen": 77761192, + "step": 134035 + }, + { + "epoch": 19.96425379803396, + "grad_norm": 1.0354545338486787e-05, + "learning_rate": 4.906081280953157e-10, + "loss": 0.0, + "num_input_tokens_seen": 77764040, + "step": 134040 + }, + { + "epoch": 19.96499851057492, + "grad_norm": 0.0003720379318110645, + "learning_rate": 4.704621816481458e-10, + "loss": 0.0, + "num_input_tokens_seen": 77766920, + "step": 134045 + }, + { + "epoch": 19.965743223115876, + "grad_norm": 8.15680959931342e-06, + "learning_rate": 4.5073857684319043e-10, + "loss": 0.0, + "num_input_tokens_seen": 77770056, + "step": 134050 + }, + { + "epoch": 19.966487935656836, + "grad_norm": 7.562427344964817e-05, + "learning_rate": 4.3143731401629194e-10, + "loss": 0.0, + "num_input_tokens_seen": 77772872, + "step": 134055 + }, + { + "epoch": 19.967232648197797, + "grad_norm": 1.5720846931799315e-05, + "learning_rate": 4.1255839349219063e-10, + "loss": 0.0, + "num_input_tokens_seen": 77775464, + "step": 134060 + }, + { + "epoch": 19.967977360738754, + "grad_norm": 2.0978143311367603e-06, + "learning_rate": 3.9410181559007553e-10, + "loss": 0.0, + "num_input_tokens_seen": 77778248, + "step": 134065 + }, + { + "epoch": 19.968722073279714, + "grad_norm": 5.506544766831212e-05, + "learning_rate": 3.760675806180336e-10, + "loss": 0.0, + "num_input_tokens_seen": 77781064, + "step": 134070 + }, + { + "epoch": 19.969466785820675, + "grad_norm": 2.7389003662392497e-05, + "learning_rate": 3.5845568888692726e-10, + "loss": 0.0, + "num_input_tokens_seen": 77783624, + "step": 134075 + }, + { + "epoch": 19.970211498361632, + "grad_norm": 5.8082300711248536e-06, + "learning_rate": 3.412661406881901e-10, + "loss": 0.0, + "num_input_tokens_seen": 77786632, + "step": 134080 + }, + { + "epoch": 19.970956210902592, + "grad_norm": 2.924883119703736e-06, + "learning_rate": 3.244989363188067e-10, + "loss": 0.0, + "num_input_tokens_seen": 77789416, + "step": 134085 + }, + { + "epoch": 19.97170092344355, + "grad_norm": 1.9502003851812333e-06, + "learning_rate": 3.0815407605633283e-10, + "loss": 0.0, + "num_input_tokens_seen": 77792168, + "step": 134090 + }, + { + "epoch": 19.97244563598451, + "grad_norm": 4.41895354015287e-05, + "learning_rate": 2.922315601783243e-10, + "loss": 0.0, + "num_input_tokens_seen": 77794984, + "step": 134095 + }, + { + "epoch": 19.97319034852547, + "grad_norm": 2.7956680241914e-06, + "learning_rate": 2.7673138895678574e-10, + "loss": 0.0, + "num_input_tokens_seen": 77798184, + "step": 134100 + }, + { + "epoch": 19.973935061066427, + "grad_norm": 2.0659540496126283e-06, + "learning_rate": 2.61653562649844e-10, + "loss": 0.0, + "num_input_tokens_seen": 77801480, + "step": 134105 + }, + { + "epoch": 19.974679773607388, + "grad_norm": 7.947054837131873e-05, + "learning_rate": 2.469980815128503e-10, + "loss": 0.0, + "num_input_tokens_seen": 77804328, + "step": 134110 + }, + { + "epoch": 19.97542448614835, + "grad_norm": 4.9092768676928245e-06, + "learning_rate": 2.3276494579560492e-10, + "loss": 0.0, + "num_input_tokens_seen": 77807176, + "step": 134115 + }, + { + "epoch": 19.976169198689306, + "grad_norm": 0.00015763813280500472, + "learning_rate": 2.1895415573680578e-10, + "loss": 0.0, + "num_input_tokens_seen": 77810024, + "step": 134120 + }, + { + "epoch": 19.976913911230266, + "grad_norm": 1.6678252450219588e-06, + "learning_rate": 2.055657115695997e-10, + "loss": 0.0, + "num_input_tokens_seen": 77812744, + "step": 134125 + }, + { + "epoch": 19.977658623771223, + "grad_norm": 2.0369609501358354e-06, + "learning_rate": 1.925996135215824e-10, + "loss": 0.0, + "num_input_tokens_seen": 77816104, + "step": 134130 + }, + { + "epoch": 19.978403336312184, + "grad_norm": 3.199680577381514e-06, + "learning_rate": 1.8005586181202295e-10, + "loss": 0.0, + "num_input_tokens_seen": 77818984, + "step": 134135 + }, + { + "epoch": 19.979148048853144, + "grad_norm": 6.013000165694393e-05, + "learning_rate": 1.6793445664908813e-10, + "loss": 0.0, + "num_input_tokens_seen": 77822408, + "step": 134140 + }, + { + "epoch": 19.9798927613941, + "grad_norm": 6.48897139399196e-06, + "learning_rate": 1.5623539824372035e-10, + "loss": 0.0, + "num_input_tokens_seen": 77825064, + "step": 134145 + }, + { + "epoch": 19.98063747393506, + "grad_norm": 3.1363055313704535e-05, + "learning_rate": 1.4495868678743307e-10, + "loss": 0.0, + "num_input_tokens_seen": 77827944, + "step": 134150 + }, + { + "epoch": 19.98138218647602, + "grad_norm": 7.263114821398631e-05, + "learning_rate": 1.3410432247173977e-10, + "loss": 0.0, + "num_input_tokens_seen": 77830984, + "step": 134155 + }, + { + "epoch": 19.98212689901698, + "grad_norm": 4.0556633393862285e-06, + "learning_rate": 1.2367230548537834e-10, + "loss": 0.0, + "num_input_tokens_seen": 77833864, + "step": 134160 + }, + { + "epoch": 19.98287161155794, + "grad_norm": 3.383727062100661e-06, + "learning_rate": 1.1366263599765781e-10, + "loss": 0.0, + "num_input_tokens_seen": 77836840, + "step": 134165 + }, + { + "epoch": 19.983616324098897, + "grad_norm": 0.0002524412702769041, + "learning_rate": 1.040753141834383e-10, + "loss": 0.0, + "num_input_tokens_seen": 77839624, + "step": 134170 + }, + { + "epoch": 19.984361036639857, + "grad_norm": 8.354456076631323e-05, + "learning_rate": 9.491034020092659e-11, + "loss": 0.0, + "num_input_tokens_seen": 77842536, + "step": 134175 + }, + { + "epoch": 19.985105749180818, + "grad_norm": 9.856811630015727e-06, + "learning_rate": 8.616771420555391e-11, + "loss": 0.0, + "num_input_tokens_seen": 77845608, + "step": 134180 + }, + { + "epoch": 19.985850461721775, + "grad_norm": 1.3857836620445596e-06, + "learning_rate": 7.784743634720038e-11, + "loss": 0.0, + "num_input_tokens_seen": 77848296, + "step": 134185 + }, + { + "epoch": 19.986595174262735, + "grad_norm": 2.7505595880938927e-06, + "learning_rate": 6.994950676186829e-11, + "loss": 0.0, + "num_input_tokens_seen": 77851240, + "step": 134190 + }, + { + "epoch": 19.987339886803692, + "grad_norm": 3.667222927106195e-06, + "learning_rate": 6.247392558833553e-11, + "loss": 0.0, + "num_input_tokens_seen": 77854152, + "step": 134195 + }, + { + "epoch": 19.988084599344653, + "grad_norm": 0.00012130488903494552, + "learning_rate": 5.542069295150221e-11, + "loss": 0.0, + "num_input_tokens_seen": 77856872, + "step": 134200 + }, + { + "epoch": 19.988829311885613, + "grad_norm": 1.6077421605587006e-05, + "learning_rate": 4.878980896794172e-11, + "loss": 0.0, + "num_input_tokens_seen": 77859720, + "step": 134205 + }, + { + "epoch": 19.98957402442657, + "grad_norm": 0.0017781185451894999, + "learning_rate": 4.258127375145193e-11, + "loss": 0.0, + "num_input_tokens_seen": 77862824, + "step": 134210 + }, + { + "epoch": 19.99031873696753, + "grad_norm": 1.629909274925012e-05, + "learning_rate": 3.679508740472848e-11, + "loss": 0.0, + "num_input_tokens_seen": 77865416, + "step": 134215 + }, + { + "epoch": 19.99106344950849, + "grad_norm": 2.245618361484958e-06, + "learning_rate": 3.143125003046699e-11, + "loss": 0.0, + "num_input_tokens_seen": 77868232, + "step": 134220 + }, + { + "epoch": 19.99180816204945, + "grad_norm": 1.8481645383872092e-05, + "learning_rate": 2.648976171470974e-11, + "loss": 0.0, + "num_input_tokens_seen": 77871208, + "step": 134225 + }, + { + "epoch": 19.99255287459041, + "grad_norm": 5.9672056522686034e-06, + "learning_rate": 2.197062254349902e-11, + "loss": 0.0, + "num_input_tokens_seen": 77874248, + "step": 134230 + }, + { + "epoch": 19.993297587131366, + "grad_norm": 1.7685573766357265e-05, + "learning_rate": 1.7873832591774885e-11, + "loss": 0.0, + "num_input_tokens_seen": 77877256, + "step": 134235 + }, + { + "epoch": 19.994042299672326, + "grad_norm": 3.0936334951547906e-06, + "learning_rate": 1.4199391928926276e-11, + "loss": 0.0, + "num_input_tokens_seen": 77880264, + "step": 134240 + }, + { + "epoch": 19.994787012213287, + "grad_norm": 4.7753510443726555e-06, + "learning_rate": 1.0947300618791013e-11, + "loss": 0.0, + "num_input_tokens_seen": 77882920, + "step": 134245 + }, + { + "epoch": 19.995531724754244, + "grad_norm": 7.1208651206688955e-06, + "learning_rate": 8.11755871410469e-12, + "loss": 0.0, + "num_input_tokens_seen": 77885832, + "step": 134250 + }, + { + "epoch": 19.996276437295204, + "grad_norm": 9.520303137833253e-06, + "learning_rate": 5.710166262051786e-12, + "loss": 0.0, + "num_input_tokens_seen": 77889288, + "step": 134255 + }, + { + "epoch": 19.997021149836165, + "grad_norm": 1.0857627785298973e-05, + "learning_rate": 3.725123307041223e-12, + "loss": 0.0, + "num_input_tokens_seen": 77891880, + "step": 134260 + }, + { + "epoch": 19.997765862377122, + "grad_norm": 2.524471710785292e-06, + "learning_rate": 2.162429879604133e-12, + "loss": 0.0, + "num_input_tokens_seen": 77894952, + "step": 134265 + }, + { + "epoch": 19.998510574918082, + "grad_norm": 7.52212827137555e-06, + "learning_rate": 1.022086004720535e-12, + "loss": 0.0, + "num_input_tokens_seen": 77897512, + "step": 134270 + }, + { + "epoch": 19.99925528745904, + "grad_norm": 2.3730899556539953e-06, + "learning_rate": 3.040917045948888e-13, + "loss": 0.0, + "num_input_tokens_seen": 77900488, + "step": 134275 + }, + { + "epoch": 20.0, + "grad_norm": 2.746027575994958e-06, + "learning_rate": 8.446993104982426e-15, + "loss": 0.0, + "num_input_tokens_seen": 77902976, + "step": 134280 + }, + { + "epoch": 20.0, + "eval_loss": 3.837527275085449, + "eval_runtime": 51.3188, + "eval_samples_per_second": 58.146, + "eval_steps_per_second": 14.537, + "num_input_tokens_seen": 77902976, + "step": 134280 + }, + { + "epoch": 20.0, + "num_input_tokens_seen": 77902976, + "step": 134280, + "total_flos": 3.514797523668566e+18, + "train_loss": 0.18788277705373982, + "train_runtime": 32107.5081, + "train_samples_per_second": 16.727, + "train_steps_per_second": 4.182 + } + ], + "logging_steps": 5, + "max_steps": 134280, + "num_input_tokens_seen": 77902976, + "num_train_epochs": 20, + "save_steps": 6714, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.514797523668566e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}